为 category_encoders 中的 WOEEncoder 实现转换器

WOEEncoder 是在 category_encoders 中实现的变换器(transformer)。*sklearn-onnx* 只为 *scikit-learn* 模型实现转换器(converter),因此不会包含针对它的转换器。尽管如此,本示例演示了如何为 *WOEEncoder* 实现自定义转换器。此代码尚未针对原始编码器可能处理的所有情况进行全面测试。

一个简单示例

我们以 Iris 数据集为例。每个特征都被转换为整数。

import numpy as np
from onnxruntime import InferenceSession
from sklearn.datasets import load_iris
from sklearn.preprocessing import OrdinalEncoder as SklOrdinalEncoder
from category_encoders import WOEEncoder, OrdinalEncoder
from skl2onnx import update_registered_converter, to_onnx, get_model_alias
from skl2onnx.common.data_types import FloatTensorType
from skl2onnx.common.utils import check_input_and_output_numbers
from skl2onnx.algebra.onnx_ops import OnnxCast
from skl2onnx.algebra.onnx_operator import OnnxSubEstimator
from skl2onnx.sklapi import WOETransformer
import skl2onnx.sklapi.register  # noqa: F401

data = load_iris()
# Keep only the first two features, cast to integers so that each distinct
# value can be treated as a category.
X = data.data.astype(np.int64)[:, :2]
# Binary target: 1 for class 2, 0 for the other classes.
y = (data.target == 2).astype(np.int64)

# Encode column 0 only and show the first five transformed rows.
woe = WOEEncoder(cols=[0]).fit(X, y)
print(woe.transform(X[:5]))
          0  1
0 -1.405712  3
1 -1.724166  3
2 -1.724166  3
3 -1.724166  3
4 -1.405712  3

我们来看看模型的训练参数。WOEEncoder 内部似乎使用了一个 OrdinalEncoder,但它不是 scikit-learn 中的那个。因此我们也需要为这个模型添加一个转换器。

# Inspect the fitted state: the nested ordinal encoder, the per-column
# ``woe.mapping``, and the ordinal encoder's own mapping and columns.
print("encoder", type(woe.ordinal_encoder), woe.ordinal_encoder)
print("mapping", woe.mapping)
print("encoder.mapping", woe.ordinal_encoder.mapping)
print("encoder.cols", woe.ordinal_encoder.cols)
encoder <class 'category_encoders.ordinal.OrdinalEncoder'> OrdinalEncoder(cols=[0],
               mapping=[{'col': 0, 'data_type': dtype('int64'),
                         'mapping': 5.0    1
4.0    2
7.0    3
6.0    4
NaN   -2
dtype: int64}])
mapping {0: 0
 1   -1.405712
 2   -1.724166
 3    2.545531
 4    0.961411
-1    0.000000
-2    0.000000
dtype: float64}
encoder.mapping [{'col': 0, 'mapping': 5.0    1
4.0    2
7.0    3
6.0    4
NaN   -2
dtype: int64, 'data_type': dtype('int64')}]
encoder.cols [0]

OrdinalEncoder 的自定义转换器

我们以示例《实现一个新的转换器》为基础,然后编写转换代码。

def ordenc_to_sklearn(op_mapping):
    """Build a scikit-learn ``OrdinalEncoder`` from a category_encoders mapping.

    :param op_mapping: iterable of dictionaries, each with a ``"col"`` entry
        (column position) and a ``"mapping"`` entry (a series whose index
        holds the categories and whose values hold the ordinal codes)
    :return: a scikit-learn ``OrdinalEncoder`` whose ``categories_``
        attribute is set from the mapping, one array per column
    """
    categories = []
    for entry in op_mapping:
        column = entry["col"]
        # Grow the list so that position ``column`` exists; columns without
        # a mapping entry stay ``None``.
        while len(categories) <= column:
            categories.append(None)

        series = entry["mapping"]
        ordered = []
        for position, category in enumerate(series.index):
            # The NaN entry has no place in the category list.
            if np.isnan(category):
                continue
            code = series.iloc[position]
            # Each category is stored at the position given by its code;
            # unused positions are filled with 0.
            while len(ordered) <= code:
                ordered.append(0)
            ordered[code] = category
        categories[column] = np.array(ordered, dtype=np.int64)

    encoder = SklOrdinalEncoder(categories=categories, dtype=np.int64)
    # Set the fitted attribute directly instead of calling ``fit``.
    encoder.categories_ = categories
    return encoder


def ordinal_encoder_shape_calculator(operator):
    """Declare the output type of the converted ``OrdinalEncoder``.

    The output reuses the tensor class of the single input. Its first
    dimension is the input's first dimension; its second dimension is copied
    from the input when the input shape has exactly two dimensions and is
    left unknown (``None``) otherwise.

    :param operator: operator holding one input and one output
    """
    check_input_and_output_numbers(operator, input_count_range=1, output_count_range=1)
    source = operator.inputs[0]
    tensor_class = source.type.__class__
    n_rows = source.get_first_dimension()
    dims = source.type.shape
    n_cols = dims[1] if len(dims) == 2 else None
    operator.outputs[0].type = tensor_class([n_rows, n_cols])


def ordinal_encoder_converter(scope, operator, container):
    """Convert a category_encoders ``OrdinalEncoder`` into ONNX.

    The fitted mapping is turned into a scikit-learn ``OrdinalEncoder`` by
    :func:`ordenc_to_sklearn`, and that estimator is added to the graph as a
    sub-estimator writing to the operator's first output.

    :param scope: scope the nodes are added to
    :param operator: operator wrapping the fitted encoder
    :param container: container receiving the nodes; also provides the
        target opset
    """
    encoder = operator.raw_operator
    equivalent = ordenc_to_sklearn(encoder.mapping)
    node = OnnxSubEstimator(
        equivalent,
        operator.inputs[0],
        op_version=container.target_opset,
        output_names=operator.outputs[:1],
    )
    node.add_to(scope, container)


# Register the shape calculator and the converter for the category_encoders
# OrdinalEncoder under the alias "CategoricalEncoderOrdinalEncoder".
update_registered_converter(
    OrdinalEncoder,
    "CategoricalEncoderOrdinalEncoder",
    ordinal_encoder_shape_calculator,
    ordinal_encoder_converter,
)

我们来计算一个简短示例的输出。

# Fit the category_encoders OrdinalEncoder on both columns and show the
# codes produced for the first five rows.
enc = OrdinalEncoder(cols=[0, 1])
enc.fit(X)
print(enc.transform(X[:5]))
   0  1
0  1  1
1  2  1
2  2  1
3  2  1
4  1  1

我们来检查 ONNX 转换是否产生相同的结果。

# Convert the fitted encoder to ONNX (target opset 14), passing X[:1] as the
# sample input, then run the model with onnxruntime on the CPU provider.
ord_onx = to_onnx(enc, X[:1], target_opset=14)
sess = InferenceSession(ord_onx.SerializeToString(), providers=["CPUExecutionProvider"])
print(sess.run(None, {"X": X[:5]})[0])
[[1 1]
 [2 1]
 [2 1]
 [2 1]
 [1 1]]

结果一致,转换有效。

WOEEncoder 的自定义转换器

我们以示例《实现一个新的转换器》为基础,然后编写转换代码。

def woeenc_to_sklearn(op_mapping):
    """Build a fitted ``WOETransformer`` from a ``WOEEncoder`` mapping.

    :param op_mapping: dictionary mapping a column position to a series
        whose index holds ordinal codes and whose values hold the weights
    :return: a ``WOETransformer`` (``onehot=False``) on which ``fit(None)``
        has been called
    """
    intervals_per_column = []
    weights_per_column = []
    for column, series in op_mapping.items():
        # Grow both lists so that position ``column`` exists; columns
        # without a mapping keep the "passthrough" marker.
        while len(intervals_per_column) <= column:
            intervals_per_column.append("passthrough")
            weights_per_column.append(None)

        column_intervals = []
        column_weights = []
        for position, code in enumerate(series.index):
            # Negative codes get no interval.
            if code < 0:
                continue
            # One interval (code - 1, code) per ordinal code. The two flags
            # presumably mean left-open / right-closed -- confirm against
            # the WOETransformer documentation.
            column_intervals.append((float(code - 1), float(code), False, True))
            column_weights.append(series.iloc[position])
        intervals_per_column[column] = column_intervals
        weights_per_column[column] = column_weights

    transformer = WOETransformer(
        intervals=intervals_per_column, weights=weights_per_column, onehot=False
    )
    transformer.fit(None)
    return transformer


def woe_encoder_parser(scope, model, inputs, custom_parsers=None):
    """Declare the operator and its single float output for a ``WOEEncoder``.

    :param scope: scope in which the operator and its output are declared
    :param model: the fitted model to convert
    :param inputs: list holding exactly one typed input variable
    :param custom_parsers: unused in this parser
    :return: the outputs of the declared operator
    :raises RuntimeError: if there is not exactly one input or if the type
        of that input is ``None``
    """
    n_inputs = len(inputs)
    if n_inputs != 1:
        raise RuntimeError("Unexpected number of inputs: %d != 1." % n_inputs)
    source = inputs[0]
    if source.type is None:
        raise RuntimeError("Unexpected type: %r." % (source,))

    woe_operator = scope.declare_local_operator(get_model_alias(type(model)), model)
    woe_operator.inputs.append(source)
    # The output is declared as a float tensor whatever the input type is.
    result = scope.declare_local_variable("catwoe", FloatTensorType())
    woe_operator.outputs.append(result)
    return woe_operator.outputs


def woe_encoder_shape_calculator(operator):
    """Declare the output type of the converted ``WOEEncoder``.

    The output is always a float tensor. Its first dimension is the input's
    first dimension; its second dimension is copied from the input when the
    input shape has exactly two dimensions and is left unknown (``None``)
    otherwise.

    :param operator: operator holding one input and one output
    """
    check_input_and_output_numbers(operator, input_count_range=1, output_count_range=1)
    source = operator.inputs[0]
    n_rows = source.get_first_dimension()
    dims = source.type.shape
    n_cols = dims[1] if len(dims) == 2 else None
    operator.outputs[0].type = FloatTensorType([n_rows, n_cols])


def woe_encoder_converter(scope, operator, container):
    """Convert a ``WOEEncoder`` into ONNX.

    The graph chains three steps: the encoder's own ordinal encoder, a cast
    of the resulting codes to float32, and the ``WOETransformer`` built by
    :func:`woeenc_to_sklearn`, which writes to the operator's first output.

    :param scope: scope the nodes are added to
    :param operator: operator wrapping the fitted encoder
    :param container: container receiving the nodes; also provides the
        target opset
    """
    encoder = operator.raw_operator
    opset = container.target_opset

    ordinal_codes = OnnxSubEstimator(
        encoder.ordinal_encoder, operator.inputs[0], op_version=opset
    )
    float_codes = OnnxCast(ordinal_codes, op_version=opset, to=np.float32)
    woe_transformer = woeenc_to_sklearn(encoder.mapping)
    woe_node = OnnxSubEstimator(
        woe_transformer,
        float_codes,
        op_version=opset,
        output_names=operator.outputs[:1],
        input_types=[FloatTensorType()],
    )
    woe_node.add_to(scope, container)


# Register the shape calculator, the converter and the custom parser for
# WOEEncoder under the alias "CategoricalEncoderWOEEncoder".
update_registered_converter(
    WOEEncoder,
    "CategoricalEncoderWOEEncoder",
    woe_encoder_shape_calculator,
    woe_encoder_converter,
    parser=woe_encoder_parser,
)

我们来计算一个简短示例的输出。

# Fit a WOEEncoder on both columns and show the encoded values of the first
# five rows.
woe = WOEEncoder(cols=[0, 1]).fit(X, y)
print(woe.transform(X[:5]))
          0         1
0 -1.405712 -0.035947
1 -1.724166 -0.035947
2 -1.724166 -0.035947
3 -1.724166 -0.035947
4 -1.405712 -0.035947

我们来检查 ONNX 转换是否产生相同的结果。

# Convert the fitted WOEEncoder to ONNX (target opset 14), passing X[:1] as
# the sample input, then run the model with onnxruntime on the CPU provider.
woe_onx = to_onnx(woe, X[:1], target_opset=14)
sess = InferenceSession(woe_onx.SerializeToString(), providers=["CPUExecutionProvider"])
print(sess.run(None, {"X": X[:5]})[0])
[[-1.4057125  -0.03594739]
 [-1.7241662  -0.03594739]
 [-1.7241662  -0.03594739]
 [-1.7241662  -0.03594739]
 [-1.4057125  -0.03594739]]

脚本总运行时间:(0 分 0.512 秒)

Sphinx-Gallery 生成的图库