FeatureHasher, pandas values and unexpected discrepancies

A game of hunting for a bug and finding several along the way.

Initial example

import logging
import numpy as np
from pandas import DataFrame
from onnxruntime import InferenceSession, SessionOptions
from onnxruntime_extensions import get_library_path
from sklearn.feature_extraction import FeatureHasher
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier
from skl2onnx import to_onnx
from skl2onnx.common.data_types import StringTensorType

log = logging.getLogger("skl2onnx")
log.setLevel(logging.ERROR)


df = DataFrame(
    {
        "Cat1": ["a", "b", "d", "abd", "e", "z", "ez"],
        "Cat2": ["A", "B", "D", "ABD", "e", "z", "ez"],
        "Label": [1, 1, 0, 0, 1, 0, 0],
    }
)

cat_features = [c for c in df.columns if "Cat" in c]
X_train = df[cat_features]

X_train["cat_features"] = df[cat_features].values.tolist()
X_train = X_train.drop(cat_features, axis=1)
y_train = df["Label"]

pipe = Pipeline(
    steps=[
        (
            "preprocessor",
            ColumnTransformer(
                [
                    (
                        "cat_preprocessor",
                        FeatureHasher(
                            n_features=8,
                            input_type="string",
                            alternate_sign=False,
                            dtype=np.float32,
                        ),
                        "cat_features",
                    )
                ],
                sparse_threshold=0.0,
            ),
        ),
        ("classifier", GradientBoostingClassifier(n_estimators=2, max_depth=2)),
    ],
)
pipe.fit(X_train, y_train)
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(sparse_threshold=0.0,
                                   transformers=[('cat_preprocessor',
                                                  FeatureHasher(alternate_sign=False,
                                                                dtype=<class 'numpy.float32'>,
                                                                input_type='string',
                                                                n_features=8),
                                                  'cat_features')])),
                ('classifier',
                 GradientBoostingClassifier(max_depth=2, n_estimators=2))])


Conversion to ONNX.

onx = to_onnx(
    pipe,
    initial_types=[("cat_features", StringTensorType([None, None]))],
    options={"zipmap": False},
)

Are there many discrepancies?

expected_proba = pipe.predict_proba(X_train)
sess = InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])


got = sess.run(None, dict(cat_features=X_train.values))


print("expected probabilities")
print(expected_proba)

print("onnx probabilities")
print(got[1])
expected probabilities
[[0.53360884 0.46639116]
 [0.53637279 0.46362721]
 [0.61595526 0.38404474]
 [0.61595526 0.38404474]
 [0.46226277 0.53773723]
 [0.61595526 0.38404474]
 [0.61595526 0.38404474]]
onnx probabilities
[[0.6159553  0.3840447 ]
 [0.5363728  0.46362722]
 [0.5363728  0.46362722]
 [0.6159553  0.3840447 ]
 [0.6159553  0.3840447 ]
 [0.6159553  0.3840447 ]
 [0.53360885 0.46639115]]
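
The two sets of probabilities clearly disagree. A quick way to quantify the gap, reusing expected_proba and got from the cells above:

max_diff = np.abs(expected_proba - got[1]).max()
print("maximum absolute difference:", max_diff)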

Let's check the FeatureHasher

We simply remove the classifier.

pipe_hash = Pipeline(
    steps=[
        (
            "preprocessor",
            ColumnTransformer(
                [
                    (
                        "cat_preprocessor",
                        FeatureHasher(
                            n_features=8,
                            input_type="string",
                            alternate_sign=False,
                            dtype=np.float32,
                        ),
                        "cat_features",
                    )
                ],
                sparse_threshold=0.0,
            ),
        ),
    ],
)
pipe_hash.fit(X_train, y_train)

onx = to_onnx(
    pipe_hash,
    initial_types=[("cat_features", StringTensorType([None, None]))],
    options={"zipmap": False},
)

expected = pipe_hash.transform(X_train)
sess = InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])


got = sess.run(None, dict(cat_features=X_train.values))


print("expected hashed features")
print(expected)

print("onnx hashed features")
print(got[0])
expected hashed features
[[0. 0. 1. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 1. 0.]
 [0. 0. 0. 1. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 2.]
 [0. 2. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 2. 0.]]
onnx hashed features
[[1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0.]]

Nothing seems to be working.

First proposal

The instruction X_train["cat_features"] = df[cat_features].values.tolist() creates a DataFrame with a single column whose cells are lists of two values. scikit-learn accepts this kind of input and can even handle a variable number of elements per list; onnxruntime cannot. The pipeline has to be changed as follows.
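
To see what that instruction builds, we can inspect the X_train created in the first cell: every cell of the new column holds a plain Python list.

print(X_train["cat_features"].head())         # lists such as ['a', 'A'], ['b', 'B'], ...
print(type(X_train["cat_features"].iloc[0]))  # <class 'list'>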

pipe_hash = Pipeline(
    steps=[
        (
            "preprocessor",
            ColumnTransformer(
                [
                    (
                        "cat_preprocessor1",
                        FeatureHasher(
                            n_features=8,
                            input_type="string",
                            alternate_sign=False,
                            dtype=np.float32,
                        ),
                        [0],
                    ),
                    (
                        "cat_preprocessor2",
                        FeatureHasher(
                            n_features=8,
                            input_type="string",
                            alternate_sign=False,
                            dtype=np.float32,
                        ),
                        [1],
                    ),
                ],
                sparse_threshold=0.0,
            ),
        ),
    ],
)

X_train_skl = df[cat_features].copy()
for c in cat_features:
    X_train_skl[c] = X_train_skl[c].values.tolist()

pipe_hash.fit(X_train_skl.values, y_train)

onx = to_onnx(
    pipe_hash,
    initial_types=[
        ("cat1", StringTensorType([None, 1])),
        ("cat2", StringTensorType([None, 1])),
    ],
    options={"zipmap": False},
)


expected = pipe_hash.transform(X_train_skl.values)
sess = InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])


got = sess.run(
    None,
    dict(
        cat1=df["Cat1"].values.reshape((-1, 1)), cat2=df["Cat2"].values.reshape((-1, 1))
    ),
)


print("expected fixed hashed features")
print(expected)

print("onnx fixed hashed features")
print(got[0])
expected fixed hashed features
[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0.]]
onnx fixed hashed features
[[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0.]]

This is not the original pipeline: it has 16 columns instead of 8, but it does produce the same results on both sides. One option would be to add the first 8 columns to the other 8 with a custom transformer.
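
A minimal sketch of that idea, reusing got from the cell above: because FeatureHasher simply accumulates each hashed string into its bucket, summing the two halves of the 16-column output folds it back into the 8 buckets of the original pipeline.

folded = got[0][:, :8] + got[0][:, 8:]
print(folded)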

Second proposal

We keep the same initial pipeline, but we adjust the input given to onnxruntime.

pipe_hash = Pipeline(
    steps=[
        (
            "preprocessor",
            ColumnTransformer(
                [
                    (
                        "cat_preprocessor",
                        FeatureHasher(
                            n_features=8,
                            input_type="string",
                            alternate_sign=False,
                            dtype=np.float32,
                        ),
                        "cat_features",
                    )
                ],
                sparse_threshold=0.0,
            ),
        ),
    ],
)
pipe_hash.fit(X_train, y_train)

onx = to_onnx(
    pipe_hash,
    initial_types=[("cat_features", StringTensorType([None, 1]))],
    options={"zipmap": False, "preprocessor__cat_preprocessor__separator": "#"},
)

expected = pipe_hash.transform(X_train)


so = SessionOptions()
so.register_custom_ops_library(get_library_path())
sess = InferenceSession(onx.SerializeToString(), so, providers=["CPUExecutionProvider"])

# We merged both columns cat1 and cat2 into a single cat_features.
df_fixed = DataFrame()
df_fixed["cat_features"] = np.array([f"{a}#{b}" for a, b in X_train["cat_features"]])

got = sess.run(None, {"cat_features": df_fixed[["cat_features"]].values})

print("expected original hashed features")
print(expected)

print("onnx fixed original hashed features")
print(got[0])
expected original hashed features
[[0. 0. 1. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 1. 0.]
 [0. 0. 0. 1. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 2.]
 [0. 2. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 2. 0.]]
onnx fixed original hashed features
[[0. 0. 1. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1. 1. 0.]
 [0. 0. 0. 1. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 2.]
 [0. 2. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 2. 0.]]

It works now.
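
A quick numerical check of that claim, reusing expected and got from the previous cell:

np.testing.assert_allclose(expected, got[0], rtol=1e-5)
print("scikit-learn and onnxruntime hashed features match")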

Sparsity?

Let's now try with the classifier and without sparse_threshold=0.0.

pipe = Pipeline(
    steps=[
        (
            "preprocessor",
            ColumnTransformer(
                [
                    (
                        "cat_preprocessor",
                        FeatureHasher(
                            n_features=8,
                            input_type="string",
                            alternate_sign=False,
                            dtype=np.float32,
                        ),
                        "cat_features",
                    )
                ],
                # sparse_threshold=0.0,
            ),
        ),
        ("classifier", GradientBoostingClassifier(n_estimators=2, max_depth=2)),
    ],
)
pipe.fit(X_train, y_train)
expected = pipe.predict_proba(X_train)


onx = to_onnx(
    pipe,
    initial_types=[("cat_features", StringTensorType([None, 1]))],
    options={"zipmap": False, "preprocessor__cat_preprocessor__separator": "#"},
)

so = SessionOptions()
so.register_custom_ops_library(get_library_path())
sess = InferenceSession(onx.SerializeToString(), so, providers=["CPUExecutionProvider"])
got = sess.run(None, {"cat_features": df_fixed[["cat_features"]].values})


print("expected probabilies")
print(expected)

print("onnx probabilies")
print(got[1])
expected probabilities
[[0.53360884 0.46639116]
 [0.46226277 0.53773723]
 [0.61595526 0.38404474]
 [0.61595526 0.38404474]
 [0.53637279 0.46362721]
 [0.61595526 0.38404474]
 [0.61595526 0.38404474]]
onnx probabilities
[[0.53360885 0.46639115]
 [0.4622628  0.5377372 ]
 [0.6159553  0.3840447 ]
 [0.6159553  0.3840447 ]
 [0.5363728  0.46362722]
 [0.6159553  0.3840447 ]
 [0.6159553  0.3840447 ]]

scikit-learn keeps the sparse output produced by the FeatureHasher. onnxruntime does not support sparse features. This could affect the conversion if the model following this step treats missing sparse values differently from zeros. That does not seem to be the case for this model, but other models or libraries may behave differently.

print(pipe.steps[0][-1].transform(X_train))
<Compressed Sparse Row sparse matrix of dtype 'float32'
        with 11 stored elements and shape (7, 8)>
  Coords        Values
  (0, 2)        1.0
  (0, 6)        1.0
  (1, 5)        1.0
  (1, 6)        1.0
  (2, 3)        1.0
  (2, 6)        1.0
  (3, 4)        1.0
  (3, 6)        1.0
  (4, 7)        2.0
  (5, 1)        2.0
  (6, 6)        2.0
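
If a dense comparison is needed, the sparse matrix above can be densified explicitly; a small sketch reusing pipe and X_train:

dense = pipe.steps[0][-1].transform(X_train).toarray()
print(dense)  # dense equivalent of the sparse output, comparable to the ONNX tensor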
