处理差异 (tf-idf)

TfidfVectorizer 是一种转换器，由它转换得到的 ONNX 模型可能会产生与原始 scikit-learn 模型不一致的结果。词汇量越大，出现差异的概率就越高。本示例提出了一个没有差异的等效模型。

导入、设置

所有导入。它还注册了 xgboost 和 lightgbm 的 onnx 转换器。

import pprint
import numpy
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from onnxruntime import InferenceSession
from skl2onnx import to_onnx


def print_sparse_matrix(m):
    """Render a 2-D matrix as text, one character per entry.

    Entries that are not strictly positive are drawn as ``"."``. Positive
    entries are mapped linearly from the matrix range ``[min, max]`` onto
    the letters ``"A"`` (bottom of the range) to ``"Z"`` (top of the range).

    :param m: 2-D numpy array, or an object exposing ``shape`` and
        ``todense()`` (for instance a scipy sparse matrix)
    :return: one line of text per row, joined with newlines; a matrix
        without entries yields lines without characters
    """
    # Densify once so that sparse and dense inputs share the same code path.
    dense = numpy.asarray(m.todense() if hasattr(m, "todense") else m)
    # NaN becomes 0 and infinities become the largest finite floats. The
    # cleaned values drive both the range and the per-cell scaling, so a
    # +inf entry is drawn at the top of the scale instead of reaching int()
    # as an infinite value.
    values = numpy.nan_to_num(dense)

    mat = numpy.empty(m.shape, dtype=numpy.str_)
    mat[:, :] = "."
    # min()/max() are undefined on an empty array and there is nothing to
    # draw in that case, so the scaling is skipped entirely.
    if values.size > 0:
        mi, ma = values.min(), values.max()
        if mi == ma:
            # Constant matrix: widen the range to avoid dividing by zero.
            ma += 1
        for i in range(m.shape[0]):
            for j in range(m.shape[1]):
                if values[i, j] > 0:
                    # 26 letters: index 0 is "A", index 25 is "Z".
                    c = int((values[i, j] - mi) / (ma - mi) * 25)
                    mat[i, j] = chr(ord("A") + c)
    return "\n".join("".join(line) for line in mat)


def diff(a, b):
    """Return the mean absolute difference between two matrices.

    The sum of ``|a - b|`` is divided by the total number of entries given
    by ``a.shape``. ``a.size`` is not used for that count: on a scipy sparse
    matrix it is the number of stored values, which would make the result
    depend on the sparsity of *a*.

    :param a: first matrix (numpy array or scipy sparse matrix)
    :param b: second matrix, same shape as *a*
    :return: mean absolute difference, ``0.0`` if the matrices have no entry
    :raises ValueError: if both shapes differ
    """
    if a.shape != b.shape:
        raise ValueError(
            f"Cannot compare matrices with different shapes {a.shape} != {b.shape}."
        )
    n_entries = int(numpy.prod(a.shape))
    if n_entries == 0:
        # Nothing to compare: report no difference instead of dividing by zero.
        return 0.0
    return numpy.abs(a - b).sum() / n_entries

人工数据集

鸢尾花 + 一个文本列。

# Seven sample texts (plain sentences, code-like text, URLs, a timestamp),
# each one wrapped in its own row so the array is a single column (7, 1).
strings = numpy.array(
    [
        ["This a sentence."],
        ["This a sentence with more characters $^*&'(-..."],
        [
            """var = ClassName(var2, user=mail@anywhere.com, pwd"""
            """=")_~-('&]@^\\`|[{#")"""
        ],
        ["c79857654"],
        ["https://complex-url.com/;76543u3456?g=hhh&h=23"],
        ["01-03-05T11:12:13"],
        ["https://complex-url.com/;dd76543u3456?g=ddhhh&h=23"],
    ]
)

pprint.pprint(strings)
array([['This a sentence.'],
       ["This a sentence with more characters $^*&'(-..."],
       ['var = ClassName(var2, user=mail@anywhere.com, pwd=")_~-(\'&]@^\\`|[{#")'],
       ['c79857654'],
       ['https://complex-url.com/;76543u3456?g=hhh&h=23'],
       ['01-03-05T11:12:13'],
       ['https://complex-url.com/;dd76543u3456?g=ddhhh&h=23']],
      dtype='<U69')

拟合 TfidfVectorizer

# Single-step pipeline: a ColumnTransformer applies a TfidfVectorizer with
# default parameters to column 0 of the input.
tfidf = Pipeline([("pre", ColumnTransformer([("tfidf", TfidfVectorizer(), 0)]))])

我们从训练集中排除了一些字符串。

# Fit on every row except the last two, then transform the whole array, so
# the last two rows are texts the vectorizer was not fitted on.
tfidf.fit(strings[:-2])
tr = tfidf.transform(strings)
# First entry of the ColumnTransformer's `transformers_`, i.e. the "tfidf"
# step; only referenced by the disabled print below.
tfidf_step = tfidf.steps[0][1].transformers_[0][1]
# print(f"output columns: {tfidf_step.get_feature_names_out()}")
print("rendered outputs")
print(print_sparse_matrix(tr))
rendered outputs
..............RR.....
.....M......M.JJ....M
...J..JH...J.J...JJJ.
....Z................
JJJ....HJJJ.....J....
.....................
K.K....IK.K.....K....

转换为 ONNX

# Convert the fitted pipeline to ONNX; `strings` is passed as sample input
# (presumably used to infer the input type -- confirm in the skl2onnx docs).
onx = to_onnx(tfidf, strings)

使用 ONNX 执行

# Load the serialized ONNX model in onnxruntime, restricted to the CPU provider.
sess = InferenceSession(onx.SerializeToString(), providers=["CPUExecutionProvider"])
# Feed the same strings under the input name "X" and keep the first output.
got = sess.run(None, {"X": strings})[0]
# Compare the ONNX output with the scikit-learn output `tr` and render it.
print(f"differences={diff(tr, got):g}")
print(print_sparse_matrix(got))
differences=3.25823e-08
..............RR.....
.....M......M.JJ....M
...J..JH...J.J...JJJ.
....Z................
JJJ....HJJJ.....J....
.....................
K.K....IK.K.....K....

脚本总运行时间: (0 分钟 0.030 秒)

Sphinx-Gallery 生成的图库