Max1798 committed on
Commit
bf19a0a
·
verified ·
1 Parent(s): 29e1ab5

Update inference.py

Browse files
Files changed (1) hide show
  1. inference.py +44 -15
inference.py CHANGED
@@ -1,18 +1,47 @@
1
- # inference.py
2
- from tokenizers import Tokenizer
3
- from typing import List, Dict
4
 
5
# Load the shared tokenizer once at import time (module-level side effect).
tokenizer = Tokenizer.from_pretrained(".")  # load from the current directory
 
7
 
8
def tokenize(text: str) -> Dict[str, List[str]]:
    """Tokenize *text* and return the token strings with their ids.

    Uses the module-level ``tokenizer`` loaded at import time.
    """
    encoding = tokenizer.encode(text)
    return {"tokens": encoding.tokens, "ids": encoding.ids}
 
 
 
 
 
 
15
 
16
# Optional smoke test when run as a script
if __name__ == "__main__":
    sample = "Hello, this is a test."
    print(tokenize(sample))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import Pipeline, PreTrainedTokenizer, AutoTokenizer
2
+ from typing import Dict, Union, List
3
+ import torch
4
 
5
class TokenizerPipeline(Pipeline):
    """Tokenization-only pipeline: runs the tokenizer and skips model inference.

    NOTE(review): relies on the ``transformers.Pipeline`` contract
    (``_sanitize_parameters`` / ``preprocess`` / ``_forward`` /
    ``postprocess``); ``self.tokenizer`` is populated by the base-class
    constructor — confirm against the installed transformers version.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def _sanitize_parameters(self, **kwargs):
        """Split caller kwargs into (preprocess, forward, postprocess) dicts."""
        # Only padding/truncation are forwarded to the tokenizer call.
        pre_params = {
            name: kwargs[name]
            for name in ("padding", "truncation")
            if name in kwargs
        }

        post_params = {}
        if "return_tokens" in kwargs:
            post_params["return_tokens"] = kwargs["return_tokens"]

        # Middle dict is empty: _forward takes no parameters.
        return pre_params, {}, post_params

    def preprocess(self, inputs, **kwargs) -> Dict:
        """Tokenize the input text into PyTorch tensors."""
        return self.tokenizer(inputs, return_tensors="pt", **kwargs)

    def _forward(self, inputs) -> Dict:
        """No model to run — hand the tokenized batch straight through."""
        return inputs

    def postprocess(self, model_outputs, **kwargs) -> Dict:
        """Convert the tokenized batch into a readable dict.

        Returns ``{"tokens": [...]}`` by default, or ``{"input_ids": [...]}``
        when called with ``return_tokens=False``.
        """
        input_ids = model_outputs["input_ids"][0]

        if not kwargs.get("return_tokens", True):
            return {"input_ids": input_ids.tolist()}

        # Default path: map ids back to token strings for readability.
        return {"tokens": self.tokenizer.convert_ids_to_tokens(input_ids)}
40
+
41
# Key step: build and export the module-level pipeline instance.
# NOTE(review): transformers' Pipeline.__init__ declares a required `model`
# argument; constructing with only `tokenizer=` likely raises TypeError at
# import time — confirm against the installed transformers version.
tokenizer = AutoTokenizer.from_pretrained(".")
pipeline = TokenizerPipeline(tokenizer=tokenizer)

# Optional: accessor with a return-type hint for HF to parse.
def get_pipeline() -> Pipeline:
    # Returns the module-level singleton created above.
    return pipeline