Wind-xixi committed on
Commit
00fb643
·
verified ·
1 Parent(s): 449395a

Update predictor.py

Browse files
Files changed (1) hide show
  1. predictor.py +104 -76
predictor.py CHANGED
@@ -1,87 +1,115 @@
 
 
1
  import json
2
- import re
3
- from typing import List, Dict, Tuple
4
  import numpy as np
5
- from onnxruntime import InferenceSession
 
 
 
 
 
 
 
 
 
 
 
6
 
7
# Load the ONNX model once at import time; predict_grade runs every
# sentence through this module-level session.
MODEL_SESSION = InferenceSession('model_quantized.onnx')
9
 
10
# Letter grade for each predicted class index (0 -> 'A' ... 4 -> 'E').
GRADE_MAPPING = dict(enumerate('ABCDE'))
18
 
19
def analyze_text_with_keywords(text: str, keywords_data: Dict) -> List[str]:
    """
    Extract the sentences of ``text`` that mention at least one keyword.

    Args:
        text: Text to analyse. It is split into sentences on ``。``, ``!``,
            ``?`` and ``;`` (ASCII or full-width forms).
        keywords_data: Mapping whose values are iterables of keyword
            strings (category -> keywords). The keys are not used.

    Returns:
        The matching sentences, stripped of surrounding whitespace, in
        their original order. Sentences of five characters or fewer are
        ignored. Each sentence occurrence is returned at most once, even
        when it matches keywords from several categories.
    """
    # Accept both ASCII and full-width terminators so Chinese punctuation
    # splits sentences as well.
    sentences = [part.strip() for part in re.split(r'[。!?;！？；]', text)]

    # Flatten the categories once instead of walking the nested structure
    # for every sentence.
    all_keywords = [
        keyword
        for category in keywords_data.values()
        for keyword in category
    ]

    # any() stops at the first hit, so a sentence is kept exactly once no
    # matter how many categories it matches.
    return [
        sentence
        for sentence in sentences
        if len(sentence) > 5
        and any(keyword in sentence for keyword in all_keywords)
    ]
38
-
39
def preprocess_text(sentence: str) -> np.ndarray:
    """
    Build the model input array for one sentence.

    NOTE: this is a placeholder. The sentence is currently ignored and a
    random float32 array of shape (1, 768) is returned; replace the body
    with the real tokenizer logic for the model in use.
    """
    placeholder = np.random.rand(1, 768)
    return placeholder.astype(np.float32)
46
 
47
def predict_grade(sentences: List[str]) -> Tuple[str, List[str]]:
    """
    Grade each sentence with the ONNX model and derive an overall grade.

    Args:
        sentences: Sentences to grade; the model is run once per sentence.

    Returns:
        ``(final_grade, sentence_grades)``: the overall letter grade and the
        per-sentence letter grades in input order. Returns ``("N/A", [])``
        when ``sentences`` is empty. Class indices missing from
        ``GRADE_MAPPING`` fall back to ``'E'``.
    """
    if not sentences:
        return "N/A", []

    class_indices = []
    for sentence in sentences:
        inputs = preprocess_text(sentence)
        outputs = MODEL_SESSION.run(None, {'input': inputs})
        # Store a plain int rather than a numpy scalar.
        class_indices.append(int(np.argmax(outputs[0])))

    sentence_grades = [GRADE_MAPPING.get(index, 'E') for index in class_indices]

    # round() sends halves to the nearest even integer, so tied averages
    # (x.5) used to move up or down depending on x (0.5 -> 0, 1.5 -> 2,
    # 2.5 -> 2). Resolve every tie the same way: toward the lower class
    # index, i.e. the better grade.
    average_index = float(np.mean(class_indices))
    final_index = int(np.ceil(average_index - 0.5))
    final_grade = GRADE_MAPPING.get(final_index, 'E')

    return final_grade, sentence_grades
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
# Manual smoke test: extract keyword sentences from a sample text and grade them.
if __name__ == '__main__':
    test_text = "这个学生表现很好。创新性不足。逻辑清晰。完成度一般。"

    # The keyword file is expected in the current working directory.
    with open('evaluation_keywords2.json', encoding='utf-8') as keywords_file:
        keywords = json.load(keywords_file)

    relevant = analyze_text_with_keywords(test_text, keywords)
    print(f"匹配的句子: {relevant}")

    grade, details = predict_grade(relevant)
    for line in (f"最终等级: {grade}", f"详细评分: {details}"):
        print(line)
 
1
+ # predictor.py
2
+
3
  import json
 
 
4
  import numpy as np
5
+ import onnxruntime as ort
6
+ from transformers import BertTokenizer
7
+ import re
8
+
9
class Predictor:
    """
    Keyword-driven text grader backed by an ONNX classifier.

    Sentences that mention a keyword are graded one by one by the model
    (labels ``a``-``e``); the per-sentence scores are averaged and mapped
    back to a single overall grade.
    """

    # Sentence terminators, accepted in both ASCII and full-width forms.
    _SENTENCE_BREAK = re.compile(r'[。!?！？]')

    def __init__(self, tokenizer_dir='.', model_path='model_quantized.onnx',
                 keywords_path='evaluation_keywords2.json'):
        """
        Load the tokenizer, the ONNX session and the keyword set once.

        Args:
            tokenizer_dir: Directory holding the BERT tokenizer files.
            model_path: Path of the ONNX model file.
            keywords_path: Path of the UTF-8 JSON keyword file.

        The defaults match the files' original locations in the current
        working directory.
        """
        # 1. Tokenizer.
        self.tokenizer = BertTokenizer.from_pretrained(tokenizer_dir)

        # 2. ONNX inference session.
        self.ort_session = ort.InferenceSession(model_path)

        # 3. Keyword set.
        with open(keywords_path, 'r', encoding='utf-8') as f:
            self.keywords = json.load(f)

        # 4. Label mappings; label2score is used to average the grades.
        self.id2label = {0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e'}
        self.label2score = {'a': 5, 'b': 4, 'c': 3, 'd': 2, 'e': 1}

    def _keyword_list(self):
        """Return the loaded keywords as a flat list of non-empty strings."""
        data = self.keywords
        # A dict is treated as category -> keywords: the keywords are the
        # values, not the category names. Anything else is one flat group.
        groups = data.values() if isinstance(data, dict) else [data]

        flat = []
        for group in groups:
            if isinstance(group, str):
                flat.append(group)
            else:
                flat.extend(group)

        # An empty keyword would match every sentence, so drop it.
        return [kw for kw in flat if isinstance(kw, str) and kw]

    def _extract_relevant_sentences(self, text):
        """
        Return the sentences of ``text`` that contain at least one keyword.

        Sentences are stripped of surrounding whitespace; blank ones are
        skipped. Each sentence is returned at most once, in input order.
        """
        if not text:
            return []

        keywords = self._keyword_list()
        relevant_sentences = []
        for raw in self._SENTENCE_BREAK.split(text):
            sentence = raw.strip()
            if sentence and any(kw in sentence for kw in keywords):
                relevant_sentences.append(sentence)
        return relevant_sentences

    def _predict_single_sentence(self, sentence):
        """Run the model on one sentence and return its grade label."""
        encoded = self.tokenizer(sentence, return_tensors="np", padding='max_length', truncation=True, max_length=128)

        # Feed every input the model declares that the tokenizer produced
        # (e.g. attention_mask), not just the first one; a model with
        # several required inputs cannot run on input_ids alone.
        declared = [model_input.name for model_input in self.ort_session.get_inputs()]
        ort_inputs = {name: encoded[name] for name in declared if name in encoded}

        # Keep the previous behaviour for models whose first input has a
        # custom name: it receives the token ids.
        if declared and declared[0] not in ort_inputs:
            ort_inputs[declared[0]] = encoded['input_ids']

        ort_outs = self.ort_session.run(None, ort_inputs)

        # Highest-scoring class of the single row in the batch.
        prediction = int(np.argmax(ort_outs[0], axis=1)[0])
        return self.id2label[prediction]

    @staticmethod
    def _grade_from_average(average_score):
        """Map an average score (5 = best, 1 = worst) to a grade, halves rounding up."""
        for threshold, grade in ((4.5, "a"), (3.5, "b"), (2.5, "c"), (1.5, "d")):
            if average_score >= threshold:
                return grade
        return "e"

    def predict(self, text):
        """
        Run the full pipeline: extract sentences, grade each, average.

        This is the main entry point for callers.

        Returns:
            A dict with ``grade`` (``'a'``-``'e'``), ``summary`` (a text
            summary) and ``analyzed_sentences_count``. When no sentence
            contains a keyword, the grade defaults to ``'c'`` and the count
            is 0.
        """
        # Step 1: keep only the sentences that mention a keyword.
        relevant_sentences = self._extract_relevant_sentences(text)

        if not relevant_sentences:
            return {
                "grade": "c",  # neutral middle grade when nothing can be analysed
                "summary": "文本中未检测到可用于评价的关键词句,无法进行有效分析。",
                "analyzed_sentences_count": 0
            }

        # Step 2: score every relevant sentence. The list is never empty
        # here, so no separate "no scores" fallback is needed.
        scores = [
            self.label2score[self._predict_single_sentence(sentence)]
            for sentence in relevant_sentences
        ]

        # Step 3: average the scores and map the result to a grade.
        average_score = sum(scores) / len(scores)
        final_grade = self._grade_from_average(average_score)

        # Step 4: build the summary text.
        summary = f"系统分析了 {len(relevant_sentences)} 个关键句子,综合评定等级为“{final_grade.upper()}”。"

        return {
            "grade": final_grade,
            "summary": summary,
            "analyzed_sentences_count": len(relevant_sentences)
        }
114
+
115