lllouo committed on
Commit
bb4c1e0
·
1 Parent(s): a7581b9
Files changed (2) hide show
  1. app.py +148 -28
  2. requirements.txt +5 -2
app.py CHANGED
@@ -10,18 +10,25 @@ import re
10
  import spacy
11
  from spellchecker import SpellChecker
12
  import difflib
 
 
13
 
14
  # ======================== 新增:WAC-GEC导入 ========================
15
  try:
16
  from whitespace_correction import WhitespaceCorrector
17
  WAC_GEC_AVAILABLE = True
18
- # 初始化WAC-GEC模型(使用CPU,HF Space通常没有GPU)
19
- wac_corrector = None # 延迟初始化
20
  except ImportError:
21
  WAC_GEC_AVAILABLE = False
22
  wac_corrector = None
23
  print("⚠️ whitespace_correction未安装,WAC-GEC功能将不可用")
24
 
 
 
 
 
 
25
  # ======================== API配置 ========================
26
  DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
27
  DEEPSEEK_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
@@ -71,38 +78,126 @@ Next, please correct the following sentence according to the above requirements.
71
 
72
  [input]: """
73
 
74
- # ======================== 新增:WAC-GEC初始化函数 ========================
75
  def initialize_wac_gec():
76
- """延迟初始化WAC-GEC模型"""
77
- global wac_corrector
 
 
78
  if not WAC_GEC_AVAILABLE:
 
79
  return False
80
 
81
  if wac_corrector is None:
82
  try:
83
- device = "cpu" # HF Space默认使用CPU
84
  wac_corrector = WhitespaceCorrector.from_pretrained(
85
  model="eo_larger_byte",
86
  device=device,
87
- download_dir="./models"
88
  )
89
- print(f"✅ WAC-GEC模型已加载 (设备: {device})")
90
- return True
91
  except Exception as e:
92
- print(f"❌ WAC-GEC模型加载失败: {e}")
93
  return False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  return True
95
 
96
- # ======================== 新增:WAC-GEC处理函数 ========================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  def call_wac_gec(text):
98
- """使用WAC-GEC纠正空白符错误"""
 
 
 
 
99
  if not initialize_wac_gec():
100
  raise ValueError("⚠️ WAC-GEC模型未安装或加载失败")
101
 
102
  try:
103
- corrected = wac_corrector.correct_text(text)
 
 
 
 
 
 
 
 
 
104
  # 格式化输出以匹配DeepSeek的格式
105
- return f"[output]: {corrected}"
 
106
  except Exception as e:
107
  raise Exception(f"WAC-GEC处理错误: {str(e)}")
108
 
@@ -483,7 +578,7 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
483
  log_text += f" 变化: {delta_sed:+.2f}% {'✅ 改善' if delta_sed < 0 else '⚠️ 增加'}\n"
484
 
485
  if model_choice == "WAC-GEC":
486
- log_text += f"\n💡 注意: WAC-GEC仅修正空白符错误,不修拼写和语法错误\n"
487
 
488
  log_text += f"{'='*50}\n"
489
 
@@ -510,11 +605,17 @@ ABOUT_TEXT = """
510
  - **优势**: 综合性强,能处理多种类型的错误
511
  - **配置**: 需要在Space Settings中配置DEEPSEEK_API_KEY
512
 
513
- #### 2. WAC-GEC (Whitespace Correction)
514
- - **功能**: 专注于空白符错误纠正(多余空格、缺失空格等)
515
- - **优势**: 轻量级,无需API密钥,处理速度快
516
- - **限制**: 仅修正空白符错误,不处理拼写和语法问题
517
- - **适用场景**: 数据集中主要存在空白符异常的情况
 
 
 
 
 
 
518
 
519
  ### 核心算法
520
 
@@ -525,7 +626,10 @@ ABOUT_TEXT = """
525
 
526
  2. **模型去噪**
527
  - **DeepSeek**: 使用API进行全面错误修正,重试机制最多5次
528
- - **WAC-GEC**: 使用本地模型进行空白符纠正,重试机制最多3次
 
 
 
529
 
530
  3. **格式验证**
531
  - 验证输出格式正确性
@@ -555,7 +659,9 @@ ABOUT_TEXT = """
555
  ### 技术栈
556
 
557
  - **LLM**: DeepSeek API (deepseek-r1-distill-llama-8b)
558
- - **本地模型**: WAC-GEC (Whitespace Correction)
 
 
559
  - **前端**: Gradio 4.16.0
560
  - **数据处理**: Pandas + PyArrow (Parquet)
561
  - **差异对比**: Python difflib
@@ -570,14 +676,28 @@ ABOUT_TEXT = """
570
 
571
  ### 模型选择建议
572
 
573
- - **需要全面去噪**: 选择 DeepSeek-R1
574
- - **仅需修正空格**: 选择 WAC-GEC(更快,无需API)
575
- - **预算有限**: 优先使用 WAC-GEC
576
- - **追求最佳效果**: 使用 DeepSeek-R1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
577
 
578
  ---
579
 
580
- **研究生毕业论文成果展示** | Powered by DeepSeek API & WAC-GEC
581
  """
582
 
583
  # ======================== Gradio界面 ========================
@@ -678,7 +798,7 @@ with demo:
678
  choices=["deepseek-r1-distill-llama-8b", "WAC-GEC"],
679
  value="deepseek-r1-distill-llama-8b",
680
  label="🤖 选择模型",
681
- info="DeepSeek: 全面纠错 | WAC-GEC: 空白符纠正"
682
  )
683
 
684
  temperature = gr.Slider(
 
10
  import spacy
11
  from spellchecker import SpellChecker
12
  import difflib
13
+ import torch
14
+ from transformers import AutoTokenizer, AutoModelForCausalLM
15
 
16
  # ======================== 新增:WAC-GEC导入 ========================
17
  try:
18
  from whitespace_correction import WhitespaceCorrector
19
  WAC_GEC_AVAILABLE = True
20
+ # 初始化WAC-GEC模型(延迟加载)
21
+ wac_corrector = None
22
  except ImportError:
23
  WAC_GEC_AVAILABLE = False
24
  wac_corrector = None
25
  print("⚠️ whitespace_correction未安装,WAC-GEC功能将不可用")
26
 
27
+ # 初始化GEC模型(延迟加载)
28
+ gec_tokenizer = None
29
+ gec_model = None
30
+ GEC_MODEL_NAME = "lllouo/gec_Chat-LLaMa-2-7B-FT" # 你的HF模型地址
31
+
32
  # ======================== API配置 ========================
33
  DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
34
  DEEPSEEK_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
 
78
 
79
  [input]: """
80
 
81
+ # ======================== 新增:初始化函数(WAC + GEC) ========================
82
  def initialize_wac_gec():
83
+ """延迟初始化WAC-GEC模型(空白符纠正 + 语法纠错)"""
84
+ global wac_corrector, gec_tokenizer, gec_model
85
+
86
+ # 1. 初始化WAC(空白符纠正)
87
  if not WAC_GEC_AVAILABLE:
88
+ print("❌ WAC模块未安装")
89
  return False
90
 
91
  if wac_corrector is None:
92
  try:
93
+ device = "cuda" if torch.cuda.is_available() else "cpu"
94
  wac_corrector = WhitespaceCorrector.from_pretrained(
95
  model="eo_larger_byte",
96
  device=device,
97
+ download_dir="./models/wac"
98
  )
99
+ print(f"✅ WAC空白符纠正模型已加载 (设备: {device})")
 
100
  except Exception as e:
101
+ print(f"❌ WAC模型加载失败: {e}")
102
  return False
103
+
104
+ # 2. 初始化GEC(语法纠错)
105
+ if gec_model is None or gec_tokenizer is None:
106
+ try:
107
+ device = "cuda" if torch.cuda.is_available() else "cpu"
108
+
109
+ print(f"📥 正在从HuggingFace下载GEC模型: {GEC_MODEL_NAME}")
110
+ gec_tokenizer = AutoTokenizer.from_pretrained(
111
+ GEC_MODEL_NAME,
112
+ trust_remote_code=True
113
+ )
114
+ gec_model = AutoModelForCausalLM.from_pretrained(
115
+ GEC_MODEL_NAME,
116
+ device_map="auto" if device == "cuda" else None,
117
+ torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
118
+ trust_remote_code=True
119
+ )
120
+
121
+ # 如果是CPU模式,手动移动模型
122
+ if device == "cpu":
123
+ gec_model = gec_model.to(device)
124
+
125
+ # 设置tokenizer的pad_token和padding_side
126
+ gec_tokenizer.pad_token_id = gec_tokenizer.eos_token_id
127
+ gec_tokenizer.padding_side = "left"
128
+
129
+ print(f"✅ GEC语法纠错模型已加载 (设备: {device})")
130
+
131
+ except Exception as e:
132
+ print(f"❌ GEC模型加载失败: {e}")
133
+ return False
134
+
135
  return True
136
 
137
+ # ======================== 新增:GEC语法纠错函数 ========================
138
def correct_sentence_gec(input_sentence):
    """Correct the grammar of one sentence with the GEC language model.

    Args:
        input_sentence (str): The sentence to correct.

    Returns:
        str: The text generated after the prompt, stripped of surrounding
        whitespace and of a leading "Corrected:" label if the model
        repeats it.

    Raises:
        ValueError: If the GEC model or tokenizer has not been loaded.
    """
    if gec_model is None or gec_tokenizer is None:
        raise ValueError("GEC模型未初始化")

    # Build the prompt line by line so its content does not depend on how
    # this source file happens to be indented.
    prompt = (
        "Rewrite the following sentence to correct grammatical errors. "
        "Return ONLY the corrected sentence.\n"
        f"Original: {input_sentence}\n"
        "Corrected:"
    )

    inputs = gec_tokenizer(prompt, return_tensors="pt").to(gec_model.device)

    with torch.no_grad():
        outputs = gec_model.generate(
            **inputs,
            max_new_tokens=512,
            num_beams=4,
            do_sample=False,
            temperature=None,
            top_p=None
        )

    # The returned sequence starts with the prompt tokens. Drop them by
    # length instead of string-replacing the decoded prompt: decoding does
    # not always reproduce the prompt text exactly, and a failed replace
    # would leave the whole prompt inside the "corrected" sentence.
    prompt_length = inputs["input_ids"].shape[1]
    generated_tokens = outputs[0][prompt_length:]
    corrected_text = gec_tokenizer.decode(
        generated_tokens, skip_special_tokens=True
    ).strip()

    # The model may repeat the label before its answer.
    if corrected_text.startswith("Corrected:"):
        corrected_text = corrected_text[len("Corrected:"):].strip()

    return corrected_text
176
+
177
+ # ======================== 新增:WAC-GEC组合处理函数 ========================
178
  def call_wac_gec(text):
179
+ """
180
+ 使用WAC-GEC两步纠正:
181
+ 1. GEC模型进行语法和拼写纠正
182
+ 2. WAC模型进行空白符纠正
183
+ """
184
  if not initialize_wac_gec():
185
  raise ValueError("⚠️ WAC-GEC模型未安装或加载失败")
186
 
187
  try:
188
+ # Step 1: 使用GEC模型进行语法纠错
189
+ print(f"🔍 GEC处理: {text[:50]}...")
190
+ gec_corrected = correct_sentence_gec(text)
191
+ print(f"✅ GEC结果: {gec_corrected[:50]}...")
192
+
193
+ # Step 2: 使用WAC模型进行空白符纠正
194
+ print(f"🔍 WAC处理: {gec_corrected[:50]}...")
195
+ final_corrected = wac_corrector.correct_text(gec_corrected)
196
+ print(f"✅ WAC结果: {final_corrected[:50]}...")
197
+
198
  # 格式化输出以匹配DeepSeek的格式
199
+ return f"[output]: {final_corrected}"
200
+
201
  except Exception as e:
202
  raise Exception(f"WAC-GEC处理错误: {str(e)}")
203
 
 
578
  log_text += f" 变化: {delta_sed:+.2f}% {'✅ 改善' if delta_sed < 0 else '⚠️ 增加'}\n"
579
 
580
  if model_choice == "WAC-GEC":
581
+ log_text += f"\n💡 注意: WAC-GEC使用两步纠正(GEC语法纠错 + WAC空白符)\n"
582
 
583
  log_text += f"{'='*50}\n"
584
 
 
605
  - **优势**: 综合性强,能处理多种类型的错误
606
  - **配置**: 需要在Space Settings中配置DEEPSEEK_API_KEY
607
 
608
+ #### 2. WAC-GEC (Whitespace + Grammar Error Correction)
609
+ - **功能**: 两步纠正流程
610
+ - **Step 1 (GEC)**: 使用LLaMA-2-7B微调模型进行语法和拼写纠错
611
+ - **Step 2 (WAC)**: 使用空白符纠正模型修正空格问题
612
+ - **优势**:
613
+ - 完全本地化,无需API密钥
614
+ - 组合两个专门模型,各司其职
615
+ - 适合离线环境和预算有限的场景
616
+ - **模型来源**:
617
+ - GEC: [lllouo/gec_Chat-LLaMa-2-7B-FT](https://huggingface.co/lllouo/gec_Chat-LLaMa-2-7B-FT)
618
+ - WAC: whitespace_correction库
619
 
620
  ### 核心算法
621
 
 
626
 
627
  2. **模型去噪**
628
  - **DeepSeek**: 使用API进行全面错误修正,重试机制最多5次
629
+ - **WAC-GEC**:
630
+ - 先使用GEC模型进行语法和拼写纠正
631
+ - 再使用WAC模型进行空白符纠正
632
+ - 重试机制最多3次
633
 
634
  3. **格式验证**
635
  - 验证输出格式正确性
 
659
  ### 技术栈
660
 
661
  - **LLM**: DeepSeek API (deepseek-r1-distill-llama-8b)
662
+ - **本地模型**:
663
+ - GEC: LLaMA-2-7B (微调于语法纠错任务)
664
+ - WAC: Whitespace Correction Model
665
  - **前端**: Gradio 4.16.0
666
  - **数据处理**: Pandas + PyArrow (Parquet)
667
  - **差异对比**: Python difflib
 
676
 
677
  ### 模型选择建议
678
 
679
+ - **需要全面去噪 + 有API预算**: 选择 DeepSeek-R1
680
+ - **本地化部署 + 完整纠错**: 选择 WAC-GEC(推荐)
681
+ - **仅需修正空格**: 单独使用WAC模块
682
+ - **追求最快速度**: 使用GPU加速的WAC-GEC
683
+
684
+ ### WAC-GEC处理流程示例
685
+
686
+ ```
687
+ 原始文本: "This is anexample with spellingerorr."
688
+
689
+ [Step 1: GEC语法纠错]
690
+
691
+ 中间结果: "This is an example with spelling error."
692
+
693
+ [Step 2: WAC空白符纠正]
694
+
695
+ 最终结果: "This is an example with spelling error."
696
+ ```
697
 
698
  ---
699
 
700
+ **研究生毕业论文成果展示** | Powered by DeepSeek API & WAC-GEC (LLaMA-2-7B + Whitespace Correction)
701
  """
702
 
703
  # ======================== Gradio界面 ========================
 
798
  choices=["deepseek-r1-distill-llama-8b", "WAC-GEC"],
799
  value="deepseek-r1-distill-llama-8b",
800
  label="🤖 选择模型",
801
+ info="DeepSeek: 全面纠错 | WAC-GEC: 语法+空白符纠正(本地模型)"
802
  )
803
 
804
  temperature = gr.Slider(
requirements.txt CHANGED
@@ -1,8 +1,11 @@
1
  gradio
2
- requests
3
  pandas
4
  pyarrow
 
5
  openai
6
  spacy
7
  pyspellchecker
8
- whitespace-correction
 
 
 
 
1
  gradio
 
2
  pandas
3
  pyarrow
4
+ requests
5
  openai
6
  spacy
7
  pyspellchecker
8
+ torch
9
+ transformers
10
+ accelerate
11
+ whitespace_correction