Spaces:
Sleeping
Sleeping
WAC-GEC
Browse files- app.py +148 -28
- requirements.txt +5 -2
app.py
CHANGED
|
@@ -10,18 +10,25 @@ import re
|
|
| 10 |
import spacy
|
| 11 |
from spellchecker import SpellChecker
|
| 12 |
import difflib
|
|
|
|
|
|
|
| 13 |
|
| 14 |
# ======================== 新增:WAC-GEC导入 ========================
|
| 15 |
try:
|
| 16 |
from whitespace_correction import WhitespaceCorrector
|
| 17 |
WAC_GEC_AVAILABLE = True
|
| 18 |
-
# 初始化WAC-GEC模型(
|
| 19 |
-
wac_corrector = None
|
| 20 |
except ImportError:
|
| 21 |
WAC_GEC_AVAILABLE = False
|
| 22 |
wac_corrector = None
|
| 23 |
print("⚠️ whitespace_correction未安装,WAC-GEC功能将不可用")
|
| 24 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
# ======================== API配置 ========================
|
| 26 |
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
|
| 27 |
DEEPSEEK_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
|
|
@@ -71,38 +78,126 @@ Next, please correct the following sentence according to the above requirements.
|
|
| 71 |
|
| 72 |
[input]: """
|
| 73 |
|
| 74 |
-
# ======================== 新增:
|
| 75 |
def initialize_wac_gec():
|
| 76 |
-
"""延迟初始化WAC-GEC模型"""
|
| 77 |
-
global wac_corrector
|
|
|
|
|
|
|
| 78 |
if not WAC_GEC_AVAILABLE:
|
|
|
|
| 79 |
return False
|
| 80 |
|
| 81 |
if wac_corrector is None:
|
| 82 |
try:
|
| 83 |
-
device = "
|
| 84 |
wac_corrector = WhitespaceCorrector.from_pretrained(
|
| 85 |
model="eo_larger_byte",
|
| 86 |
device=device,
|
| 87 |
-
download_dir="./models"
|
| 88 |
)
|
| 89 |
-
print(f"✅ WAC
|
| 90 |
-
return True
|
| 91 |
except Exception as e:
|
| 92 |
-
print(f"❌ WAC
|
| 93 |
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
return True
|
| 95 |
|
| 96 |
-
# ======================== 新增:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
def call_wac_gec(text):
|
| 98 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
if not initialize_wac_gec():
|
| 100 |
raise ValueError("⚠️ WAC-GEC模型未安装或加载失败")
|
| 101 |
|
| 102 |
try:
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
# 格式化输出以匹配DeepSeek的格式
|
| 105 |
-
return f"[output]: {
|
|
|
|
| 106 |
except Exception as e:
|
| 107 |
raise Exception(f"WAC-GEC处理错误: {str(e)}")
|
| 108 |
|
|
@@ -483,7 +578,7 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
|
|
| 483 |
log_text += f" 变化: {delta_sed:+.2f}% {'✅ 改善' if delta_sed < 0 else '⚠️ 增加'}\n"
|
| 484 |
|
| 485 |
if model_choice == "WAC-GEC":
|
| 486 |
-
log_text += f"\n💡 注意: WAC-GEC
|
| 487 |
|
| 488 |
log_text += f"{'='*50}\n"
|
| 489 |
|
|
@@ -510,11 +605,17 @@ ABOUT_TEXT = """
|
|
| 510 |
- **优势**: 综合性强,能处理多种类型的错误
|
| 511 |
- **配置**: 需要在Space Settings中配置DEEPSEEK_API_KEY
|
| 512 |
|
| 513 |
-
#### 2. WAC-GEC (Whitespace Correction)
|
| 514 |
-
- **功能**:
|
| 515 |
-
- **
|
| 516 |
-
- **
|
| 517 |
-
- **
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 518 |
|
| 519 |
### 核心算法
|
| 520 |
|
|
@@ -525,7 +626,10 @@ ABOUT_TEXT = """
|
|
| 525 |
|
| 526 |
2. **模型去噪**
|
| 527 |
- **DeepSeek**: 使用API进行全面错误修正,重试机制最多5次
|
| 528 |
-
- **WAC-GEC**:
|
|
|
|
|
|
|
|
|
|
| 529 |
|
| 530 |
3. **格式验证**
|
| 531 |
- 验证输出格式正确性
|
|
@@ -555,7 +659,9 @@ ABOUT_TEXT = """
|
|
| 555 |
### 技术栈
|
| 556 |
|
| 557 |
- **LLM**: DeepSeek API (deepseek-r1-distill-llama-8b)
|
| 558 |
-
- **本地模型**:
|
|
|
|
|
|
|
| 559 |
- **前端**: Gradio 4.16.0
|
| 560 |
- **数据处理**: Pandas + PyArrow (Parquet)
|
| 561 |
- **差异对比**: Python difflib
|
|
@@ -570,14 +676,28 @@ ABOUT_TEXT = """
|
|
| 570 |
|
| 571 |
### 模型选择建议
|
| 572 |
|
| 573 |
-
- **需要全面去噪**: 选择 DeepSeek-R1
|
| 574 |
-
- **
|
| 575 |
-
- **
|
| 576 |
-
- **追求最
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 577 |
|
| 578 |
---
|
| 579 |
|
| 580 |
-
**研究生毕业论文成果展示** | Powered by DeepSeek API & WAC-GEC
|
| 581 |
"""
|
| 582 |
|
| 583 |
# ======================== Gradio界面 ========================
|
|
@@ -678,7 +798,7 @@ with demo:
|
|
| 678 |
choices=["deepseek-r1-distill-llama-8b", "WAC-GEC"],
|
| 679 |
value="deepseek-r1-distill-llama-8b",
|
| 680 |
label="🤖 选择模型",
|
| 681 |
-
info="DeepSeek: 全面纠错 | WAC-GEC:
|
| 682 |
)
|
| 683 |
|
| 684 |
temperature = gr.Slider(
|
|
|
|
| 10 |
import spacy
|
| 11 |
from spellchecker import SpellChecker
|
| 12 |
import difflib
|
| 13 |
+
import torch
|
| 14 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 15 |
|
| 16 |
# ======================== 新增:WAC-GEC导入 ========================
|
| 17 |
try:
|
| 18 |
from whitespace_correction import WhitespaceCorrector
|
| 19 |
WAC_GEC_AVAILABLE = True
|
| 20 |
+
# 初始化WAC-GEC模型(延迟加载)
|
| 21 |
+
wac_corrector = None
|
| 22 |
except ImportError:
|
| 23 |
WAC_GEC_AVAILABLE = False
|
| 24 |
wac_corrector = None
|
| 25 |
print("⚠️ whitespace_correction未安装,WAC-GEC功能将不可用")
|
| 26 |
|
| 27 |
+
# 初始化GEC模型(延迟加载)
|
| 28 |
+
gec_tokenizer = None
|
| 29 |
+
gec_model = None
|
| 30 |
+
GEC_MODEL_NAME = "lllouo/gec_Chat-LLaMa-2-7B-FT" # 你的HF模型地址
|
| 31 |
+
|
| 32 |
# ======================== API配置 ========================
|
| 33 |
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
|
| 34 |
DEEPSEEK_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
|
|
|
|
| 78 |
|
| 79 |
[input]: """
|
| 80 |
|
| 81 |
+
# ======================== 新增:初始化函数(WAC + GEC) ========================
|
| 82 |
def initialize_wac_gec():
    """Lazily load the two models used by the WAC-GEC pipeline.

    The whitespace corrector (WAC) and the grammar-correction causal LM with
    its tokenizer (GEC) are loaded on first use and cached in module globals.
    Load failures are printed rather than raised.

    Returns:
        bool: True when the whitespace corrector, the GEC tokenizer and the
        GEC model are all available; False when the ``whitespace_correction``
        package is missing or any load step failed.
    """
    global wac_corrector, gec_tokenizer, gec_model

    # 1. WAC (whitespace correction) depends on an optional package.
    if not WAC_GEC_AVAILABLE:
        print("❌ WAC模块未安装")
        return False

    if wac_corrector is None:
        try:
            device = "cuda" if torch.cuda.is_available() else "cpu"
            wac_corrector = WhitespaceCorrector.from_pretrained(
                model="eo_larger_byte",
                device=device,
                download_dir="./models/wac",
            )
            print(f"✅ WAC空白符纠正模型已加载 (设备: {device})")
        except Exception as e:
            print(f"❌ WAC模型加载失败: {e}")
            return False

    # 2. GEC (grammar error correction).
    if gec_model is None or gec_tokenizer is None:
        try:
            device = "cuda" if torch.cuda.is_available() else "cpu"

            print(f"📥 正在从HuggingFace下载GEC模型: {GEC_MODEL_NAME}")
            # Build everything in locals first: the globals are only assigned
            # once every step below has succeeded, so a failure part-way
            # through can never leave a half-configured tokenizer/model pair
            # that a later call would mistake for a finished initialization.
            # NOTE(review): trust_remote_code=True executes code shipped in
            # the model repository - confirm it is actually required.
            tokenizer = AutoTokenizer.from_pretrained(
                GEC_MODEL_NAME,
                trust_remote_code=True,
            )
            model = AutoModelForCausalLM.from_pretrained(
                GEC_MODEL_NAME,
                device_map="auto" if device == "cuda" else None,
                torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
                trust_remote_code=True,
            )

            # Without a device_map (CPU mode) the model is moved manually.
            if device == "cpu":
                model = model.to(device)

            # Reuse EOS as the padding token and pad on the left.
            tokenizer.pad_token_id = tokenizer.eos_token_id
            tokenizer.padding_side = "left"
        except Exception as e:
            print(f"❌ GEC模型加载失败: {e}")
            return False

        gec_tokenizer = tokenizer
        gec_model = model
        print(f"✅ GEC语法纠错模型已加载 (设备: {device})")

    return True
|
| 136 |
|
| 137 |
+
# ======================== 新增:GEC语法纠错函数 ========================
|
| 138 |
+
def correct_sentence_gec(input_sentence):
    """Correct the grammar of one sentence with the GEC causal language model.

    Args:
        input_sentence (str): The sentence to correct.

    Returns:
        str: The corrected sentence. Empty or whitespace-only input is
        returned unchanged because there is nothing to correct.

    Raises:
        ValueError: If the GEC model or tokenizer has not been initialized.
    """
    if gec_model is None or gec_tokenizer is None:
        raise ValueError("GEC模型未初始化")

    # Nothing to correct: avoid running generation on an empty prompt slot.
    if not input_sentence or not input_sentence.strip():
        return input_sentence or ""

    # Build the instruction prompt.
    prompt = (
        "Rewrite the following sentence to correct grammatical errors. "
        "Return ONLY the corrected sentence.\n"
        f"Original: {input_sentence}\n"
        "Corrected:"
    )

    inputs = gec_tokenizer(prompt, return_tensors="pt").to(gec_model.device)
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        outputs = gec_model.generate(
            **inputs,
            max_new_tokens=512,
            num_beams=4,
            do_sample=False,
            temperature=None,
            top_p=None,
        )

    # The generated sequence starts with the prompt tokens. Decode only what
    # follows them instead of string-replacing the prompt in the decoded text:
    # decoding is not guaranteed to reproduce the prompt character for
    # character, in which case the replace would silently leave it in place.
    generated_ids = outputs[0][prompt_length:]
    corrected_text = gec_tokenizer.decode(
        generated_ids, skip_special_tokens=True
    ).strip()

    # Drop a repeated "Corrected:" label if the model echoed it.
    if corrected_text.startswith("Corrected:"):
        corrected_text = corrected_text[len("Corrected:"):].strip()

    return corrected_text
|
| 176 |
+
|
| 177 |
+
# ======================== 新增:WAC-GEC组合处理函数 ========================
|
| 178 |
def call_wac_gec(text):
    """Run the two-step WAC-GEC correction on ``text``.

    Step 1 passes the text through the GEC model (``correct_sentence_gec``);
    step 2 passes that result through the whitespace corrector. Progress is
    printed with the first 50 characters of each intermediate value.

    Args:
        text (str): The text to correct.

    Returns:
        str: The corrected text prefixed with ``"[output]: "``, the same
        format the DeepSeek path produces.

    Raises:
        ValueError: If the models are not installed or failed to load.
        Exception: If either correction step fails; the original error is
            attached as ``__cause__``.
    """
    if not initialize_wac_gec():
        raise ValueError("⚠️ WAC-GEC模型未安装或加载失败")

    try:
        # Step 1: grammar correction with the GEC model.
        print(f"🔍 GEC处理: {text[:50]}...")
        gec_corrected = correct_sentence_gec(text)
        print(f"✅ GEC结果: {gec_corrected[:50]}...")

        # Step 2: whitespace correction with the WAC model.
        print(f"🔍 WAC处理: {gec_corrected[:50]}...")
        final_corrected = wac_corrector.correct_text(gec_corrected)
        print(f"✅ WAC结果: {final_corrected[:50]}...")

        # Format the result to match the DeepSeek output format.
        return f"[output]: {final_corrected}"

    except Exception as e:
        # Chain explicitly so the root cause stays attached to the wrapper.
        raise Exception(f"WAC-GEC处理错误: {e}") from e
|
| 203 |
|
|
|
|
| 578 |
log_text += f" 变化: {delta_sed:+.2f}% {'✅ 改善' if delta_sed < 0 else '⚠️ 增加'}\n"
|
| 579 |
|
| 580 |
if model_choice == "WAC-GEC":
|
| 581 |
+
log_text += f"\n💡 注意: WAC-GEC使用两步纠正(GEC语法纠错 + WAC空白符纠正)\n"
|
| 582 |
|
| 583 |
log_text += f"{'='*50}\n"
|
| 584 |
|
|
|
|
| 605 |
- **优势**: 综合性强,能处理多种类型的错误
|
| 606 |
- **配置**: 需要在Space Settings中配置DEEPSEEK_API_KEY
|
| 607 |
|
| 608 |
+
#### 2. WAC-GEC (Whitespace + Grammar Error Correction)
|
| 609 |
+
- **功能**: 两步纠正流程
|
| 610 |
+
- **Step 1 (GEC)**: 使用LLaMA-2-7B微调模型进行语法和拼写纠错
|
| 611 |
+
- **Step 2 (WAC)**: 使用空白符纠正模型修正空格问题
|
| 612 |
+
- **优势**:
|
| 613 |
+
- 完全本地化,无需API密钥
|
| 614 |
+
- 组合两个专门模型,各司其职
|
| 615 |
+
- 适合离线环境和预算有限的场景
|
| 616 |
+
- **模型来源**:
|
| 617 |
+
- GEC: [lllouo/gec_Chat-LLaMa-2-7B-FT](https://huggingface.co/lllouo/gec_Chat-LLaMa-2-7B-FT)
|
| 618 |
+
- WAC: whitespace_correction库
|
| 619 |
|
| 620 |
### 核心算法
|
| 621 |
|
|
|
|
| 626 |
|
| 627 |
2. **模型去噪**
|
| 628 |
- **DeepSeek**: 使用API进行全面错误修正,重试机制最多5次
|
| 629 |
+
- **WAC-GEC**:
|
| 630 |
+
- 先使用GEC模型进行语法和拼写纠正
|
| 631 |
+
- 再使用WAC模型进行空白符纠正
|
| 632 |
+
- 重试机制最多3次
|
| 633 |
|
| 634 |
3. **格式验证**
|
| 635 |
- 验证输出格式正确性
|
|
|
|
| 659 |
### 技术栈
|
| 660 |
|
| 661 |
- **LLM**: DeepSeek API (deepseek-r1-distill-llama-8b)
|
| 662 |
+
- **本地模型**:
|
| 663 |
+
- GEC: LLaMA-2-7B (微调于语法纠错任务)
|
| 664 |
+
- WAC: Whitespace Correction Model
|
| 665 |
- **前端**: Gradio 4.16.0
|
| 666 |
- **数据处理**: Pandas + PyArrow (Parquet)
|
| 667 |
- **差异对比**: Python difflib
|
|
|
|
| 676 |
|
| 677 |
### 模型选择建议
|
| 678 |
|
| 679 |
+
- **需要全面去噪 + 有API预算**: 选择 DeepSeek-R1
|
| 680 |
+
- **本地化部署 + 完整纠错**: 选择 WAC-GEC(推荐)
|
| 681 |
+
- **仅需修正空格**: 单独使用WAC模块
|
| 682 |
+
- **追求最快速度**: 使用GPU加速的WAC-GEC
|
| 683 |
+
|
| 684 |
+
### WAC-GEC处理流程示例
|
| 685 |
+
|
| 686 |
+
```
|
| 687 |
+
原始文本: "This is anexample with spellingerorr."
|
| 688 |
+
↓
|
| 689 |
+
[Step 1: GEC语法纠错]
|
| 690 |
+
↓
|
| 691 |
+
中间结果: "This is an example with spelling error."
|
| 692 |
+
↓
|
| 693 |
+
[Step 2: WAC空白符纠正]
|
| 694 |
+
↓
|
| 695 |
+
最终结果: "This is an example with spelling error."
|
| 696 |
+
```
|
| 697 |
|
| 698 |
---
|
| 699 |
|
| 700 |
+
**研究生毕业论文成果展示** | Powered by DeepSeek API & WAC-GEC (LLaMA-2-7B + Whitespace Correction)
|
| 701 |
"""
|
| 702 |
|
| 703 |
# ======================== Gradio界面 ========================
|
|
|
|
| 798 |
choices=["deepseek-r1-distill-llama-8b", "WAC-GEC"],
|
| 799 |
value="deepseek-r1-distill-llama-8b",
|
| 800 |
label="🤖 选择模型",
|
| 801 |
+
info="DeepSeek: 全面纠错 | WAC-GEC: 语法+空白符纠正(本地模型)"
|
| 802 |
)
|
| 803 |
|
| 804 |
temperature = gr.Slider(
|
requirements.txt
CHANGED
|
@@ -1,8 +1,11 @@
|
|
| 1 |
gradio
|
| 2 |
-
requests
|
| 3 |
pandas
|
| 4 |
pyarrow
|
|
|
|
| 5 |
openai
|
| 6 |
spacy
|
| 7 |
pyspellchecker
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
gradio
|
|
|
|
| 2 |
pandas
|
| 3 |
pyarrow
|
| 4 |
+
requests
|
| 5 |
openai
|
| 6 |
spacy
|
| 7 |
pyspellchecker
|
| 8 |
+
torch
|
| 9 |
+
transformers
|
| 10 |
+
accelerate
|
| 11 |
+
whitespace_correction
|