Spaces:

lllouo
/

BD_framework_test

Sleeping

App Files Files Community

lllouo committed on Jan 27

Commit

bb4c1e0

1 Parent(s): a7581b9

WAC-GEC

Browse files

Files changed (2) hide show

app.py +148 -28
requirements.txt +5 -2

app.py CHANGED Viewed

@@ -10,18 +10,25 @@ import re
 import spacy
 from spellchecker import SpellChecker
 import difflib
 # ======================== 新增：WAC-GEC导入 ========================
 try:
     from whitespace_correction import WhitespaceCorrector
     WAC_GEC_AVAILABLE = True
-    # 初始化WAC-GEC模型（使用CPU，HF Space通常没有GPU）
-    wac_corrector = None  # 延迟初始化
 except ImportError:
     WAC_GEC_AVAILABLE = False
     wac_corrector = None
     print("⚠️ whitespace_correction未安装，WAC-GEC功能将不可用")
 # ======================== API配置 ========================
 DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
 DEEPSEEK_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
@@ -71,38 +78,126 @@ Next, please correct the following sentence according to the above requirements.
 [input]: """
-# ======================== 新增：WAC-GEC初始化函数 ========================
 def initialize_wac_gec():
-    """延迟初始化WAC-GEC模型"""
-    global wac_corrector
     if not WAC_GEC_AVAILABLE:
         return False
     if wac_corrector is None:
         try:
-            device = "cpu"  # HF Space默认使用CPU
             wac_corrector = WhitespaceCorrector.from_pretrained(
                 model="eo_larger_byte",
                 device=device,
-                download_dir="./models"
             )
-            print(f"✅ WAC-GEC模型已加载 (设备: {device})")
-            return True
         except Exception as e:
-            print(f"❌ WAC-GEC模型加载失败: {e}")
             return False
     return True
-# ======================== 新增：WAC-GEC处理函数 ========================
 def call_wac_gec(text):
-    """使用WAC-GEC纠正空白符错误"""
     if not initialize_wac_gec():
         raise ValueError("⚠️ WAC-GEC模型未安装或加载失败")
     try:
-        corrected = wac_corrector.correct_text(text)
         # 格式化输出以匹配DeepSeek的格式
-        return f"[output]: {corrected}"
     except Exception as e:
         raise Exception(f"WAC-GEC处理错误: {str(e)}")
@@ -483,7 +578,7 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
         log_text += f"   变化: {delta_sed:+.2f}% {'✅ 改善' if delta_sed < 0 else '⚠️ 增加'}\n"
         if model_choice == "WAC-GEC":
-            log_text += f"\n💡 注意: WAC-GEC仅修正空白符错误，不修正拼写和语法错误\n"
         log_text += f"{'='*50}\n"
@@ -510,11 +605,17 @@ ABOUT_TEXT = """
 - **优势**: 综合性强，能处理多种类型的错误
 - **配置**: 需要在Space Settings中配置DEEPSEEK_API_KEY
-#### 2. WAC-GEC (Whitespace Correction)
-- **功能**: 专注于空白符错误纠正（多余空格、缺失空格等）
-- **优势**: 轻量级，无需API密钥，处理速度快
-- **限制**: 仅修正空白符错误，不处理拼写和语法问题
-- **适用场景**: 数据集中主要存在空白符异常的情况
 ### 核心算法
@@ -525,7 +626,10 @@ ABOUT_TEXT = """
 2. **模型去噪**
    - **DeepSeek**: 使用API进行全面错误修正，重试机制最多5次
-   - **WAC-GEC**: 使用本地模型进行空白符纠正，重试机制最多3次
 3. **格式验证**
    - 验证输出格式正确性
@@ -555,7 +659,9 @@ ABOUT_TEXT = """
 ### 技术栈
 - **LLM**: DeepSeek API (deepseek-r1-distill-llama-8b)
-- **本地模型**: WAC-GEC (Whitespace Correction)
 - **前端**: Gradio 4.16.0
 - **数据处理**: Pandas + PyArrow (Parquet)
 - **差异对比**: Python difflib
@@ -570,14 +676,28 @@ ABOUT_TEXT = """
 ### 模型选择建议
-- **需要全面去噪**: 选择 DeepSeek-R1
-- **仅需修正空格**: 选择 WAC-GEC（更快，无需API）
-- **预算有限**: 优先使用 WAC-GEC
-- **追求最佳效果**: 使用 DeepSeek-R1
 ---
-**研究生毕业论文成果展示** | Powered by DeepSeek API & WAC-GEC
 """
 # ======================== Gradio界面 ========================
@@ -678,7 +798,7 @@ with demo:
                         choices=["deepseek-r1-distill-llama-8b", "WAC-GEC"],
                         value="deepseek-r1-distill-llama-8b",
                         label="🤖 选择模型",
-                        info="DeepSeek: 全面纠错 | WAC-GEC: 仅空白符纠正"
                     )
                     temperature = gr.Slider(

 import spacy
 from spellchecker import SpellChecker
 import difflib
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
 # ======================== 新增：WAC-GEC导入 ========================
 try:
     from whitespace_correction import WhitespaceCorrector
     WAC_GEC_AVAILABLE = True
+    # 初始化WAC-GEC模型（延迟加载）
+    wac_corrector = None
 except ImportError:
     WAC_GEC_AVAILABLE = False
     wac_corrector = None
     print("⚠️ whitespace_correction未安装，WAC-GEC功能将不可用")
+# 初始化GEC模型（延迟加载）
+gec_tokenizer = None
+gec_model = None
+GEC_MODEL_NAME = "lllouo/gec_Chat-LLaMa-2-7B-FT"  # 你的HF模型地址
 # ======================== API配置 ========================
 DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
 DEEPSEEK_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
 [input]: """
+# ======================== 新增：初始化函数（WAC + GEC） ========================
 def initialize_wac_gec():
+    """延迟初始化WAC-GEC模型（空白符纠正 + 语法纠错）"""
+    global wac_corrector, gec_tokenizer, gec_model
+    # 1. 初始化WAC（空白符纠正）
     if not WAC_GEC_AVAILABLE:
+        print("❌ WAC模块未安装")
         return False
     if wac_corrector is None:
         try:
+            device = "cuda" if torch.cuda.is_available() else "cpu"
             wac_corrector = WhitespaceCorrector.from_pretrained(
                 model="eo_larger_byte",
                 device=device,
+                download_dir="./models/wac"
             )
+            print(f"✅ WAC空白符纠正模型已加载 (设备: {device})")
         except Exception as e:
+            print(f"❌ WAC模型加载失败: {e}")
             return False
+    # 2. 初始化GEC（语法纠错）
+    if gec_model is None or gec_tokenizer is None:
+        try:
+            device = "cuda" if torch.cuda.is_available() else "cpu"
+            print(f"📥 正在从HuggingFace下载GEC模型: {GEC_MODEL_NAME}")
+            gec_tokenizer = AutoTokenizer.from_pretrained(
+                GEC_MODEL_NAME,
+                trust_remote_code=True
+            )
+            gec_model = AutoModelForCausalLM.from_pretrained(
+                GEC_MODEL_NAME,
+                device_map="auto" if device == "cuda" else None,
+                torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
+                trust_remote_code=True
+            )
+            # 如果是CPU模式，手动移动模型
+            if device == "cpu":
+                gec_model = gec_model.to(device)
+            # 设置tokenizer的pad_token和padding_side
+            gec_tokenizer.pad_token_id = gec_tokenizer.eos_token_id
+            gec_tokenizer.padding_side = "left"
+            print(f"✅ GEC语法纠错模型已加载 (设备: {device})")
+        except Exception as e:
+            print(f"❌ GEC模型加载失败: {e}")
+            return False
     return True
+# ======================== 新增：GEC语法纠错函数 ========================
def correct_sentence_gec(input_sentence):
    """Correct grammar in one sentence with the loaded GEC model.

    Args:
        input_sentence (str): The sentence to correct.

    Returns:
        str: The text the model generated after the ``Corrected:`` cue,
        with surrounding whitespace (and a repeated ``Corrected:`` prefix,
        if the model emitted one) removed.

    Raises:
        ValueError: If the GEC model or tokenizer has not been initialised.
    """
    if gec_model is None or gec_tokenizer is None:
        raise ValueError("GEC模型未初始化")

    # Build the instruction prompt.
    prompt = (
        "Rewrite the following sentence to correct grammatical errors. "
        "Return ONLY the corrected sentence.\n"
        f"Original: {input_sentence}\n"
        "Corrected:"
    )

    inputs = gec_tokenizer(prompt, return_tensors="pt").to(gec_model.device)
    prompt_len = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = gec_model.generate(
            **inputs,
            max_new_tokens=512,
            num_beams=4,
            do_sample=False,
            temperature=None,
            top_p=None,
        )

    # The returned sequence starts with the prompt tokens. Drop them by
    # token count instead of string-replacing the prompt: decoding the
    # prompt tokens is not guaranteed to reproduce the prompt text exactly,
    # in which case a string replace would leave the whole prompt in the
    # result (and it could also delete matching text inside the answer).
    generated_ids = outputs[0][prompt_len:]
    corrected_text = gec_tokenizer.decode(
        generated_ids, skip_special_tokens=True
    ).strip()

    # Strip the cue if the model repeated it.
    if corrected_text.startswith("Corrected:"):
        corrected_text = corrected_text[len("Corrected:"):].strip()
    return corrected_text
+# ======================== 新增：WAC-GEC组合处理函数 ========================
 def call_wac_gec(text):
+    """
+    使用WAC-GEC两步纠正：
+    1. GEC模型进行语法和拼写纠正
+    2. WAC模型进行空白符纠正
+    """
     if not initialize_wac_gec():
         raise ValueError("⚠️ WAC-GEC模型未安装或加载失败")
     try:
+        # Step 1: 使用GEC模型进行语法纠错
+        print(f"🔍 GEC处理: {text[:50]}...")
+        gec_corrected = correct_sentence_gec(text)
+        print(f"✅ GEC结果: {gec_corrected[:50]}...")
+        # Step 2: 使用WAC模型进行空白符纠正
+        print(f"🔍 WAC处理: {gec_corrected[:50]}...")
+        final_corrected = wac_corrector.correct_text(gec_corrected)
+        print(f"✅ WAC结果: {final_corrected[:50]}...")
         # 格式化输出以匹配DeepSeek的格式
+        return f"[output]: {final_corrected}"
     except Exception as e:
         raise Exception(f"WAC-GEC处理错误: {str(e)}")
         log_text += f"   变化: {delta_sed:+.2f}% {'✅ 改善' if delta_sed < 0 else '⚠️ 增加'}\n"
         if model_choice == "WAC-GEC":
+            log_text += f"\n💡 注意: WAC-GEC使用两步纠正（GEC语法纠错 + WAC空白符纠正）\n"
         log_text += f"{'='*50}\n"
 - **优势**: 综合性强，能处理多种类型的错误
 - **配置**: 需要在Space Settings中配置DEEPSEEK_API_KEY
+#### 2. WAC-GEC (Whitespace + Grammar Error Correction)
+- **功能**: 两步纠正流程
+  - **Step 1 (GEC)**: 使用LLaMA-2-7B微调模型进行语法和拼写纠错
+  - **Step 2 (WAC)**: 使用空白符纠正模型修正空格问题
+- **优势**:
+  - 完全本地化，无需API密钥
+  - 组合两个专门模型，各司其职
+  - 适合离线环境和预算有限的场景
+- **模型来源**:
+  - GEC: [lllouo/gec_Chat-LLaMa-2-7B-FT](https://huggingface.co/lllouo/gec_Chat-LLaMa-2-7B-FT)
+  - WAC: whitespace_correction库
 ### 核心算法
 2. **模型去噪**
    - **DeepSeek**: 使用API进行全面错误修正，重试机制最多5次
+   - **WAC-GEC**:
+     - 先使用GEC模型进行语法和拼写纠正
+     - 再使用WAC模型进行空白符纠正
+     - 重试机制最多3次
 3. **格式验证**
    - 验证输出格式正确性
 ### 技术栈
 - **LLM**: DeepSeek API (deepseek-r1-distill-llama-8b)
+- **本地模型**:
+  - GEC: LLaMA-2-7B (微调于语法纠错任务)
+  - WAC: Whitespace Correction Model
 - **前端**: Gradio 4.16.0
 - **数据处理**: Pandas + PyArrow (Parquet)
 - **差异对比**: Python difflib
 ### 模型选择建议
+- **需要全面去噪 + 有API预算**: 选择 DeepSeek-R1
+- **本地化部署 + 完整纠错**: 选择 WAC-GEC（推荐）
+- **仅需修正空格**: 单独使用WAC模块
+- **追求最快速度**: 使用GPU加速的WAC-GEC
+### WAC-GEC处理流程示例
+```
+原始文本: "This is anexample with spellingerorr."
+    ↓
+[Step 1: GEC语法纠错]
+    ↓
+中间结果: "This is an example with spelling error."
+    ↓
+[Step 2: WAC空白符纠正]
+    ↓
+最终结果: "This is an example with spelling error."
+```
 ---
+**研究生毕业论文成果展示** | Powered by DeepSeek API & WAC-GEC (LLaMA-2-7B + Whitespace Correction)
 """
 # ======================== Gradio界面 ========================
                         choices=["deepseek-r1-distill-llama-8b", "WAC-GEC"],
                         value="deepseek-r1-distill-llama-8b",
                         label="🤖 选择模型",
+                        info="DeepSeek: 全面纠错 | WAC-GEC: 语法+空白符纠正(本地模型)"
                     )
                     temperature = gr.Slider(

requirements.txt CHANGED Viewed

@@ -1,8 +1,11 @@
 gradio
-requests
 pandas
 pyarrow
 openai
 spacy
 pyspellchecker
-whitespace-correction

 gradio
 pandas
 pyarrow
+requests
 openai
 spacy
 pyspellchecker
+torch
+transformers
+accelerate
+whitespace_correction