Spaces:

lllouo
/

BD_framework_test

Running

App Files Files Community

lllouo committed on Jan 20

Commit

b96d100

1 Parent(s): cc7eba8

app.py

Browse files

Files changed (1) hide show

app.py +392 -450

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-# app.py - 使用 requests 调用 DeepSeek API（稳定版本）
 import gradio as gr
 import json
 import pandas as pd
@@ -11,62 +11,158 @@ import re
 import spacy
 from spellchecker import SpellChecker
-# DeepSeek API配置
 DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
 DEEPSEEK_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
-# 在全局初始化 spaCy 和 SpellChecker（放在文件开头，DEEPSEEK_API_KEY 定义之后）
 try:
     nlp = spacy.load("en_core_web_sm")
 except OSError:
-    # 如果模型未安装，自动下载
     import subprocess
     subprocess.run(["python", "-m", "spacy", "download", "en_core_web_sm"])
     nlp = spacy.load("en_core_web_sm")
 spell = SpellChecker()
-# 空白符异常检测的正则模式
 WHITESPACE_PATTERNS = [
-    re.compile(r'[ \t]{2,}'),                 # 连续空格或 tab
-    re.compile(r'\u200B|\u2060'),              # zero-width space / word joiner
-    re.compile(r'\s+([.,!?;:])'),              # 标点前多余空格
-    re.compile(r'([.,!?;:])\s{2,}'),           # 标点后异常空格
 ]
-def check_api_key():
-    """检查API Key是否配置"""
-    if not DEEPSEEK_API_KEY:
-        raise ValueError("⚠️ 请在 Space Settings 中配置 DEEPSEEK_API_KEY！\n\n前往：Settings → Repository secrets → New secret")
-def call_deepseek_api(prompt, model="deepseek-r1-distill-llama-8b", temperature=0.1, stream=True):
-    """使用 OpenAI 客户端调用 DeepSeek API"""
-    check_api_key()
-    client = OpenAI(
-        api_key=DEEPSEEK_API_KEY,
-        base_url=DEEPSEEK_BASE_URL,
-    )
-    completion = client.chat.completions.create(
-        model="deepseek-r1-distill-llama-8b",
-        messages=[{"role": "user", "content": prompt}],
-        temperature=temperature,
-        stream=stream
-    )
-    if stream:
-        # 流式响应处理
-        response_content = ""
-        for chunk in completion:
-            if chunk.choices and chunk.choices[0].delta.content:
-                response_content += chunk.choices[0].delta.content
-        return response_content
-    else:
-        # 非流式响应
-        return completion.choices[0].message.content
-# 系统Prompt模板
 PROMPT_TEMPLATE = """## Positioning
 You are a **LANGUAGE grammatical error correction tool** that can identify and correct grammatical errors in a text.
 Reply with a corrected version of the input sentence with all **grammatical**, **spelling** and **whitespace errors** fixed, making only necessary changes.
@@ -87,10 +183,6 @@ Example 2: No errors, reply with a copy of the original sentence, don't fill in
 [input]: _______ such as bitcoin are becoming increasingly mainstream and have a whole host of associated ethical implications, for example, they are______ and more ______. However, they have also been used to engage in _______.
 [output]: _______ such as bitcoin are becoming increasingly mainstream and have a whole host of associated ethical implications, for example, they are______ and more ______. However, they have also been used to engage in _______.
-Example 3: No errors, reply with a copy of the original sentence, don't fill in the contents of ___.
-[input]: The Sun is the largest body in the solar system. The Sun is a ___.
-[output]: The Sun is the largest body in the solar system. The Sun is a ___.
 ## Task
 Next, please correct the following sentence according to the above requirements.
 **If there are no errors, reply with a copy of the original sentence. Don't fill in the contents of ___.**
@@ -98,21 +190,35 @@ Next, please correct the following sentence according to the above requirements.
 [input]: """
 def process_sentence(sentence):
-    """检查问题是否完整，不完整则添加标记"""
     sentence = sentence.strip()
-    # 判断是否为多行文本
     lines = [line.strip() for line in sentence.split('\n') if line.strip()]
     is_multiline = len(lines) > 1
-    # 根据是否多行选择处理逻辑
-    if is_multiline:
-        target_line = lines[-1]
-    else:
-        target_line = sentence
-    # 检查最后一个字符是否是标点符号
     last_char = target_line[-1] if target_line else ''
     if last_char in {'.', '?', '!', ';', ','}:
         return target_line
@@ -120,27 +226,15 @@ def process_sentence(sentence):
         return target_line + " ___."
 def is_valid_output(content_2, content_1, content_0):
-    """检查输出格式是否符合要求"""
-    # 检查基本格式
     if not (content_2.startswith('[output]:') and '\n' not in content_2):
         return False
-    # 原始句子有下划线，但生成的句子没有下划线 => 返回False
     if ('___' in content_0 or '___' in content_1) and '___' not in content_2:
         return False
-    # content_2 的字符数不能超过 content_1 的两倍
-    if len(content_2) > 2 * len(content_1):
-        return False
-    # content_1 的字符数不能超过 content_2 的两倍
-    if len(content_1) > 2 * len(content_2):
         return False
     return True
 def extract_output_content(item):
-    """提取输出内容"""
     if item.startswith('[output]:'):
         output_content = item[len('[output]:'):].strip()
         if output_content and output_content[0] == '"' and output_content[-1] == '"':
@@ -155,102 +249,114 @@ def extract_output_content(item):
         return None
 def has_missing_spaces(sentence):
-    """
-    启发式检测：长度足够 + 多个词形变化 + 无空格
-    """
     if ' ' in sentence:
         return False
     doc = nlp(sentence)
-    # 多个 alpha token 且原文无空格
     alpha_tokens = [t for t in doc if t.is_alpha]
     return len(alpha_tokens) >= 2
 def calculate_whitespace_anomaly_rate(sentences):
-    """
-    计算空白符异常率（WAR）
-    """
     if not sentences:
         return 0.0
     anomaly_count = 0
     for sent in sentences:
-        # 检测缺少空格
         if has_missing_spaces(sent):
             anomaly_count += 1
             continue
-        # 检测其他空白异常
         if any(p.search(sent) for p in WHITESPACE_PATTERNS):
             anomaly_count += 1
     return anomaly_count / len(sentences) * 100
 def normalize_tokens(text):
-    """
-    标准化文本token，用于拼写检查
-    """
     doc = nlp(text)
     tokens = []
     for t in doc:
-        if not t.is_alpha:
-            continue
-        if len(t.text) <= 2:
-            continue
-        if t.text.isupper():
             continue
         tokens.append(t.text.lower())
     return tokens
 def calculate_spelling_error_density(sentences):
-    """
-    计算拼写错误密度（SED）
-    """
     total_words = 0
     total_errors = 0
     for sent in sentences:
-        # missing-space 单独处理：不计入拼写错误
         if has_missing_spaces(sent):
             continue
         tokens = normalize_tokens(sent)
         if not tokens:
             continue
         misspelled = spell.unknown(tokens)
         total_errors += len(misspelled)
         total_words += len(tokens)
     if total_words == 0:
         return 0.0
     return total_errors / total_words * 100
 def clean_dataset(file_path, question_column, model_choice, temperature, max_samples, progress=gr.Progress()):
-    """清洗数据集的核心函数（增强版：包含WAR和SED指标计算）"""
     try:
-        # 检查 API Key
         try:
             check_api_key()
         except ValueError as e:
             return str(e), None, None
-        # 读取 parquet 文件
         progress(0.05, desc="📁 读取数据文件...")
         df = pd.read_parquet(file_path)
-        # 检查列名是否存在
         if question_column not in df.columns:
             available_columns = ", ".join(df.columns.tolist())
             return f"❌ 列名 '{question_column}' 不存在！\n可用列名: {available_columns}", None, None
-        # 提取问题数据
         data_ori = df[question_column].tolist()[:int(max_samples)]
         total = len(data_ori)
-        # === 计算原始数据的 WAR 和 SED ===
         progress(0.08, desc="📊 计算原始指标...")
         original_sentences = [str(item) for item in data_ori]
         war_original = calculate_whitespace_anomaly_rate(original_sentences)
@@ -258,13 +364,9 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
         progress(0.1, desc=f"🚀 开始清洗 {total} 个样本...")
-        # 预处理：添加标记
         data_corrupt = [process_sentence(str(item)) for item in data_ori]
-        # 清洗结果
         results = []
         max_retries = 5
         log_text = f"🚀 开始处理 {total} 个样本...\n\n"
         for idx in range(total):
@@ -277,14 +379,12 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
             while retry_count < max_retries:
                 try:
-                    # 调用 DeepSeek API
                     response_content = call_deepseek_api(
                         PROMPT_TEMPLATE + original_text,
                         model=model_choice,
                         temperature=float(temperature)
                     )
-                    # 验证输出格式
                     if is_valid_output(response_content, original_text, unprocess_text):
                         results.append(response_content)
                         break
@@ -295,13 +395,11 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
                     retry_count += 1
                     log_text += f"⚠️ 样本 {idx+1} API错误，重试 {retry_count}/{max_retries}: {str(e)}\n"
             else:
-                # 重试次数用尽
                 results.append(f"[ERROR] Failed to process: {original_text}")
                 log_text += f"❌ 样本 {idx+1} 处理失败\n"
         progress(0.85, desc="📊 后处理中...")
-        # 提取清洗后的内容
         lst_extracted = []
         error_count = 0
         unknown_count = 0
@@ -316,7 +414,6 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
                 if item.startswith('[ERROR]'):
                     error_count += 1
-        # 恢复多行格式
         lst_final = []
         for i in range(len(data_ori)):
             item = str(data_ori[i])
@@ -327,32 +424,26 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
             else:
                 lst_final.append(lst_extracted[i])
-        # === 计算清洗后的 WAR 和 SED ===
         progress(0.90, desc="📊 计算清洗后指标...")
         cleaned_sentences = [str(item) for item in lst_final]
         war_cleaned = calculate_whitespace_anomaly_rate(cleaned_sentences)
         sed_cleaned = calculate_spelling_error_density(cleaned_sentences)
-        # 计算变化
         delta_war = war_cleaned - war_original
         delta_sed = sed_cleaned - sed_original
         progress(0.95, desc="💾 保存结果...")
-        # 创建新的DataFrame
         df_cleaned = df.copy()
         df_cleaned[question_column + '_cleaned'] = lst_final[:len(df)]
-        # 生成输出文件名
         original_filename = os.path.basename(file_path)
         base_name = original_filename.replace('.parquet', '')
         output_filename = f"{base_name}-Denoising.parquet"
         output_path = os.path.join(tempfile.gettempdir(), output_filename)
-        # 保存为 parquet
         df_cleaned.to_parquet(output_path, index=False)
-        # 生成统计信息（增强版）
         log_text += f"\n\n📊 处理完成！\n"
         log_text += f"{'='*50}\n"
         log_text += f"【基础统计】\n"
@@ -372,7 +463,6 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
         log_text += f"   变化: {delta_sed:+.2f}% {'✅ 改善' if delta_sed < 0 else '⚠️ 增加'}\n"
         log_text += f"{'='*50}\n"
-        # 生成预览数据
         preview_df = pd.DataFrame({
             '原始问题': [str(x)[:100] for x in data_ori[:5]],
             '清洗后问题': [str(x)[:100] for x in lst_final[:5]]
@@ -387,200 +477,182 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
         error_detail = traceback.format_exc()
         return f"❌ 处理出错: {str(e)}\n\n详细错误:\n{error_detail}", None, None
-def show_leaderboard():
-    """显示Leaderboard，从 JSON 文件加载并格式化为HTML"""
-    json_path = "leaderboard.json"
-    try:
-        with open(json_path, 'r', encoding='utf-8') as f:
-            data = json.load(f)
-        # 构建HTML表格
-        html = """
-        <style>
-            .leaderboard-table {
-                width: 100%;
-                border-collapse: collapse;
-                margin: 20px 0;
-                font-size: 14px;
-                box-shadow: 0 2px 8px rgba(0,0,0,0.1);
-                border-radius: 8px;
-                overflow: hidden;
-            }
-            .leaderboard-table thead tr {
-                background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-                color: white;
-                text-align: left;
-                font-weight: bold;
-            }
-            .leaderboard-table th,
-            .leaderboard-table td {
-                padding: 12px 15px;
-                text-align: center;
-            }
-            .leaderboard-table tbody tr {
-                border-bottom: 1px solid #dddddd;
-                transition: all 0.2s ease;
-            }
-            .leaderboard-table tbody tr:nth-of-type(even) {
-                background-color: #f9fafb;
-            }
-            .leaderboard-table tbody tr:hover {
-                background-color: #e8eaf6;
-                transform: scale(1.01);
-                box-shadow: 0 2px 4px rgba(0,0,0,0.1);
-            }
-            .metric-positive {
-                color: #10b981;
-                font-weight: bold;
-            }
-            .metric-negative {
-                color: #ef4444;
-                font-weight: bold;
-            }
-            .download-link {
-                color: #667eea;
-                text-decoration: none;
-                font-weight: 500;
-                padding: 4px 12px;
-                border-radius: 4px;
-                border: 1px solid #667eea;
-                transition: all 0.2s;
-                display: inline-block;
-            }
-            .download-link:hover {
-                background-color: #667eea;
-                color: white;
-                transform: translateY(-1px);
-            }
-            .benchmark-name {
-                font-weight: 500;
-                color: #1f2937;
-            }
-        </style>
-        <table class="leaderboard-table">
-            <thead>
-                <tr>
-                    <th>ID</th>
-                    <th>Benchmark</th>
-                    <th>ΔWAR (%)</th>
-                    <th>ΔSED</th>
-                    <th>下载</th>
-                </tr>
-            </thead>
-            <tbody>
-        """
-        for item in data:
-            # 提取下载链接
-            download_text = item['Download']
-            # 从 Markdown 格式中提取 URL
-            if '[下载](' in download_text:
-                url = download_text.split('(')[1].rstrip(')')
-                download_html = f'<a href="{url}" class="download-link" target="_blank">下载</a>'
-            else:
-                download_html = download_text
-            # 格式化 ΔWAR 和 ΔSED
-            war_value = item['ΔWAR']
-            sed_value = item['ΔSED']
-            war_class = 'metric-positive' if war_value > 0 else 'metric-negative'
-            sed_class = 'metric-positive' if sed_value < 0 else 'metric-negative'
-            html += f"""
-                <tr>
-                    <td>{item['ID']}</td>
-                    <td class="benchmark-name">{item['Benchmark']}</td>
-                    <td class="{war_class}">{war_value:+.2f}</td>
-                    <td class="{sed_class}">{sed_value:+.2f}</td>
-                    <td>{download_html}</td>
-                </tr>
-            """
-        html += """
-            </tbody>
-        </table>
-        """
-        return html
-    except FileNotFoundError:
-        return "<p style='color: red;'>❌ leaderboard.json 文件未找到</p>"
-    except json.JSONDecodeError:
-        return "<p style='color: red;'>❌ JSON 格式无效</p>"
-# 创建 Gradio 界面
-with gr.Blocks(title="数据集清洗框架展示系统", css="""
-    .gradio-container {
-        max-width: 1400px !important;
-    }
-    .stats-box {
-        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
-        color: white;
-        padding: 20px;
-        border-radius: 10px;
-        text-align: center;
-        margin: 10px 0;
-    }
-    .stats-box h3 {
-        margin: 0;
-        font-size: 24px;
-    }
-    .stats-box p {
-        margin: 5px 0;
-        font-size: 14px;
-        opacity: 0.9;
-    }
-""") as demo:
-    gr.Markdown("""
-    # 🚀 基于基准去噪框架的去噪工厂展示系统
-    """)
-    with gr.Tabs():
-        # Tab 1: Leaderboard
-        with gr.Tab("📊 Leaderboard"):
-            gr.Markdown("""
-            ## 清洗效果排行榜
-            展示主流benchmark数据集的去噪结果（按数据集排序）
-            """)
-            with gr.Row():
-                with gr.Column(scale=2):
-                    gr.HTML("""
-                        <div class="stats-box">
-                            <h3>📈 关键指标</h3>
-                            <p><strong>数据集总数:</strong> 14个主流Benchmark</p>
-                            <p><strong>去噪方法:</strong> 2种</p>
-                            <p><strong>总配置:</strong> 28种</p>
-                        </div>
-                    """)
-                    gr.Markdown("""
-                    ### 📝 指标说明
-                    - **ΔWAR**: Word Accuracy Rate变化 (↑越高越好)
-                    - **ΔSED**: Sentence Edit Distance变化 (↓越低越好)
-                    - **绿色**: 正向提升
-                    - **红色**: 负向影响
-                    """)
-                with gr.Column(scale=5):
-                    leaderboard_html = gr.HTML(
-                        value=show_leaderboard(),
-                        label="数据集去噪结果对比"
                     )
-        # Tab 2: 数据集上传与清洗 (DeepSeek-R1)
-        with gr.Tab("🔧 数据集去噪 (DeepSeek-R1-denoising)"):
-            gr.Markdown("""
-            ## 上传 Parquet 数据集进行去噪
-            **支持的数据集格式**:
-            - MMLU / GSM8K / ARC-Challenge / MedMCQA 等
-            - 文件格式: `.parquet`
-            - 清洗后文件命名: `原文件名-Denoising.parquet`
-            """)
             with gr.Row():
                 with gr.Column():
@@ -617,7 +689,7 @@ with gr.Blocks(title="数据集清洗框架展示系统", css="""
                         label="📊 处理样本数 (Demo限制)"
                     )
-                    clean_btn = gr.Button("🚀 开始清洗", variant="primary", size="lg")
                 with gr.Column():
                     output_text = gr.Textbox(
@@ -638,150 +710,20 @@ with gr.Blocks(title="数据集清洗框架展示系统", css="""
                 inputs=[file_input, question_column, model_choice, temperature, max_samples],
                 outputs=[output_text, download_file, preview_df]
             )
-        # Tab 3: 数据集上传与清洗 (WAC-GEC)
-        with gr.Tab("🔧 数据集去噪 (WAC-GEC)"):
-            gr.Markdown("""
-            ## 上传 Parquet 数据集进行去噪
-            **支持的数据集格式**:
-            - MMLU / GSM8K / ARC-Challenge / MedMCQA 等
-            - 文件格式: `.parquet`
-            - 清洗后文件命名: `原文件名-Denoising.parquet`
-            """)
-            with gr.Row():
-                with gr.Column():
-                    file_input_wac = gr.File(
-                        label="📁 上传 Parquet 文件",
-                        file_types=[".parquet"]
-                    )
-                    question_column_wac = gr.Textbox(
-                        label="📝 问题列名",
-                        value="question",
-                        placeholder="例如: question, input_text, prompt"
-                    )
-                    model_choice_wsc = gr.Dropdown(
-                        choices=["eo_larger_byte", "ed_larger_byte"],
-                        value="eo_larger_byte",
-                        label="🤖 选择WSC模型"
-                    )
-                    model_choice_gec = gr.Dropdown(
-                        choices=["Chat-Llama-2-13B", "T5-11B", "GECToR-Roberta-L"],
-                        value="Chat-Llama-2-13B",
-                        label="🤖 选择GEC模型"
-                    )
-                    temperature_wac = gr.Slider(
-                        minimum=0.0,
-                        maximum=1.0,
-                        value=0.1,
-                        step=0.1,
-                        label="🌡️ Temperature"
-                    )
-                    max_samples_wac = gr.Slider(
-                        minimum=1,
-                        maximum=100,
-                        value=5,
-                        step=1,
-                        label="📊 处理样本数"
-                    )
-                    clean_btn_wac = gr.Button("🚀 开始去噪", variant="primary", size="lg")
-                with gr.Column():
-                    output_text_wac = gr.Textbox(
-                        label="⏳ 处理进度",
-                        lines=10,
-                        max_lines=15
-                    )
-                    preview_df_wac = gr.Dataframe(
-                        label="🔍 结果预览",
-                        wrap=True
-                    )
-                    download_file_wac = gr.File(label="📥 下载去噪后的数据集")
-            # Note: This would need a separate function for WAC-GEC processing
-            gr.Markdown("⚠️ WAC-GEC 功能需要额外实现对应的处理函数")
-        # Tab 4: 关于
-        with gr.Tab("ℹ️ 关于"):
             gr.Markdown("""
-            ## 清洗流程说明
-            ### 核心算法
-            1. **预处理 (process_sentence)**
-               - 检测句子完整性
-               - 为不完整的句子添加标记 `___`
-               - 保留多行文本格式
-            2. **LLM清洗**
-               - 使用 DeepSeek API 进行语法、拼写、空格错误修正
-               - 重试机制：最多重试5次
-               - 稳定的 REST API 调用
-            3. **格式验证 (is_valid_output)**
-               - 验证输出格式正确性
-               - 检查是否保留了 `___` 标记
-               - 长度合理性检查
-            4. **后处理**
-               - 提取清洗后的内容
-               - 恢复原始多行格式
-               - 生成 `XXX-Denoising.parquet` 文件
-            ### 支持的数据集
-            - **MMLU**: 57个学科的多选题
-            - **GSM8K**: 数学推理题
-            - **ARC-Challenge**: 科学问答
-            - **MedMCQA**: 医学选择题
-            - **CoQA**: 对话问答
-            - 以及更多...
-            ### 技术栈
-            - **LLM**: DeepSeek API (deepseek-r1-distill-llama-8b)
-            - **前端**: Gradio 4.16.0
-            - **数据处理**: Pandas + PyArrow (Parquet)
-            - **API调用**: OpenAI SDK
-            - **部署**: Hugging Face Spaces
-            ### 研究成果
-            本框架在多个主流benchmark上取得了显著的性能提升，
-            通过两种不同的去噪方法（DeepSeek-R1和WAC-GEC）实现数据质量优化。
-            ### 使用说明
-            1. **配置 API Key**: Settings → Repository secrets → `DEEPSEEK_API_KEY`
-            2. **上传数据集**: 选择 `.parquet` 文件
-            3. **指定列名**: 输入包含问题的列名（通常是 `question`）
-            4. **调整参数**: 选择模型、temperature等
-            5. **开始清洗**: 点击按钮开始处理
-            6. **下载结果**: 下载 `XXX-Denoising.parquet` 文件
-            ⚠️ **重要提示**:
-            - Demo版本限制最多处理100个样本
-            - 完整版本可处理数万样本
-            - 建议 temperature=0.1 以获得稳定输出
-            ---
-            **研究生毕业论文成果展示** | Powered by DeepSeek API
-            """)
-# 启动应用
 if __name__ == "__main__":
     demo.launch(
-        server_name="0.0.0.0",
         server_port=7860,
         ssr_mode=False
     )

+# app_refactored.py - 重构后的展示系统
 import gradio as gr
 import json
 import pandas as pd
 import spacy
 from spellchecker import SpellChecker
+# ======================== CSS样式 ========================
+custom_css = """
+.gradio-container {
+    max-width: 1400px !important;
+}
+.markdown-text {
+    font-size: 16px;
+    line-height: 1.6;
+}
+.markdown-text h1 {
+    text-align: center;
+    margin-bottom: 1em;
+}
+.tab-buttons button {
+    font-size: 18px;
+    font-weight: 600;
+    padding: 12px 24px;
+}
+#leaderboard-table {
+    margin-top: 20px;
+}
+#search-bar {
+    width: 100%;
+    font-size: 16px;
+}
+#filter-columns {
+    margin-top: 10px;
+}
+#column-select {
+    font-size: 14px;
+}
+.stats-box {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    color: white;
+    padding: 20px;
+    border-radius: 10px;
+    text-align: center;
+    margin: 10px 0;
+}
+.stats-box h3 {
+    margin: 0;
+    font-size: 24px;
+}
+.stats-box p {
+    margin: 5px 0;
+    font-size: 14px;
+    opacity: 0.9;
+}
+.leaderboard-table {
+    width: 100%;
+    border-collapse: collapse;
+    margin: 20px 0;
+    font-size: 14px;
+    box-shadow: 0 2px 8px rgba(0,0,0,0.1);
+    border-radius: 8px;
+    overflow: hidden;
+}
+.leaderboard-table thead tr {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    color: white;
+    text-align: left;
+    font-weight: bold;
+}
+.leaderboard-table th,
+.leaderboard-table td {
+    padding: 12px 15px;
+    text-align: center;
+}
+.leaderboard-table tbody tr {
+    border-bottom: 1px solid #dddddd;
+    transition: all 0.2s ease;
+}
+.leaderboard-table tbody tr:nth-of-type(even) {
+    background-color: #f9fafb;
+}
+.leaderboard-table tbody tr:hover {
+    background-color: #e8eaf6;
+    transform: scale(1.01);
+    box-shadow: 0 2px 4px rgba(0,0,0,0.1);
+}
+.metric-positive {
+    color: #10b981;
+    font-weight: bold;
+}
+.metric-negative {
+    color: #ef4444;
+    font-weight: bold;
+}
+.download-link {
+    color: #667eea;
+    text-decoration: none;
+    font-weight: 500;
+    padding: 4px 12px;
+    border-radius: 4px;
+    border: 1px solid #667eea;
+    transition: all 0.2s;
+    display: inline-block;
+}
+.download-link:hover {
+    background-color: #667eea;
+    color: white;
+    transform: translateY(-1px);
+}
+.benchmark-name {
+    font-weight: 500;
+    color: #1f2937;
+}
+"""
+# ======================== API配置 ========================
 # API key read from the DEEPSEEK_API_KEY environment variable; defaults to
 # an empty string when the variable is not set.
 DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
 # Base URL handed to the OpenAI-compatible client.
 # NOTE(review): despite the DEEPSEEK_ prefix this host is
 # dashscope.aliyuncs.com (compatible-mode endpoint) — confirm it is the
 # intended provider.
 DEEPSEEK_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
+# ======================== NLP工具初始化 ========================
 # Load the small English spaCy pipeline once at import time; spaCy raises
 # OSError when the model package is not installed.
 try:
     nlp = spacy.load("en_core_web_sm")
 except OSError:
     import subprocess
     import sys

     # Download with the interpreter that is running this module
     # (sys.executable) rather than whatever "python" resolves to on PATH,
     # so the model lands in the environment that will import it.
     subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"])
     nlp = spacy.load("en_core_web_sm")
 # Shared spell checker instance used by the spelling metric.
 spell = SpellChecker()
 # Regexes that each flag one kind of whitespace anomaly in a sentence.
 WHITESPACE_PATTERNS = [
+    re.compile(r'[ \t]{2,}'),  # run of two or more spaces/tabs
+    re.compile(r'\u200B|\u2060'),  # zero-width space or word joiner
+    re.compile(r'\s+([.,!?;:])'),  # whitespace before punctuation
+    re.compile(r'([.,!?;:])\s{2,}'),  # two or more whitespace chars after punctuation
 ]
+# ======================== Prompt模板 ========================
 PROMPT_TEMPLATE = """## Positioning
 You are a **LANGUAGE grammatical error correction tool** that can identify and correct grammatical errors in a text.
 Reply with a corrected version of the input sentence with all **grammatical**, **spelling** and **whitespace errors** fixed, making only necessary changes.
 [input]: _______ such as bitcoin are becoming increasingly mainstream and have a whole host of associated ethical implications, for example, they are______ and more ______. However, they have also been used to engage in _______.
 [output]: _______ such as bitcoin are becoming increasingly mainstream and have a whole host of associated ethical implications, for example, they are______ and more ______. However, they have also been used to engage in _______.
 ## Task
 Next, please correct the following sentence according to the above requirements.
 **If there are no errors, reply with a copy of the original sentence. Don't fill in the contents of ___.**
 [input]: """
+# ======================== 工具函数 ========================
def check_api_key():
    """Ensure the DeepSeek API key is configured.

    Raises:
        ValueError: if the module-level ``DEEPSEEK_API_KEY`` is empty.
    """
    if DEEPSEEK_API_KEY:
        return
    raise ValueError("⚠️ 请在 Space Settings 中配置 DEEPSEEK_API_KEY！")
def call_deepseek_api(prompt, model="deepseek-r1-distill-llama-8b", temperature=0.1, stream=True):
    """Send *prompt* as a single user message and return the reply text.

    Args:
        prompt: Text sent as the content of the only ``user`` message.
        model: Model identifier forwarded to the chat-completions call.
        temperature: Sampling temperature forwarded to the call.
        stream: When truthy, the reply is requested as a stream and the
            non-empty content deltas are concatenated; otherwise the content
            of the first choice's message is returned directly.

    Returns:
        The reply text produced by the API.

    Raises:
        ValueError: propagated from ``check_api_key`` when no key is set.
    """
    check_api_key()
    request = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "stream": stream,
    }
    completion = OpenAI(
        api_key=DEEPSEEK_API_KEY,
        base_url=DEEPSEEK_BASE_URL,
    ).chat.completions.create(**request)
    if not stream:
        return completion.choices[0].message.content
    # Chunks with no choices or with an empty delta are skipped.
    return "".join(
        chunk.choices[0].delta.content
        for chunk in completion
        if chunk.choices and chunk.choices[0].delta.content
    )
 def process_sentence(sentence):
     sentence = sentence.strip()
     lines = [line.strip() for line in sentence.split('\n') if line.strip()]
     is_multiline = len(lines) > 1
+    target_line = lines[-1] if is_multiline else sentence
     last_char = target_line[-1] if target_line else ''
     if last_char in {'.', '?', '!', ';', ','}:
         return target_line
         return target_line + " ___."
 def is_valid_output(content_2, content_1, content_0):
     """Return True when a model reply passes the format checks.

     At the visible call site the arguments are, in order, the API response,
     the text that was sent to the model, and ``unprocess_text``
     (presumably the sentence before preprocessing — TODO confirm).
     """
     # The reply must start with the '[output]:' tag and be a single line.
     if not (content_2.startswith('[output]:') and '\n' not in content_2):
         return False
     # A '___' placeholder present in either input must survive in the reply.
     if ('___' in content_0 or '___' in content_1) and '___' not in content_2:
         return False
     # Reject replies whose length differs from content_1 by more than 2x
     # in either direction.
+    if len(content_2) > 2 * len(content_1) or len(content_1) > 2 * len(content_2):
         return False
     return True
 def extract_output_content(item):
     if item.startswith('[output]:'):
         output_content = item[len('[output]:'):].strip()
         if output_content and output_content[0] == '"' and output_content[-1] == '"':
         return None
 def has_missing_spaces(sentence):
     """Heuristically flag a sentence whose words appear to be run together.

     A sentence containing any space character is never flagged. Otherwise
     it is flagged when the spaCy pipeline ``nlp`` yields at least two
     alphabetic tokens for it.
     """
     if ' ' in sentence:
         return False
     alpha_count = sum(1 for token in nlp(sentence) if token.is_alpha)
     return alpha_count >= 2
 def calculate_whitespace_anomaly_rate(sentences):
     """Compute the whitespace anomaly rate (WAR) as a percentage.

     A sentence counts once as anomalous when ``has_missing_spaces`` flags
     it or when any regex in ``WHITESPACE_PATTERNS`` matches it. Returns 0.0
     for an empty input.
     """
     if not sentences:
         return 0.0
     flagged = sum(
         1
         for text in sentences
         if has_missing_spaces(text)
         or any(pattern.search(text) for pattern in WHITESPACE_PATTERNS)
     )
     return flagged / len(sentences) * 100
 def normalize_tokens(text):
     """Return the lower-cased tokens of *text* that are kept for spell checking.

     Tokens come from the module-level spaCy pipeline ``nlp``.
     """
     doc = nlp(text)
     tokens = []
     for t in doc:
         # Skip non-alphabetic tokens, tokens of two characters or fewer, and
         # all-uppercase tokens (presumably acronyms — TODO confirm intent).
+        if not t.is_alpha or len(t.text) <= 2 or t.text.isupper():
             continue
         tokens.append(t.text.lower())
     return tokens
 def calculate_spelling_error_density(sentences):
     """Compute the spelling error density (SED) as a percentage.

     Sentences flagged by ``has_missing_spaces`` are excluded. For every
     other sentence the tokens from ``normalize_tokens`` are counted, and
     the size of ``spell.unknown(tokens)`` is added to the error total.
     Returns 0.0 when no tokens were counted.
     """
     word_total = 0
     error_total = 0
     for text in sentences:
         if has_missing_spaces(text):
             continue
         words = normalize_tokens(text)
         if words:
             error_total += len(spell.unknown(words))
             word_total += len(words)
     return error_total / word_total * 100 if word_total else 0.0
+# ======================== Leaderboard数据处理 ========================
def _classify_benchmark(benchmark_name):
    """Map a benchmark name to its type code ('A'-'E') by keyword match."""
    lowered = benchmark_name.lower()
    # Order matters: the first rule with a matching keyword wins.
    rules = (
        ('A', ('mmlu', 'arc')),
        ('B', ('gsm', 'math')),
        ('C', ('med', 'bio')),
        ('D', ('code', 'human')),
    )
    for type_code, keywords in rules:
        if any(keyword in lowered for keyword in keywords):
            return type_code
    return 'E'


def load_leaderboard_data(json_path="leaderboard.json"):
    """Load leaderboard rows from a JSON file and tag each with a 'Type'.

    Args:
        json_path: Path of the JSON file to read. Defaults to
            ``"leaderboard.json"``, the path that was previously hard-coded.

    Returns:
        A DataFrame built from the JSON rows, each extended with a 'Type'
        value derived from its 'Benchmark' name. An empty DataFrame is
        returned (and the error printed) when loading or tagging fails.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for item in data:
            item['Type'] = _classify_benchmark(item['Benchmark'])
        return pd.DataFrame(data)
    except Exception as e:
        # Deliberate best-effort: any failure is reported and replaced by an
        # empty frame instead of propagating to the caller.
        print(f"Error loading leaderboard: {e}")
        return pd.DataFrame()
def make_clickable_download(download_text):
    """Convert a Markdown ``[下载](url)`` link into an HTML anchor.

    The URL is taken from directly after the ``[下载](`` marker up to its
    matching closing parenthesis, so parentheses elsewhere in the text or
    balanced parentheses inside the URL do not corrupt it. If the link is
    never closed, the remainder of the text is used as the URL. The URL is
    escaped before being placed in the ``href`` attribute.

    Args:
        download_text: Cell value that may contain the Markdown link.

    Returns:
        The HTML anchor when the marker is present; otherwise
        *download_text* unchanged (including non-string values).
    """
    import html  # function-scope import so this block is self-contained

    marker = '[下载]('
    if not isinstance(download_text, str):
        return download_text
    start = download_text.find(marker)
    if start == -1:
        return download_text

    rest = download_text[start + len(marker):]
    end = len(rest)
    depth = 0
    for pos, ch in enumerate(rest):
        if ch == '(':
            depth += 1
        elif ch == ')':
            if depth == 0:
                # This parenthesis closes the Markdown link itself.
                end = pos
                break
            depth -= 1

    url = html.escape(rest[:end], quote=True)
    return f'<a href="{url}" class="download-link" target="_blank">下载</a>'
def filter_leaderboard(df, query):
    """Filter leaderboard rows by their 'Type' value.

    Args:
        df: Leaderboard DataFrame.
        query: ``"all"`` to keep every row, otherwise the type code to keep.

    Returns:
        *df* itself for ``"all"``; otherwise the rows whose 'Type' equals
        *query*. When *df* has no 'Type' column (for example an empty
        DataFrame) an empty slice of *df* is returned instead of raising
        ``KeyError``.
    """
    if query == "all":
        return df
    if 'Type' not in df.columns:
        # No row can match a type that was never assigned.
        return df.iloc[0:0]
    return df[df['Type'] == query]
def search_leaderboard(df, query):
    """Return the rows whose 'Benchmark' contains *query*, ignoring case.

    The query is matched as literal text, not as a regular expression, so
    input such as ``c++`` or ``(`` cannot raise a pattern error.

    Args:
        df: Leaderboard DataFrame.
        query: Search text; an empty or ``None`` query keeps every row.

    Returns:
        *df* itself when *query* is empty; otherwise the matching rows.
        Rows with a missing 'Benchmark' never match. When *df* has no
        'Benchmark' column an empty slice of *df* is returned.
    """
    if not query:
        return df
    if 'Benchmark' not in df.columns:
        return df.iloc[0:0]
    matches = df['Benchmark'].str.contains(query, case=False, na=False, regex=False)
    return df[matches]
+# ======================== 数据清洗函数 ========================
 def clean_dataset(file_path, question_column, model_choice, temperature, max_samples, progress=gr.Progress()):
     try:
         try:
             check_api_key()
         except ValueError as e:
             return str(e), None, None
         progress(0.05, desc="📁 读取数据文件...")
         df = pd.read_parquet(file_path)
         if question_column not in df.columns:
             available_columns = ", ".join(df.columns.tolist())
             return f"❌ 列名 '{question_column}' 不存在！\n可用列名: {available_columns}", None, None
         data_ori = df[question_column].tolist()[:int(max_samples)]
         total = len(data_ori)
         progress(0.08, desc="📊 计算原始指标...")
         original_sentences = [str(item) for item in data_ori]
         war_original = calculate_whitespace_anomaly_rate(original_sentences)
         progress(0.1, desc=f"🚀 开始清洗 {total} 个样本...")
         data_corrupt = [process_sentence(str(item)) for item in data_ori]
         results = []
         max_retries = 5
         log_text = f"🚀 开始处理 {total} 个样本...\n\n"
         for idx in range(total):
             while retry_count < max_retries:
                 try:
                     response_content = call_deepseek_api(
                         PROMPT_TEMPLATE + original_text,
                         model=model_choice,
                         temperature=float(temperature)
                     )
                     if is_valid_output(response_content, original_text, unprocess_text):
                         results.append(response_content)
                         break
                     retry_count += 1
                     log_text += f"⚠️ 样本 {idx+1} API错误，重试 {retry_count}/{max_retries}: {str(e)}\n"
             else:
                 results.append(f"[ERROR] Failed to process: {original_text}")
                 log_text += f"❌ 样本 {idx+1} 处理失败\n"
         progress(0.85, desc="📊 后处理中...")
         lst_extracted = []
         error_count = 0
         unknown_count = 0
                 if item.startswith('[ERROR]'):
                     error_count += 1
         lst_final = []
         for i in range(len(data_ori)):
             item = str(data_ori[i])
             else:
                 lst_final.append(lst_extracted[i])
         progress(0.90, desc="📊 计算清洗后指标...")
         cleaned_sentences = [str(item) for item in lst_final]
         war_cleaned = calculate_whitespace_anomaly_rate(cleaned_sentences)
         sed_cleaned = calculate_spelling_error_density(cleaned_sentences)
         delta_war = war_cleaned - war_original
         delta_sed = sed_cleaned - sed_original
         progress(0.95, desc="💾 保存结果...")
         df_cleaned = df.copy()
         df_cleaned[question_column + '_cleaned'] = lst_final[:len(df)]
         original_filename = os.path.basename(file_path)
         base_name = original_filename.replace('.parquet', '')
         output_filename = f"{base_name}-Denoising.parquet"
         output_path = os.path.join(tempfile.gettempdir(), output_filename)
         df_cleaned.to_parquet(output_path, index=False)
         log_text += f"\n\n📊 处理完成！\n"
         log_text += f"{'='*50}\n"
         log_text += f"【基础统计】\n"
         log_text += f"   变化: {delta_sed:+.2f}% {'✅ 改善' if delta_sed < 0 else '⚠️ 增加'}\n"
         log_text += f"{'='*50}\n"
         preview_df = pd.DataFrame({
             '原始问题': [str(x)[:100] for x in data_ori[:5]],
             '清洗后问题': [str(x)[:100] for x in lst_final[:5]]
         error_detail = traceback.format_exc()
         return f"❌ 处理出错: {str(e)}\n\n详细错误:\n{error_detail}", None, None
+# ======================== 文本内容 ========================
+ABOUT_TEXT = """
+## 清洗流程说明
+### 核心算法
+1. **预处理 (process_sentence)**
+   - 检测句子完整性
+   - 为不完整的句子添加标记 `___`
+   - 保留多行文本格式
+2. **LLM清洗**
+   - 使用 DeepSeek API 进行语法、拼写、空格错误修正
+   - 重试机制：最多重试5次
+   - 稳定的 REST API 调用
+3. **格式验证 (is_valid_output)**
+   - 验证输出格式正确性
+   - 检查是否保留了 `___` 标记
+   - 长度合理性检查
+4. **后处理**
+   - 提取清洗后的内容
+   - 恢复原始多行格式
+   - 生成 `XXX-Denoising.parquet` 文件
+### 支持的数据集
+- **MMLU**: 57个学科的多选题
+- **GSM8K**: 数学推理题
+- **ARC-Challenge**: 科学问答
+- **MedMCQA**: 医学选择题
+- **CoQA**: 对话问答
+- 以及更多...
+### 技术栈
+- **LLM**: DeepSeek API (deepseek-r1-distill-llama-8b)
+- **前端**: Gradio 4.16.0
+- **数据处理**: Pandas + PyArrow (Parquet)
+- **API调用**: OpenAI SDK
+- **部署**: Hugging Face Spaces
+### 质量指标
+- **WAR (Whitespace Anomaly Rate)**: 空白符异常率
+- **SED (Spelling Error Density)**: 拼写错误密度
+### 使用说明
+1. **配置 API Key**: Settings → Repository secrets → `DEEPSEEK_API_KEY`
+2. **上传数据集**: 选择 `.parquet` 文件
+3. **指定列名**: 输入包含问题的列名（通常是 `question`）
+4. **调整参数**: 选择模型、temperature等
+5. **开始清洗**: 点击按钮开始处理
+6. **下载结果**: 下载 `XXX-Denoising.parquet` 文件
+⚠️ **重要提示**:
+- Demo版本限制最多处理100个样本
+- 完整版本可处理数万样本
+- 建议 temperature=0.1 以获得稳定输出
+---
+**研究生毕业论文成果展示** | Powered by DeepSeek API
+"""
+SUBMISSION_TEXT = """
+## 提交说明
+### 如何提交新的去噪结果
+1. **准备数据**: 使用本系统对benchmark数据集进行去噪
+2. **记录指标**: 记录ΔWAR和ΔSED指标
+3. **提交PR**: 在GitHub上提交Pull Request
+4. **审核**: 等待维护者审核
+### 数据格式要求
+提交的数据需要包含以下字段:
+- Benchmark名称
+- 去噪方法
+- ΔWAR (%)
+- ΔSED
+- 下载链接
+### 联系方式
+如有问题,请通过以下方式联系:
+- GitHub Issues
+- Email: your-email@example.com
+"""
+# ======================== Gradio界面 ========================
+demo = gr.Blocks(title="数据集清洗框架展示系统", css=custom_css)
+with demo:
+    gr.Markdown(
+        """<div style="text-align: center;"><h1>⭐ 基于基准去噪框架的 <span style='color: #e6b800;'>去噪工厂</span> 展示系统</h1></div>
+        <br>
+        <p>本系统展示了基于DeepSeek-R1和WAC-GEC两种方法对主流benchmark数据集的去噪效果。通过WAR(空白符异常率)和SED(拼写错误密度)两个指标评估去噪质量。</p>
+        """,
+        elem_classes="markdown-text"
+    )
+    # 加载leaderboard数据
+    leaderboard_data = load_leaderboard_data()
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:
+        # ==================== Tab 1: Evaluation Table ====================
+        with gr.TabItem("📊 Evaluation Table", id=0):
+            with gr.Column():
+                gr.Markdown("### 清洗效果排行榜")
+                with gr.Row():
+                    search_bar = gr.Textbox(
+                        placeholder="🔍 搜索Benchmark名称并按ENTER...",
+                        show_label=False,
+                        elem_id="search-bar",
                     )
+                    filter_types = gr.Radio(
+                        label="⏚ 筛选Benchmark类型",
+                        choices=["all", "A", "B", "C", "D", "E"],
+                        value="all",
+                        elem_id="filter-columns",
+                    )
+                leaderboard_table = gr.Dataframe(
+                    value=leaderboard_data[['ID', 'Benchmark', 'ΔWAR', 'ΔSED', 'Download']],
+                    headers=['ID', 'Benchmark', 'ΔWAR (%)', 'ΔSED', '下载'],
+                    datatype=['number', 'str', 'number', 'number', 'markdown'],
+                    elem_id="leaderboard-table",
+                    interactive=False,
+                )
+                hidden_leaderboard = gr.Dataframe(
+                    value=leaderboard_data,
+                    visible=False
+                )
+                # 绑定搜索和筛选
+                search_bar.submit(
+                    lambda df, query: search_leaderboard(df, query)[['ID', 'Benchmark', 'ΔWAR', 'ΔSED', 'Download']],
+                    [hidden_leaderboard, search_bar],
+                    leaderboard_table
+                )
+                filter_types.change(
+                    lambda df, query: filter_leaderboard(df, query)[['ID', 'Benchmark', 'ΔWAR', 'ΔSED', 'Download']],
+                    [hidden_leaderboard, filter_types],
+                    leaderboard_table
+                )
+                gr.Markdown("""
+                **说明:**
+                - ΔWAR: 空白符异常率变化 (正值表示改善)
+                - ΔSED: 拼写错误密度变化 (负值表示改善)
+                - 绿色: 正向提升 | 红色: 负向影响
+                - 类型分类: A=知识问答, B=数学推理, C=医学领域, D=代码生成, E=其他
+                """, elem_classes="markdown-text")
+        # ==================== Tab 2: Performance Plot ====================
+        with gr.TabItem("📈 Performance Plot", id=1):
+            gr.Markdown("### 性能可视化分析")
+            gr.Markdown("**注意**: 性能图表功能开发中,敬请期待。")
+            # 这里可以添加性能图表
+            # 例如: WAR和SED的对比图、不同方法的效果对比等
+        # ==================== Tab 3: About ====================
+        with gr.TabItem("📝 About", id=2):
+            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
+        # ==================== Tab 4: Submit Results ====================
+        with gr.TabItem("🚀 Submit Results", id=3):
+            gr.Markdown("## 提交去噪结果")
             with gr.Row():
                 with gr.Column():
                         label="📊 处理样本数 (Demo限制)"
                     )
+                    clean_btn = gr.Button("🚀 开始去噪", variant="primary", size="lg")
                 with gr.Column():
                     output_text = gr.Textbox(
                 inputs=[file_input, question_column, model_choice, temperature, max_samples],
                 outputs=[output_text, download_file, preview_df]
             )
             gr.Markdown("""
+            ### WAC-GEC方法 (开发中)
+            WAC-GEC (Whitespace Anomaly Correction - Grammar Error Correction) 方法结合了:
+            - 空白符异常检测与修正
+            - 语法错误检测与修正
+            该功能即将上线,敬请期待!
+            """, elem_classes="markdown-text")
 if __name__ == "__main__":
     demo.launch(
+        server_name="0.0.0.0",
         server_port=7860,
         ssr_mode=False
     )