Spaces:

lllouo
/

BD_framework_test

Sleeping

File size: 34,987 Bytes

5524e77
c18e35f
5524e77
 
66b1f5b
 
b335dbb
b00b7bf
698e9e5
 
 
7ee1568
bb4c1e0
 
e0e242c
c18e35f
28e23fd
0bd867c
 
 
28e23fd
bb4c1e0
0bd867c
 
 
28e23fd
0bd867c
28e23fd
bb4c1e0
 
28e23fd
bb4c1e0
28e23fd
b335dbb
b00b7bf
b335dbb
28e23fd
698e9e5
 
 
 
 
 
 
 
 
 
b96d100
 
 
 
698e9e5
 
28e23fd
66b1f5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c18e35f
66b1f5b
c18e35f
28e23fd
0bd867c
28e23fd
bb4c1e0
 
28e23fd
0bd867c
28e23fd
0bd867c
 
 
 
bb4c1e0
a855e3e
bc8fa15
 
e2221c7
a855e3e
28e23fd
0bd867c
28e23fd
0bd867c
bb4c1e0
28e23fd
bb4c1e0
 
 
 
28e23fd
bb4c1e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28e23fd
bb4c1e0
 
28e23fd
bb4c1e0
 
0bd867c
 
28e23fd
bb4c1e0
 
28e23fd
 
 
 
 
bb4c1e0
 
28e23fd
bb4c1e0
 
 
 
 
 
 
31cbe45
 
 
28e23fd
 
31cbe45
28e23fd
31cbe45
 
bb4c1e0
 
 
31cbe45
 
bb4c1e0
 
 
 
 
 
 
 
 
 
 
 
 
28e23fd
0bd867c
bb4c1e0
28e23fd
 
 
bb4c1e0
0bd867c
28e23fd
0bd867c
 
28e23fd
 
bb4c1e0
28e23fd
bb4c1e0
28e23fd
 
bb4c1e0
28e23fd
bb4c1e0
 
 
0bd867c
28e23fd
0bd867c
28e23fd
7ee1568
 
28e23fd
 
 
7ee1568
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28e23fd
7ee1568
 
22576e5
7ee1568
 
 
 
22576e5
 
7ee1568
 
22576e5
 
 
7ee1568
22576e5
 
7ee1568
 
22576e5
 
 
3ad03f2
7ee1568
 
22576e5
7ee1568
22576e5
 
7ee1568
 
 
 
 
 
28e23fd
 
7ee1568
 
 
 
 
 
 
 
 
 
945a4ac
 
7ee1568
 
 
 
 
 
 
 
 
 
 
28e23fd
0bd867c
28e23fd
0bd867c
28e23fd
b96d100
 
0bd867c
b96d100
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66b1f5b
 
 
 
b96d100
66b1f5b
 
 
 
 
c18e35f
66b1f5b
 
 
 
 
b96d100
66b1f5b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
698e9e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b96d100
698e9e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28e23fd
b96d100
 
 
 
 
e0e242c
 
 
 
28e23fd
e0e242c
b96d100
 
 
 
 
ba32277
 
28e23fd
ba32277
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b96d100
 
 
 
 
 
28e23fd
66b1f5b
c18e35f
542453f
0bd867c
542453f
0bd867c
 
 
 
28e23fd
542453f
28e23fd
66b1f5b
 
 
 
28e23fd
66b1f5b
 
 
c18e35f
28e23fd
698e9e5
 
 
 
28e23fd
0bd867c
 
 
 
 
c18e35f
5524e77
0bd867c
28e23fd
 
66b1f5b
 
28e23fd
66b1f5b
b335dbb
66b1f5b
 
 
 
 
c18e35f
0bd867c
 
 
 
 
 
 
 
c18e35f
0bd867c
 
 
 
 
 
66b1f5b
0bd867c
 
 
 
 
66b1f5b
 
 
28e23fd
66b1f5b
 
28e23fd
66b1f5b
28e23fd
66b1f5b
 
 
 
 
 
 
 
b335dbb
66b1f5b
 
 
 
 
 
 
 
b335dbb
0bd867c
66b1f5b
 
 
 
 
 
28e23fd
698e9e5
 
 
 
 
 
 
28e23fd
66b1f5b
 
 
 
 
 
0bd867c
 
66b1f5b
 
 
c18e35f
28e23fd
698e9e5
28e23fd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0bd867c
 
28e23fd
0bd867c
698e9e5
c18e35f
7ee1568
 
28e23fd
5524e77
15fc4e7
c18e35f
 
b335dbb
 
28e23fd
5524e77
28e23fd
b96d100
28e23fd
b96d100
28e23fd
0bd867c
 
28e23fd
 
 
0bd867c
bb4c1e0
28e23fd
 
 
 
 
 
 
 
bb4c1e0
28e23fd
0bd867c
28e23fd
b96d100
28e23fd
 
 
 
b96d100
28e23fd
 
bb4c1e0
28e23fd
 
 
b96d100
28e23fd
 
 
 
b96d100
28e23fd
 
 
 
b96d100
28e23fd
b96d100
28e23fd
 
 
 
 
 
b96d100
28e23fd
7ee1568
28e23fd
 
 
7ee1568
28e23fd
b96d100
 
28e23fd
 
bb4c1e0
28e23fd
 
 
 
 
 
b96d100
28e23fd
b96d100
28e23fd
 
b96d100
28e23fd
0bd867c
28e23fd
 
 
 
bb4c1e0
b96d100
 
28e23fd
b96d100
 
28e23fd
 
7ee1568
 
b96d100
 
 
28e23fd
b96d100
28e23fd
b96d100
 
 
5524e77
b96d100
5524e77
b96d100
7a91a9a
b96d100
28e23fd
070f625
b96d100
 
28e23fd
b96d100
 
070f625
27ccef7
28e23fd
27ccef7
b96d100
 
 
ba32277
28e23fd
ba32277
28e23fd
9ddd1ab
 
 
ba32277
 
 
 
b96d100
 
27ccef7
28e23fd
27ccef7
b96d100
 
 
 
 
 
 
 
 
 
27ccef7
b96d100
 
 
 
ba32277
 
 
 
27ccef7
ba32277
 
 
 
 
 
 
 
b96d100
 
 
 
28e23fd
 
 
 
 
b96d100
 
28e23fd
b96d100
28e23fd
 
5524e77
28e23fd
 
 
0bd867c
5524e77
cc7eba8
5524e77
28e23fd
cc7eba8
66b1f5b
 
 
28e23fd
66b1f5b
28e23fd
5524e77
 
 
e0e242c
 
28e23fd
 
5524e77
 
 
 
 
66b1f5b
5524e77
0bd867c
28e23fd
 
5524e77
 
 
 
66b1f5b
a5be05a
5524e77
28e23fd
5524e77
 
28e23fd
5524e77
cc7eba8
5524e77
28e23fd
67ad6f7
 
5524e77
 
28e23fd
5524e77
e0e242c
 
28e23fd
e0e242c
28e23fd
e0e242c
 
 
 
 
 
 
28e23fd
7ee1568
28e23fd
 
 
 
7ee1568
 
15fc4e7
7ee1568
5524e77
66b1f5b
 
15fc4e7
5524e77
c18e35f
28e23fd
 
 
c18e35f
28e23fd
31cbe45
 
66b1f5b
b96d100
66b1f5b

import gradio as gr
import json
import pandas as pd
import os
from typing import Optional
import tempfile
import requests
from openai import OpenAI
import re
import spacy
from spellchecker import SpellChecker
import difflib
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import hashlib

# ======================== WAC-GEC Import ========================
# whitespace_correction is an optional dependency: remember whether the import
# worked so the WAC-GEC option can be refused later instead of crashing here.
wac_corrector = None  # corrector instance; created lazily by initialize_wac_gec()
try:
    from whitespace_correction import WhitespaceCorrector
except ImportError:
    WAC_GEC_AVAILABLE = False
    print("⚠️ whitespace_correction not installed, WAC-GEC functionality unavailable")
else:
    WAC_GEC_AVAILABLE = True

# GEC tokenizer/model handles start empty; initialize_wac_gec() fills them in
# the first time the WAC-GEC pipeline is requested.
gec_tokenizer = gec_model = None
GEC_MODEL_NAME = "lllouo/gec_Chat-LLaMa-2-7B-FT"

# ======================== API Configuration ========================
# Endpoint handed to the OpenAI client as base_url; the key is read from the
# environment and is an empty string when DEEPSEEK_API_KEY is not set.
DEEPSEEK_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")

# ======================== NLP Tools Initialization ========================
# Load the small English spaCy pipeline, downloading it on first run if it is
# not installed yet.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import subprocess
    import sys

    # Use the interpreter that is running this app: a bare "python" on PATH may
    # be missing or belong to a different environment, in which case the model
    # would be installed somewhere spaCy cannot see it. check=True surfaces a
    # failed download immediately instead of a confusing second load error.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

# Shared spell checker used by the spelling-error-density metric.
spell = SpellChecker()

# Compiled once at import time; a sentence matching any of these is counted as
# a whitespace anomaly by calculate_whitespace_anomaly_rate().
WHITESPACE_PATTERNS = [
    re.compile(pattern)
    for pattern in (
        r'[ \t]{2,}',          # run of two or more spaces/tabs
        r'\u200B|\u2060',      # U+200B or U+2060 character present
        r'\s+([.,!?;:])',      # whitespace directly before punctuation
        r'([.,!?;:])\s{2,}',   # two or more whitespace chars after punctuation
    )
]

# ======================== Prompt Template ========================
# Instruction prompt for the LLM-based denoiser. It asks for grammar, spelling
# and whitespace fixes only, requires the reply to be a single "[output]: ..."
# line, and tells the model to leave "___" blanks unfilled.
# The text ends with an open "[input]: " marker, so the sentence to correct is
# presumably appended directly after it by the caller (the call site is outside
# this view -- confirm). The wording is runtime data: do not edit it casually.
PROMPT_TEMPLATE = """## Positioning
You are a **LANGUAGE grammatical error correction tool** that can identify and correct grammatical errors in a text.
Reply with a corrected version of the input sentence with all **grammatical**, **spelling** and **whitespace errors** fixed, making only necessary changes.
**If there are no errors, reply with a copy of the original sentence.**

## Formatting requirements
- [Input]: The sentence should start with the identifier [input], followed by the sentence provided by the user.
- [Output]: The sentence should start with the identifier [output], followed by the corrected sentence.
- **Just format the output as required, no need to give too much explanation. **
- **You only need to output [output]: corrected sentence. **

## Input and Output Examples
Example 1: Extra spaces and Missing spaces and Spelling errors
[input]: This is anexample sentence with in correct spa ces and spelling erorrs.
[output]: This is an example sentence with incorrect spaces and spelling errors.

Example 2: No errors, reply with a copy of the original sentence, don't fill in the contents of ___.
[input]: _______ such as bitcoin are becoming increasingly mainstream and have a whole host of associated ethical implications, for example, they are______ and more ______. However, they have also been used to engage in _______. 
[output]: _______ such as bitcoin are becoming increasingly mainstream and have a whole host of associated ethical implications, for example, they are______ and more ______. However, they have also been used to engage in _______.

## Task
Next, please correct the following sentence according to the above requirements.
**If there are no errors, reply with a copy of the original sentence. Don't fill in the contents of ___.**
**Remember: You only need to output [output]: Corrected sentence. **

[input]: """

# ======================== Initialize WAC + GEC ========================
def initialize_wac_gec():
    """Load the WAC and GEC models on first use and report whether both are ready.

    The whitespace corrector and the grammar-correction tokenizer/model are
    cached in the module-level ``wac_corrector``, ``gec_tokenizer`` and
    ``gec_model`` globals, so later calls skip whatever is already loaded.

    Returns:
        bool: ``True`` when both models are available. ``False`` when the
        ``whitespace_correction`` package is missing or either model failed to
        load; the failure is printed rather than raised.
    """
    global wac_corrector, gec_tokenizer, gec_model

    # 1. Initialize WAC (Whitespace Correction)
    if not WAC_GEC_AVAILABLE:
        print("❌ WAC module not installed")
        return False

    # Both models go on the same device, so detect it once.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    if wac_corrector is None:
        try:
            wac_corrector = WhitespaceCorrector.from_pretrained(
                model="eo_larger_byte",
                device=device,
                download_dir="./models"
            )
            print(f"✅ WAC whitespace correction model loaded (device: {device})")
        except Exception as e:
            print(f"❌ WAC model loading failed: {e}")
            return False

    # 2. Initialize GEC (Grammar Error Correction)
    if gec_model is None or gec_tokenizer is None:
        try:
            print(f"📥 Downloading GEC model from HuggingFace: {GEC_MODEL_NAME}")
            tokenizer = AutoTokenizer.from_pretrained(
                GEC_MODEL_NAME,
                trust_remote_code=True
            )
            model = AutoModelForCausalLM.from_pretrained(
                GEC_MODEL_NAME,
                device_map="auto" if device == "cuda" else None,
                torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
                trust_remote_code=True
            )

            # With device_map=None the model has not been placed explicitly.
            if device == "cpu":
                model = model.to(device)

            # Reuse EOS as the padding token and pad on the left.
            tokenizer.pad_token_id = tokenizer.eos_token_id
            tokenizer.padding_side = "left"
        except Exception as e:
            print(f"❌ GEC model loading failed: {e}")
            return False

        # Publish the globals only after every setup step succeeded, so a
        # failure half-way through can never leave a partially configured
        # model that later calls would mistake for a ready one.
        gec_tokenizer, gec_model = tokenizer, model
        print(f"✅ GEC grammar correction model loaded (device: {device})")

    return True

# ======================== GEC Grammar Correction Function ========================
def correct_sentence_gec(input_sentence):
    """
    Use GEC model for grammar correction

    Builds a short rewrite prompt around the sentence, runs beam-search
    generation on the already-loaded ``gec_model`` and returns only the text
    the model produced after the prompt.

    Args:
        input_sentence (str): Sentence to be corrected
    Returns:
        str: Corrected sentence
    Raises:
        ValueError: If the GEC model or tokenizer has not been initialized.
    """
    if gec_model is None or gec_tokenizer is None:
        raise ValueError("GEC model not initialized")

    prompt = f"""Rewrite the following sentence to correct grammatical errors. Return ONLY the corrected sentence.
Original: {input_sentence}
Corrected:"""

    inputs = gec_tokenizer(prompt, return_tensors="pt").to(gec_model.device)

    is_cpu = str(gec_model.device) == "cpu" or not torch.cuda.is_available()

    # Smaller generation budget on CPU, larger one on GPU.
    if is_cpu:
        max_tokens = 256
        beams = 2
    else:
        max_tokens = 512
        beams = 4

    with torch.no_grad():
        outputs = gec_model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            num_beams=beams,
            do_sample=False,
            temperature=None,
            top_p=None
        )

    # The returned sequence starts with the prompt tokens. Drop them by
    # position instead of string-replacing the decoded prompt: decoding does
    # not always reproduce the prompt text exactly (e.g. spacing around
    # punctuation), and then the whole prompt would leak into the result.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = outputs[0][prompt_length:]
    corrected_text = gec_tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    # Some generations repeat the label before the answer.
    if corrected_text.startswith("Corrected:"):
        corrected_text = corrected_text[len("Corrected:"):].strip()

    return corrected_text

# ======================== WAC-GEC Combined Processing ========================
def call_wac_gec(text):
    """
    Use WAC-GEC two-step correction:
    1. GEC model for grammar and spelling correction
    2. WAC model for whitespace correction

    Args:
        text (str): Sentence to correct.
    Returns:
        str: The corrected sentence prefixed with ``"[output]: "``.
    Raises:
        ValueError: If the models are unavailable or failed to load.
        Exception: If either correction step fails; the original error is
            attached as ``__cause__``.
    """
    if not initialize_wac_gec():
        raise ValueError("⚠️ WAC-GEC models not installed or failed to load")

    try:
        # Step 1: Use GEC model for grammar correction
        print(f"🔍 GEC processing: {text[:50]}...")
        gec_corrected = correct_sentence_gec(text)
        print(f"✅ GEC result: {gec_corrected[:50]}...")

        # Step 2: Use WAC model for whitespace correction
        print(f"🔍 WAC processing: {gec_corrected[:50]}...")
        final_corrected = wac_corrector.correct_text(gec_corrected)
        print(f"✅ WAC result: {final_corrected[:50]}...")

        return f"[output]: {final_corrected}"

    except Exception as e:
        # Chain the cause so the traceback of the real failure is preserved.
        raise Exception(f"WAC-GEC processing error: {str(e)}") from e

# ======================== Color Diff Functions ========================
def generate_colored_diff(original, cleaned):
    """
    Generate HTML diff with color annotations
    Errors in original text: red
    Corrections after denoising: green

    Both texts are split on whitespace and compared word by word. Every word
    is HTML-escaped before being emitted, so text containing ``<``, ``>``,
    ``&`` or quotes is shown literally instead of being parsed as markup.

    Args:
        original (str): Text before denoising.
        cleaned (str): Text after denoising.
    Returns:
        tuple[str, str]: HTML for the original text and for the cleaned text,
        with words joined by single spaces.
    """
    import html  # local import: only this function needs it

    removed_replaced = "color: #dc3545; font-weight: bold;"
    removed_deleted = "color: #dc3545; text-decoration: line-through;"
    added = "color: #28a745; font-weight: bold;"

    def _styled(words, style):
        # Wrap each escaped word in a span carrying the given inline style.
        return [f'<span style="{style}">{html.escape(w)}</span>' for w in words]

    original_words = original.split()
    cleaned_words = cleaned.split()

    matcher = difflib.SequenceMatcher(None, original_words, cleaned_words)

    original_html = []
    cleaned_html = []

    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            original_html.extend(html.escape(w) for w in original_words[i1:i2])
            cleaned_html.extend(html.escape(w) for w in cleaned_words[j1:j2])
        elif tag == 'replace':
            original_html.extend(_styled(original_words[i1:i2], removed_replaced))
            cleaned_html.extend(_styled(cleaned_words[j1:j2], added))
        elif tag == 'delete':
            original_html.extend(_styled(original_words[i1:i2], removed_deleted))
        elif tag == 'insert':
            cleaned_html.extend(_styled(cleaned_words[j1:j2], added))

    return ' '.join(original_html), ' '.join(cleaned_html)

def create_comparison_html(original_list, cleaned_list):
    """
    Render original/denoised text pairs as one HTML comparison table.

    The two lists are walked in lockstep (stopping at the shorter one). Each
    pair becomes a numbered row whose cells hold the color-annotated markup
    produced by generate_colored_diff().

    Returns:
        str: The table markup, including its embedded <style> block.
    """
    table_open = """
    <div style="font-family: 'Times New Roman', serif; max-width: 100%; overflow-x: auto;">
        <style>
            .comparison-table {
                width: 100%;
                border-collapse: collapse;
                margin: 20px 0;
                border: 1px solid #000;
            }
            .comparison-table th {
                background-color: #f2f2f2;
                color: #000;
                padding: 8px;
                text-align: left;
                font-weight: bold;
                border-bottom: 2px solid #000;
            }
            .comparison-table td {
                padding: 8px;
                border-bottom: 1px solid #ccc;
                line-height: 1.5;
                vertical-align: top;
            }
            .index-col {
                width: 50px;
                text-align: center;
                font-weight: bold;
                color: #555;
            }
        </style>
        <table class="comparison-table">
            <thead>
                <tr>
                    <th class="index-col">#</th>
                    <th>Original Question</th>
                    <th>Denoised Question</th>
                </tr>
            </thead>
            <tbody>
    """

    table_close = """
            </tbody>
        </table>
    </div>
    """

    # Collect the rows first, then stitch header + rows + footer together.
    row_markup = []
    for row_number, pair in enumerate(zip(original_list, cleaned_list), start=1):
        before, after = pair
        before_colored, after_colored = generate_colored_diff(str(before), str(after))
        row_markup.append(f"""
                <tr>
                    <td class="index-col">{row_number}</td>
                    <td class="original-col">{before_colored}</td>
                    <td class="cleaned-col">{after_colored}</td>
                </tr>
        """)

    return table_open + "".join(row_markup) + table_close

# ======================== Utility Functions ========================
def check_api_key(model_choice):
    """Raise ValueError when the DeepSeek model is chosen but no API key is set.

    Any other model choice passes without looking at the key.
    """
    needs_key = model_choice == "deepseek-r1-distill-llama-8b"
    if needs_key and not DEEPSEEK_API_KEY:
        raise ValueError("⚠️ Please configure DEEPSEEK_API_KEY in Space Settings!")

def call_deepseek_api(prompt, model="deepseek-r1-distill-llama-8b", temperature=0.1, stream=True):
    """Send one user message to the chat-completions endpoint and return the reply text.

    Args:
        prompt: Content of the single user message.
        model: Model identifier passed to the API (also used for the key check).
        temperature: Sampling temperature passed to the API.
        stream: When truthy, the streamed chunks are concatenated into one
            string; otherwise the message content of the first choice is
            returned as-is.
    Raises:
        ValueError: From check_api_key() when the required key is missing.
    """
    check_api_key(model)

    request = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        "stream": stream,
    }
    client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url=DEEPSEEK_BASE_URL)
    completion = client.chat.completions.create(**request)

    if not stream:
        return completion.choices[0].message.content

    # Streaming: keep only chunks that actually carry text.
    pieces = []
    for chunk in completion:
        if chunk.choices and chunk.choices[0].delta.content:
            pieces.append(chunk.choices[0].delta.content)
    return "".join(pieces)

def process_sentence(sentence):
    """Pick the sentence to denoise and make sure it ends like a sentence.

    For multi-line input the last non-empty line is used; otherwise the
    stripped input itself. If the chosen text does not already end with one of
    ``. ? ! ; ,`` the placeholder suffix ``" ___."`` is appended.
    """
    text = sentence.strip()
    non_empty = [piece.strip() for piece in text.split('\n') if piece.strip()]
    candidate = non_empty[-1] if len(non_empty) > 1 else text

    if candidate.endswith(('.', '?', '!', ';', ',')):
        return candidate
    return candidate + " ___."

def is_valid_output(content_2, content_1, content_0):
    """Return True when ``content_2`` passes three sanity checks.

    ``content_2`` is the candidate output; ``content_1`` and ``content_0`` are
    the texts it is checked against (presumably the processed and the raw
    input -- confirm with the caller). The candidate must:
      * start with ``[output]:`` and contain no newline;
      * keep a ``___`` blank if either reference text contains one;
      * be no more than twice as long, nor less than half as long, as
        ``content_1``.
    """
    well_formed = content_2.startswith('[output]:') and '\n' not in content_2
    if not well_formed:
        return False

    blank_expected = '___' in content_0 or '___' in content_1
    if blank_expected and '___' not in content_2:
        return False

    out_len, ref_len = len(content_2), len(content_1)
    return out_len <= 2 * ref_len and ref_len <= 2 * out_len

def extract_output_content(item):
    """Strip a known result prefix (and one pair of wrapping double quotes).

    Recognised prefixes are ``[output]:`` and ``[ERROR] Failed to process:``.
    The remainder is whitespace-trimmed; if it both starts and ends with a
    double quote, those two characters are removed. Returns ``None`` when the
    item carries neither prefix.
    """
    for marker in ('[output]:', '[ERROR] Failed to process:'):
        if item.startswith(marker):
            payload = item[len(marker):].strip()
            # Slices instead of indexing keep the empty payload safe.
            if payload[:1] == '"' and payload[-1:] == '"':
                return payload[1:-1]
            return payload
    return None

def has_missing_spaces(sentence):
    """Report whether a space-free string still parses into several words.

    A sentence containing any space is never flagged. Otherwise it is run
    through the spaCy pipeline and flagged when at least two of the resulting
    tokens are alphabetic.
    """
    if ' ' in sentence:
        return False
    alphabetic_count = sum(1 for token in nlp(sentence) if token.is_alpha)
    return alphabetic_count >= 2

def calculate_whitespace_anomaly_rate(sentences):
    """Return the percentage of sentences showing a whitespace anomaly.

    A sentence counts once if has_missing_spaces() flags it or if any regex in
    WHITESPACE_PATTERNS matches it. An empty input yields 0.0.
    """
    if not sentences:
        return 0.0
    flagged = sum(
        1
        for text in sentences
        if has_missing_spaces(text)
        or any(rx.search(text) for rx in WHITESPACE_PATTERNS)
    )
    return flagged / len(sentences) * 100

def normalize_tokens(text):
    """Return the lower-cased tokens of ``text`` that are worth spell-checking.

    Tokens are kept only when they are alphabetic, longer than two characters
    and not written entirely in upper case.
    """
    return [
        token.text.lower()
        for token in nlp(text)
        if token.is_alpha and len(token.text) > 2 and not token.text.isupper()
    ]

def calculate_spelling_error_density(sentences):
    """Return unknown words as a percentage of all checked words.

    Sentences flagged by has_missing_spaces() are skipped. For the rest, the
    tokens from normalize_tokens() are counted and the size of what
    ``spell.unknown`` returns for them is added to the error total. Yields 0.0
    when no words were checked.
    """
    word_count = 0
    error_count = 0
    for text in sentences:
        if has_missing_spaces(text):
            continue
        words = normalize_tokens(text)
        if words:
            error_count += len(spell.unknown(words))
            word_count += len(words)
    return error_count / word_count * 100 if word_count else 0.0

# ======================== Leaderboard Data Processing ========================
def load_leaderboard_data(json_path="leaderboard.json"):
    """Load leaderboard entries from a JSON file into a DataFrame.

    Each entry gets an ``ID`` field: the first 8 hex characters of the MD5
    digest of its ``Benchmark`` value (an identifier, not a security measure).

    Args:
        json_path: Path of the JSON file to read. Defaults to
            ``"leaderboard.json"`` in the working directory, which is what
            calls without arguments have always used.
    Returns:
        pandas.DataFrame: One row per entry, or an empty DataFrame when the
        file cannot be read or parsed (the error is printed, not raised).
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        for item in data:
            digest = hashlib.md5(item['Benchmark'].encode()).hexdigest()
            item['ID'] = digest[:8]

        return pd.DataFrame(data)
    except Exception as e:
        # Deliberately broad: a broken leaderboard file must not take the UI down.
        print(f"Error loading leaderboard: {e}")
        return pd.DataFrame()

def filter_leaderboard(df, category_query, version_query):
    """
    Filter by both category and version

    ``category_query`` keeps rows whose ``Category`` equals it, unless it is
    ``"all"``. ``version_query`` selects a substring that ``Benchmark`` must
    contain (case-insensitively); ``"all"`` or an unrecognised value applies
    no version filter. The input frame is not modified.
    """
    # Version choice -> substring expected in the benchmark name.
    version_markers = (
        ("original", "_original"),
        ("deepseek", "deepseek_r1_denoising"),
        ("wac_gec", "wac_gec"),
    )

    selected = df.copy()

    if category_query != "all":
        selected = selected[selected['Category'] == category_query]

    for version, marker in version_markers:
        if version_query == version:
            selected = selected[selected['Benchmark'].str.contains(marker, case=False, na=False)]
            break

    return selected

def search_leaderboard(df, query):
    """Return the rows whose ``Benchmark`` matches ``query``, ignoring case.

    The query is tried as a regular expression first, so existing regex
    searches keep working. If it is not a valid pattern (for example a lone
    ``(`` typed into the search box) it is matched as plain text instead of
    raising. An empty query returns ``df`` unchanged.
    """
    if not query:
        return df

    names = df['Benchmark'].str
    try:
        mask = names.contains(query, case=False, na=False)
    except (re.error, ValueError):
        # Invalid pattern: fall back to a literal substring search.
        mask = names.contains(query, case=False, na=False, regex=False)
    return df[mask]

# ======================== Dataset Denoising Function ========================
def clean_dataset(file_path, question_column, model_choice, temperature, max_samples, progress=gr.Progress()):
    """Denoise one text column of a Parquet file and report quality metrics.

    Steps: read the file, take the first ``int(max_samples)`` values of
    *question_column*, compute WAR/SED on them, send each sample to the
    selected model with retries, post-process the responses, recompute
    WAR/SED, and write the processed rows plus a ``<column>_cleaned``
    column to ``<name>-Denoising-<model>.parquet`` in the temp directory.

    Args:
        file_path: Path of the uploaded Parquet file.
        question_column: Name of the column to denoise.
        model_choice: ``"WAC-GEC"`` selects ``call_wac_gec``; any other
            value goes through ``call_deepseek_api``.
        temperature: Converted to float and passed to the DeepSeek call;
            not used for WAC-GEC.
        max_samples: Upper bound on the number of leading rows processed.
        progress: Gradio progress tracker used for status updates.

    Returns:
        ``(log_text, output_path, preview_html)`` on success. On any
        failure the tuple is ``(error_message, None, "")``; exceptions are
        not propagated to the caller.
    """
    try:
        try:
            check_api_key(model_choice)
        except ValueError as e:
            # The ValueError only aborts the DeepSeek model; for any other
            # choice it is deliberately ignored.
            if model_choice == "deepseek-r1-distill-llama-8b":
                return str(e), None, ""

        use_wac_gec = model_choice == "WAC-GEC"

        if use_wac_gec and not WAC_GEC_AVAILABLE:
            return "❌ WAC-GEC model not installed! Please install whitespace_correction package.", None, ""

        # Clicking the button without an upload yields no path; answer with
        # a clear message instead of a read_parquet traceback.
        if not file_path:
            return "❌ No file uploaded! Please upload a Parquet file first.", None, ""

        progress(0.05, desc="📁 Reading data file...")
        df = pd.read_parquet(file_path)

        if question_column not in df.columns:
            available_columns = ", ".join(df.columns.tolist())
            return f"❌ Column '{question_column}' not found!\nAvailable columns: {available_columns}", None, ""

        data_ori = df[question_column].tolist()[:int(max_samples)]
        total = len(data_ori)

        progress(0.08, desc="📊 Calculating original metrics...")
        original_sentences = [str(item) for item in data_ori]
        war_original = calculate_whitespace_anomaly_rate(original_sentences)
        sed_original = calculate_spelling_error_density(original_sentences)

        progress(0.1, desc=f"🚀 Starting denoising of {total} samples (model: {model_choice})...")

        # WAC-GEC receives the raw text; the other path first runs
        # process_sentence on every sample.
        if use_wac_gec:
            data_corrupt = [str(item) for item in data_ori]
        else:
            data_corrupt = [process_sentence(str(item)) for item in data_ori]

        results = []
        max_retries = 5 if model_choice == "deepseek-r1-distill-llama-8b" else 3
        log_text = f"🚀 Processing {total} samples...\n"
        log_text += f"📌 Using model: {model_choice}\n\n"

        for idx in range(total):
            progress((0.1 + 0.7 * idx / total), desc=f"Processing: {idx+1}/{total}")

            unprocess_text = str(data_ori[idx])
            original_text = data_corrupt[idx]
            response_content = ""
            retry_count = 0

            while retry_count < max_retries:
                try:
                    if use_wac_gec:
                        response_content = call_wac_gec(original_text)
                        accepted = response_content.startswith('[output]:')
                    else:
                        response_content = call_deepseek_api(
                            PROMPT_TEMPLATE + original_text,
                            model=model_choice,
                            temperature=float(temperature)
                        )
                        accepted = is_valid_output(response_content, original_text, unprocess_text)

                    if accepted:
                        results.append(response_content)
                        break
                    retry_count += 1

                except Exception as e:
                    retry_count += 1
                    log_text += f"⚠️ Sample {idx+1} error, retry {retry_count}/{max_retries}: {str(e)}\n"
            else:
                # Reached only when every attempt failed (no break), so each
                # sample contributes exactly one entry to results.
                results.append(f"[ERROR] Failed to process: {original_text}")
                log_text += f"❌ Sample {idx+1} processing failed\n"

        progress(0.85, desc="📊 Post-processing...")

        lst_extracted = []
        error_count = 0
        unknown_count = 0

        for i, item in enumerate(results):
            extracted = extract_output_content(item)
            if extracted is None:
                # Nothing could be extracted: fall back to the original text.
                lst_extracted.append(str(data_ori[i]))
                unknown_count += 1
            else:
                lst_extracted.append(extracted)
                if item.startswith('[ERROR]'):
                    error_count += 1

        lst_final = []
        for raw, extracted in zip(data_ori, lst_extracted):
            item = str(raw)
            tmp_lines = []
            if '\n' in item and not use_wac_gec:
                tmp_lines = [line.strip() for line in item.strip().split('\n') if line.strip()]
            if tmp_lines:
                # Multi-line sample: only the last non-blank line is replaced
                # by the denoised text, the preceding lines are kept.
                tmp_lines[-1] = extracted
                lst_final.append('\n'.join(tmp_lines))
            else:
                # Single-line sample, WAC-GEC, or a multi-line sample made of
                # blank lines only (which has no last line to replace).
                lst_final.append(extracted)

        progress(0.90, desc="📊 Calculating denoised metrics...")
        cleaned_sentences = [str(item) for item in lst_final]
        war_cleaned = calculate_whitespace_anomaly_rate(cleaned_sentences)
        sed_cleaned = calculate_spelling_error_density(cleaned_sentences)

        delta_war = war_cleaned - war_original
        delta_sed = sed_cleaned - sed_original

        progress(0.95, desc="💾 Saving results...")

        # Only the first `total` rows were processed. Keep exactly those rows
        # so the new column matches the frame length; assigning a shorter
        # list to the full frame raises ValueError.
        df_cleaned = df.iloc[:total].copy()
        df_cleaned[question_column + '_cleaned'] = lst_final

        original_filename = os.path.basename(file_path)
        base_name = original_filename.replace('.parquet', '')
        model_suffix = "WAC-GEC" if use_wac_gec else "DeepSeek"
        output_filename = f"{base_name}-Denoising-{model_suffix}.parquet"
        output_path = os.path.join(tempfile.gettempdir(), output_filename)

        df_cleaned.to_parquet(output_path, index=False)

        log_text += f"\n\n📊 Processing Complete!\n"
        log_text += f"{'='*50}\n"
        log_text += f"【Basic Statistics】\n"
        log_text += f"- Model used: {model_choice}\n"
        log_text += f"- Total samples: {total}\n"
        log_text += f"- Successfully processed: {total - error_count - unknown_count}\n"
        log_text += f"- Failed samples: {error_count}\n"
        log_text += f"- Unknown format: {unknown_count}\n"
        log_text += f"- Output file: {output_filename}\n\n"

        log_text += f"【Quality Metrics】\n"
        log_text += f"📍 Whitespace Anomaly Rate (WAR):\n"
        log_text += f"   Original: {war_original:.2f}% → Denoised: {war_cleaned:.2f}%\n"
        log_text += f"   Change: {delta_war:+.2f}% {'✅ Improved' if delta_war < 0 else '⚠️ Increased'}\n\n"

        log_text += f"📍 Spelling Error Density (SED):\n"
        log_text += f"   Original: {sed_original:.2f}% → Denoised: {sed_cleaned:.2f}%\n"
        log_text += f"   Change: {delta_sed:+.2f}% {'✅ Improved' if delta_sed < 0 else '⚠️ Increased'}\n"

        if use_wac_gec:
            log_text += f"\n💡 Note: WAC-GEC uses two-step correction (GEC grammar + WAC whitespace)\n"

        log_text += f"{'='*50}\n"

        preview_html = create_comparison_html(data_ori[:5], lst_final[:5])

        progress(1.0, desc="✅ Complete!")

        return log_text, output_path, preview_html

    except Exception as e:
        # Top-level boundary for the UI callback: report the failure with
        # its traceback in the log box instead of raising.
        import traceback
        error_detail = traceback.format_exc()
        return f"❌ Processing error: {str(e)}\n\nDetailed error:\n{error_detail}", None, ""

# ======================== Text Content ========================
# Markdown body of the "📝 About" tab; rendered via gr.Markdown(ABOUT_TEXT)
# in the interface section below.
ABOUT_TEXT = """
## Denoising Workflow

### Supported Models

#### 1. DeepSeek-R1 (deepseek-r1-distill-llama-8b)
- **Function**: Comprehensive grammar, spelling, and whitespace error correction
- **Advantages**: Strong comprehensive capability, handles multiple error types
- **Configuration**: Requires DEEPSEEK_API_KEY in Space Settings

#### 2. WAC-GEC (Whitespace + Grammar Error Correction)
- **Function**: Two-step correction workflow
  - **Step 1 (GEC)**: Use LLaMA-2-7B fine-tuned model for grammar and spelling correction
  - **Step 2 (WAC)**: Use whitespace correction model for spacing issues
- **Advantages**: 
  - Fully local, no API key required
  - Combines two specialized models
  - Suitable for offline environments and limited budgets
- **Model Source**: 
  - GEC: [lllouo/gec_Chat-LLaMa-2-7B-FT](https://huggingface.co/lllouo/gec_Chat-LLaMa-2-7B-FT)
  - WAC: whitespace_correction library

### Core Algorithm

1. **Preprocessing (process_sentence)**
   - Detect sentence completeness
   - Add marker `___` for incomplete sentences (DeepSeek only)
   - Preserve multi-line text format

2. **Model Denoising**
   - **DeepSeek**: Use API for comprehensive error correction, up to 5 retries
   - **WAC-GEC**: 
     - First use GEC model for grammar and spelling correction
     - Then use WAC model for whitespace correction
     - Up to 3 retries

3. **Format Validation**
   - Verify output format correctness
   - Check marker preservation
   - Length reasonability check

4. **Post-processing**
   - Extract denoised content
   - Restore original multi-line format
   - Generate Parquet file with model identifier

### Supported Datasets

- **MMLU**: Multiple choice questions across 57 subjects
- **GSM8K**: Math reasoning problems
- **ARC-Challenge**: Science Q&A
- **MedMCQA**: Medical multiple choice
- **CoQA**: Conversational Q&A
- And more...

### Color Annotation Legend

- 🔴 **Red**: Errors in original text (spelling, grammar, spacing, etc.)
- 🟢 **Green**: Corrections after denoising
- ⚫ **Black**: Unchanged correct parts

### Tech Stack

- **LLM**: DeepSeek API (deepseek-r1-distill-llama-8b)
- **Local Models**: 
  - GEC: LLaMA-2-7B (fine-tuned for grammar correction)
  - WAC: Whitespace Correction Model
- **Frontend**: Gradio 4.16.0
- **Data Processing**: Pandas + PyArrow (Parquet)
- **Diff Comparison**: Python difflib
- **NLP Tools**: spaCy, pyspellchecker
- **API Calls**: OpenAI SDK
- **Deployment**: Hugging Face Spaces

### Quality Metrics

- **WAR (Whitespace Anomaly Rate)**: Whitespace anomaly rate
- **SED (Spelling Error Density)**: Spelling error density

### Model Selection Guide

- **Need comprehensive denoising + API budget**: Choose DeepSeek-R1
- **Local deployment + complete correction**: Choose WAC-GEC (Recommended)
- **Only need spacing correction**: Use WAC module alone
- **Fastest speed**: Use GPU-accelerated WAC-GEC

---

**Graduate Thesis Research Showcase** | Powered by DeepSeek API & WAC-GEC
"""

# ======================== Gradio Interface ========================
# Top-level Gradio app: a leaderboard tab, a denoising demo tab and an
# about tab. Components are created inside the `with demo:` context.
demo = gr.Blocks(title="Dataset Denoising Framework Demo System", css="""
    .markdown-text { font-size: 16px; line-height: 1.6; }
""")

with demo:
    gr.Markdown(
        """<div style="text-align: center;"><h1>⭐ <span style='color: #e6b800;'>Denoising Factory</span> Based on Benchmark Denoising Framework</h1></div>
        <br>
        <p>This system demonstrates the denoising effects of DeepSeek-R1 and WAC-GEC methods on mainstream benchmark datasets based on <a href="https://github.com/LLLoUo/bd-toolkit" target="_blank">BD-toolkit</a>. Quality is evaluated using WAR (Whitespace Anomaly Rate) and SED (Spelling Error Density) metrics.</p>
        """,
        elem_classes="markdown-text"
    )

    leaderboard_data = load_leaderboard_data()

    # Columns shown in the visible table. reindex() is used for selection so
    # that a frame lacking them (e.g. after a failed load) produces empty
    # columns instead of a KeyError while the UI is being built.
    display_columns = ['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("📊 BD-benchmarks Leaderboard", id=0):
            with gr.Column():
                gr.Markdown("### Mainstream Benchmark Leaderboard After BD Denoising")

                with gr.Row():
                    search_bar = gr.Textbox(
                        placeholder="🔍 Search benchmark name and press ENTER...",
                        show_label=False,
                        elem_id="search-bar",
                    )
                    filter_categories = gr.Radio(
                        label="📂 Filter by Benchmark Category",
                        choices=["all", "BT", "RA", "TG", "SU", "ME", "GR"],
                        value="all",
                        elem_id="filter-columns",
                    )
                    filter_versions = gr.Radio(
                        label="🔖 Filter by Dataset Version",
                        choices=[
                            ("All Versions", "all"),
                            ("Original", "original"),
                            ("DeepSeek-R1-denoised", "deepseek"),
                            ("WAC-GEC", "wac_gec")
                        ],
                        value="all",
                        elem_id="filter-versions",
                    )

                leaderboard_table = gr.Dataframe(
                    value=leaderboard_data.reindex(columns=display_columns),
                    headers=['ID', 'Category', 'Benchmark', 'WAR (%)', 'SED', 'Download'],
                    # ID is an 8-character hex string, so it is declared as
                    # text rather than as a number.
                    datatype=['str', 'str', 'str', 'number', 'number', 'markdown'],
                    elem_id="leaderboard-table",
                    interactive=False,
                )

                # Invisible copy of the full data; passed to the callbacks as
                # the unfiltered source.
                hidden_leaderboard = gr.Dataframe(
                    value=leaderboard_data,
                    visible=False
                )

                def combined_filter(df, category, version, query=""):
                    """Apply both radio filters and the search text, then keep the displayed columns."""
                    filtered = filter_leaderboard(df, category, version)
                    filtered = search_leaderboard(filtered, query)
                    return filtered.reindex(columns=display_columns)

                # All three controls feed the same callback with the same
                # inputs, so the table always reflects the search text and
                # both radio selections together.
                leaderboard_inputs = [hidden_leaderboard, filter_categories, filter_versions, search_bar]

                search_bar.submit(
                    combined_filter,
                    leaderboard_inputs,
                    leaderboard_table
                )

                filter_categories.change(
                    combined_filter,
                    leaderboard_inputs,
                    leaderboard_table
                )

                filter_versions.change(
                    combined_filter,
                    leaderboard_inputs,
                    leaderboard_table
                )

                gr.Markdown("""
                **Legend:**
                - **Category**: BT=Basic Tasks, RA=Reasoning Abilities, TG=Text Generation, SU=Speech Understanding, ME=Medical, GR=Grammar
                - **Version**: Original=Unprocessed dataset, DeepSeek-R1=DeepSeek denoised version, WAC-GEC=WAC-GEC denoised version
                - **WAR**: Whitespace Anomaly Rate (lower is better)
                - **SED**: Spelling Error Density (lower is better)
                """, elem_classes="markdown-text")

        with gr.TabItem("🚀 BD-toolkit Demo", id=2):
            gr.Markdown("## BD-toolkit Lightweight Demo")

            # The icon follows the actual state, so an unavailable model is
            # not shown with a check mark.
            wac_icon = "✅" if WAC_GEC_AVAILABLE else "❌"
            deepseek_icon = "✅" if DEEPSEEK_API_KEY else "❌"
            model_status = f"{wac_icon} WAC-GEC: " + ("Available" if WAC_GEC_AVAILABLE else "Not Installed")
            model_status += f" | {deepseek_icon} DeepSeek-R1: " + ("Configured" if DEEPSEEK_API_KEY else "API Key Not Configured")
            gr.Markdown(f"**Model Status**: {model_status}")

            with gr.Row():
                with gr.Column():
                    file_input = gr.File(
                        label="📁 Upload Parquet File",
                        file_types=[".parquet"]
                    )

                    question_column = gr.Textbox(
                        label="📝 Question Column Name",
                        value="question",
                        placeholder="e.g., question, input_text, prompt"
                    )

                    model_choice = gr.Dropdown(
                        choices=["WAC-GEC", "deepseek-r1-distill-llama-8b"],
                        value="WAC-GEC",
                        label="🤖 Select Model",
                        info="DeepSeek: Comprehensive correction | WAC-GEC: Grammar + whitespace (local model)"
                    )

                    # Starts disabled because the default model is WAC-GEC;
                    # update_temperature_interactive toggles it on model change.
                    temperature = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.1,
                        step=0.1,
                        label="🌡️ Temperature",
                        info="Only effective for DeepSeek",
                        interactive=False
                    )

                    max_samples = gr.Slider(
                        minimum=1,
                        maximum=100,
                        value=5,
                        step=1,
                        label="📊 Number of Samples to Process (Demo Limit)"
                    )

                    clean_btn = gr.Button("🚀 Start Denoising", variant="primary", size="lg")

                with gr.Column():
                    output_text = gr.Textbox(
                        label="⏳ Processing Progress",
                        lines=10,
                        max_lines=15
                    )

                    download_file = gr.File(label="📥 Download Denoised Dataset")

            def update_temperature_interactive(model):
                """Enable the temperature slider only for the DeepSeek model."""
                if model == "deepseek-r1-distill-llama-8b":
                    return gr.update(interactive=True, info="Adjust generation randomness")
                else:
                    return gr.update(interactive=False, info="WAC-GEC model does not support temperature parameter")

            model_choice.change(
                fn=update_temperature_interactive,
                inputs=[model_choice],
                outputs=[temperature]
            )

            gr.Markdown("### 🎨 Denoising Effect Comparison Preview")
            gr.Markdown("""
            **Color Legend**: 
            - 🔴 <span style="color: #dc3545;">Red</span> = Errors in original text
            - 🟢 <span style="color: #28a745;">Green</span> = Corrections after denoising
            - ⚫ Black = Unchanged correct parts
            """)

            colored_preview = gr.HTML(label="")

            # clean_dataset returns (log text, output file path, preview HTML),
            # matching the three outputs in order.
            clean_btn.click(
                fn=clean_dataset,
                inputs=[file_input, question_column, model_choice, temperature, max_samples],
                outputs=[output_text, download_file, colored_preview]
            )

        with gr.TabItem("📝 About", id=3):
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")

if __name__ == "__main__":
    # Script entry point: run initialize_wac_gec() before the server starts,
    # then serve the Blocks app on all interfaces at port 7860.
    print("🚀 Preloading WAC-GEC models...")
    initialize_wac_gec()

    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "ssr_mode": False,
    }
    demo.launch(**launch_options)