Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import json | |
| import pandas as pd | |
| import os | |
| from typing import Optional | |
| import tempfile | |
| import requests | |
| from openai import OpenAI | |
| import re | |
| import spacy | |
| from spellchecker import SpellChecker | |
| import difflib | |
| import torch | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| import hashlib | |
| # ======================== WAC-GEC Import ======================== | |
# Optional dependency: the WAC (whitespace correction) backend. The flag
# records whether the import worked; the corrector itself is created later by
# initialize_wac_gec().
wac_corrector = None
try:
    from whitespace_correction import WhitespaceCorrector
except ImportError:
    WAC_GEC_AVAILABLE = False
    print("β οΈ whitespace_correction not installed, WAC-GEC functionality unavailable")
else:
    WAC_GEC_AVAILABLE = True

# GEC (grammar error correction) tokenizer/model handles, also loaded lazily.
gec_tokenizer = None
gec_model = None
GEC_MODEL_NAME = "lllouo/gec_Chat-LLaMa-2-7B-FT"

# ======================== API Configuration ========================
# The key comes from the environment; an empty string means "not configured".
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "")
DEEPSEEK_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"
| # ======================== NLP Tools Initialization ======================== | |
# Load the small English spaCy pipeline, downloading it on first use.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # OSError from spacy.load is treated as "model package not installed".
    import subprocess
    import sys

    # Use the running interpreter (not whatever "python" resolves to on PATH)
    # so the model is installed into this environment, and fail loudly if the
    # download itself fails instead of hitting a second, less clear OSError.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

# Shared spell checker used by calculate_spelling_error_density().
spell = SpellChecker()
# Regex sources for whitespace anomalies; a sentence is flagged when any one
# of the compiled patterns matches.
_WHITESPACE_PATTERN_SOURCES = (
    r'[ \t]{2,}',         # run of two or more spaces/tabs
    r'\u200B|\u2060',     # zero-width space or word joiner
    r'\s+([.,!?;:])',     # whitespace directly before punctuation
    r'([.,!?;:])\s{2,}',  # two or more whitespace characters after punctuation
)
WHITESPACE_PATTERNS = [re.compile(source) for source in _WHITESPACE_PATTERN_SOURCES]
| # ======================== Prompt Template ======================== | |
# Instruction prompt for the DeepSeek path. clean_dataset() appends the
# sentence to correct directly after the trailing "[input]: " marker, and
# is_valid_output() expects the reply to start with "[output]:".
PROMPT_TEMPLATE = """## Positioning
You are a **LANGUAGE grammatical error correction tool** that can identify and correct grammatical errors in a text.
Reply with a corrected version of the input sentence with all **grammatical**, **spelling** and **whitespace errors** fixed, making only necessary changes.
**If there are no errors, reply with a copy of the original sentence.**
## Formatting requirements
- [Input]: The sentence should start with the identifier [input], followed by the sentence provided by the user.
- [Output]: The sentence should start with the identifier [output], followed by the corrected sentence.
- **Just format the output as required, no need to give too much explanation. **
- **You only need to output [output]: corrected sentence. **
## Input and Output Examples
Example 1: Extra spaces and Missing spaces and Spelling errors
[input]: This is anexample sentence with in correct spa ces and spelling erorrs.
[output]: This is an example sentence with incorrect spaces and spelling errors.
Example 2: No errors, reply with a copy of the original sentence, don't fill in the contents of ___.
[input]: _______ such as bitcoin are becoming increasingly mainstream and have a whole host of associated ethical implications, for example, they are______ and more ______. However, they have also been used to engage in _______.
[output]: _______ such as bitcoin are becoming increasingly mainstream and have a whole host of associated ethical implications, for example, they are______ and more ______. However, they have also been used to engage in _______.
## Task
Next, please correct the following sentence according to the above requirements.
**If there are no errors, reply with a copy of the original sentence. Don't fill in the contents of ___.**
**Remember: You only need to output [output]: Corrected sentence. **
[input]: """
| # ======================== Initialize WAC + GEC ======================== | |
def initialize_wac_gec():
    """Load the WAC and GEC models on first use and cache them in module globals.

    Models that are already loaded are left untouched, so repeated calls are
    cheap once initialization has succeeded.

    Returns:
        bool: True when both models are ready; False when the WAC package is
        missing or either model fails to load (the reason is printed).
    """
    global wac_corrector, gec_tokenizer, gec_model

    # Nothing can be loaded without the whitespace_correction package.
    if not WAC_GEC_AVAILABLE:
        print("β WAC module not installed")
        return False

    # 1. WAC (whitespace correction) -- built only once.
    if wac_corrector is None:
        try:
            wac_device = "cuda" if torch.cuda.is_available() else "cpu"
            wac_corrector = WhitespaceCorrector.from_pretrained(
                model="eo_larger_byte",
                device=wac_device,
                download_dir="./models",
            )
            print(f"β WAC whitespace correction model loaded (device: {wac_device})")
        except Exception as load_error:
            print(f"β WAC model loading failed: {load_error}")
            return False

    # 2. GEC (grammar error correction) -- skip when both halves are present.
    if gec_model is not None and gec_tokenizer is not None:
        return True

    try:
        on_gpu = torch.cuda.is_available()
        gec_device = "cuda" if on_gpu else "cpu"
        print(f"π₯ Downloading GEC model from HuggingFace: {GEC_MODEL_NAME}")
        gec_tokenizer = AutoTokenizer.from_pretrained(
            GEC_MODEL_NAME,
            trust_remote_code=True,
        )
        gec_model = AutoModelForCausalLM.from_pretrained(
            GEC_MODEL_NAME,
            device_map="auto" if on_gpu else None,
            torch_dtype=torch.bfloat16 if on_gpu else torch.float32,
            trust_remote_code=True,
        )
        if not on_gpu:
            # No device_map was given on CPU, so place the model explicitly.
            gec_model = gec_model.to(gec_device)
        # Reuse EOS as the padding token and pad on the left.
        gec_tokenizer.pad_token_id = gec_tokenizer.eos_token_id
        gec_tokenizer.padding_side = "left"
        print(f"β GEC grammar correction model loaded (device: {gec_device})")
    except Exception as load_error:
        print(f"β GEC model loading failed: {load_error}")
        return False

    return True
| # ======================== GEC Grammar Correction Function ======================== | |
def correct_sentence_gec(input_sentence):
    """Correct grammar in one sentence with the locally loaded GEC model.

    Args:
        input_sentence (str): Sentence to be corrected.

    Returns:
        str: The model's continuation after the prompt, stripped, with a
        leading "Corrected:" label removed if the model repeated it.

    Raises:
        ValueError: If the GEC model or tokenizer has not been initialized.
    """
    if gec_model is None or gec_tokenizer is None:
        raise ValueError("GEC model not initialized")

    prompt = f"""Rewrite the following sentence to correct grammatical errors. Return ONLY the corrected sentence.
Original: {input_sentence}
Corrected:"""

    inputs = gec_tokenizer(prompt, return_tensors="pt").to(gec_model.device)

    # Use a smaller generation budget on CPU to keep latency tolerable.
    is_cpu = str(gec_model.device) == "cpu" or not torch.cuda.is_available()
    max_tokens, beams = (256, 2) if is_cpu else (512, 4)

    with torch.no_grad():
        outputs = gec_model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            num_beams=beams,
            do_sample=False,
            temperature=None,
            top_p=None,
        )

    # A causal LM returns prompt tokens followed by the new tokens. Slice the
    # prompt off by token count instead of string-replacing the decoded prompt:
    # decoding does not always reproduce the prompt text exactly (e.g. spacing
    # around punctuation), in which case the whole prompt would leak into the
    # "corrected" sentence.
    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = outputs[0][prompt_length:]
    corrected_text = gec_tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    if corrected_text.startswith("Corrected:"):
        corrected_text = corrected_text[len("Corrected:"):].strip()
    return corrected_text
| # ======================== WAC-GEC Combined Processing ======================== | |
def call_wac_gec(text):
    """Run the two-step WAC-GEC correction on one text.

    The GEC model corrects grammar and spelling first; the WAC model then
    fixes whitespace in that result.

    Returns:
        str: The corrected text prefixed with "[output]: ".

    Raises:
        ValueError: If the models are unavailable or fail to load.
        Exception: If either correction step fails (message wraps the cause).
    """
    models_ready = initialize_wac_gec()
    if not models_ready:
        raise ValueError("β οΈ WAC-GEC models not installed or failed to load")

    try:
        # Step 1: grammar/spelling correction.
        print(f"π GEC processing: {text[:50]}...")
        grammar_fixed = correct_sentence_gec(text)
        print(f"β GEC result: {grammar_fixed[:50]}...")

        # Step 2: whitespace correction on the GEC output.
        print(f"π WAC processing: {grammar_fixed[:50]}...")
        whitespace_fixed = wac_corrector.correct_text(grammar_fixed)
        print(f"β WAC result: {whitespace_fixed[:50]}...")

        return f"[output]: {whitespace_fixed}"
    except Exception as e:
        message = f"WAC-GEC processing error: {e}"
        raise Exception(message)
| # ======================== Color Diff Functions ======================== | |
def generate_colored_diff(original, cleaned):
    """Build a word-level HTML diff of two texts.

    Words are compared with difflib.SequenceMatcher. Changed or removed words
    of ``original`` are wrapped in red spans; changed or inserted words of
    ``cleaned`` are wrapped in green spans; unchanged words are left plain.
    Every word is HTML-escaped because the texts come from uploaded datasets
    and must not be able to inject markup into the preview.

    Args:
        original (str): Text before denoising.
        cleaned (str): Text after denoising.

    Returns:
        tuple[str, str]: (original_html, cleaned_html), words joined by spaces.
    """
    from html import escape  # local import keeps the block self-contained

    def wrap(words, style):
        # Escape first, then wrap, so only our own markup reaches the page.
        return [f'<span style="{style}">{escape(w, quote=False)}</span>' for w in words]

    def plain(words):
        return [escape(w, quote=False) for w in words]

    red_bold = "color: #dc3545; font-weight: bold;"
    red_strike = "color: #dc3545; text-decoration: line-through;"
    green_bold = "color: #28a745; font-weight: bold;"

    original_words = original.split()
    cleaned_words = cleaned.split()
    matcher = difflib.SequenceMatcher(None, original_words, cleaned_words)

    original_html = []
    cleaned_html = []
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag == 'equal':
            original_html.extend(plain(original_words[i1:i2]))
            cleaned_html.extend(plain(cleaned_words[j1:j2]))
        elif tag == 'replace':
            original_html.extend(wrap(original_words[i1:i2], red_bold))
            cleaned_html.extend(wrap(cleaned_words[j1:j2], green_bold))
        elif tag == 'delete':
            original_html.extend(wrap(original_words[i1:i2], red_strike))
        elif tag == 'insert':
            cleaned_html.extend(wrap(cleaned_words[j1:j2], green_bold))

    return ' '.join(original_html), ' '.join(cleaned_html)
def create_comparison_html(original_list, cleaned_list):
    """Render a three-column HTML table comparing original and denoised texts.

    Each row shows a 1-based index plus the colored diff of one
    (original, cleaned) pair; pairs are formed with zip(), so the shorter
    list determines the number of rows.

    Returns:
        str: A self-contained HTML fragment (styles, table, rows).
    """
    table_open = """
<div style="font-family: 'Times New Roman', serif; max-width: 100%; overflow-x: auto;">
<style>
.comparison-table {
width: 100%;
border-collapse: collapse;
margin: 20px 0;
border: 1px solid #000;
}
.comparison-table th {
background-color: #f2f2f2;
color: #000;
padding: 8px;
text-align: left;
font-weight: bold;
border-bottom: 2px solid #000;
}
.comparison-table td {
padding: 8px;
border-bottom: 1px solid #ccc;
line-height: 1.5;
vertical-align: top;
}
.index-col {
width: 50px;
text-align: center;
font-weight: bold;
color: #555;
}
</style>
<table class="comparison-table">
<thead>
<tr>
<th class="index-col">#</th>
<th>Original Question</th>
<th>Denoised Question</th>
</tr>
</thead>
<tbody>
"""
    table_close = """
</tbody>
</table>
</div>
"""

    fragments = [table_open]
    row_number = 0
    for before, after in zip(original_list, cleaned_list):
        row_number += 1
        before_html, after_html = generate_colored_diff(str(before), str(after))
        fragments.append(f"""
<tr>
<td class="index-col">{row_number}</td>
<td class="original-col">{before_html}</td>
<td class="cleaned-col">{after_html}</td>
</tr>
""")
    fragments.append(table_close)
    return "".join(fragments)
| # ======================== Utility Functions ======================== | |
def check_api_key(model_choice):
    """Ensure the DeepSeek API key is configured when the DeepSeek model is chosen.

    Raises:
        ValueError: If ``model_choice`` is the DeepSeek model and
            DEEPSEEK_API_KEY is empty. Other models need no key.
    """
    needs_key = model_choice == "deepseek-r1-distill-llama-8b"
    if needs_key and not DEEPSEEK_API_KEY:
        raise ValueError("β οΈ Please configure DEEPSEEK_API_KEY in Space Settings!")
def call_deepseek_api(prompt, model="deepseek-r1-distill-llama-8b", temperature=0.1, stream=True):
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt (str): Full prompt text.
        model (str): Model identifier passed to the chat-completions endpoint.
        temperature (float): Sampling temperature.
        stream (bool): When True, the reply is streamed and the non-empty
            content deltas are concatenated.

    Returns:
        The reply content.

    Raises:
        ValueError: Propagated from check_api_key() when the key is missing.
    """
    check_api_key(model)
    client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url=DEEPSEEK_BASE_URL)
    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        stream=stream,
    )

    if not stream:
        return completion.choices[0].message.content

    # Streaming: skip chunks without choices or with empty content deltas.
    pieces = [
        chunk.choices[0].delta.content
        for chunk in completion
        if chunk.choices and chunk.choices[0].delta.content
    ]
    return "".join(pieces)
def process_sentence(sentence):
    """Pick the line to denoise and mark it when it looks unfinished.

    For multi-line input only the last non-empty line is used; otherwise the
    stripped input itself. If that line does not end with one of
    ``. ? ! ; ,`` the placeholder `` ___.`` is appended.

    Returns:
        str: The target line, possibly with the placeholder suffix.
    """
    stripped = sentence.strip()
    non_empty_lines = [part.strip() for part in stripped.split('\n') if part.strip()]
    target = non_empty_lines[-1] if len(non_empty_lines) > 1 else stripped

    # endswith() on an empty string is False, so empty input gets the marker.
    if target.endswith(('.', '?', '!', ';', ',')):
        return target
    return target + " ___."
def is_valid_output(content_2, content_1, content_0):
    """Validate a model reply against the text that was sent.

    Args:
        content_2 (str): Model reply.
        content_1 (str): Pre-processed text that was sent to the model.
        content_0 (str): Raw text before pre-processing.

    Returns:
        bool: True only if the reply is a single line starting with
        ``[output]:``, keeps the ``___`` placeholder when either input had
        one, and neither the reply nor ``content_1`` is more than twice as
        long as the other.
    """
    single_tagged_line = content_2.startswith('[output]:') and '\n' not in content_2
    if not single_tagged_line:
        return False

    placeholder_expected = '___' in content_0 or '___' in content_1
    if placeholder_expected and '___' not in content_2:
        return False

    reply_len, sent_len = len(content_2), len(content_1)
    return not (reply_len > 2 * sent_len or sent_len > 2 * reply_len)
def extract_output_content(item):
    """Strip the result marker (and one pair of wrapping double quotes) from ``item``.

    Recognized markers are ``[output]:`` and ``[ERROR] Failed to process:``.

    Returns:
        str | None: The stripped payload, or None when ``item`` starts with
        neither marker.
    """
    for marker in ('[output]:', '[ERROR] Failed to process:'):
        if not item.startswith(marker):
            continue
        payload = item[len(marker):].strip()
        if payload and payload[0] == '"' and payload[-1] == '"':
            return payload[1:-1]
        return payload
    return None
def has_missing_spaces(sentence):
    """Report whether a space-free string tokenizes into several words.

    Returns:
        bool: False if ``sentence`` contains any space character; otherwise
        True when the spaCy pipeline yields at least two alphabetic tokens.
    """
    if ' ' in sentence:
        return False
    alphabetic_count = sum(1 for token in nlp(sentence) if token.is_alpha)
    return alphabetic_count >= 2
def calculate_whitespace_anomaly_rate(sentences):
    """Return the percentage of sentences with a whitespace anomaly (WAR).

    A sentence counts once if it has missing spaces or matches any entry of
    WHITESPACE_PATTERNS. An empty input yields 0.0.
    """
    if not sentences:
        return 0.0
    flagged = sum(
        1
        for text in sentences
        if has_missing_spaces(text) or any(p.search(text) for p in WHITESPACE_PATTERNS)
    )
    return flagged / len(sentences) * 100
def normalize_tokens(text):
    """Tokenize ``text`` with spaCy and return the lowercase words to spell-check.

    Tokens are kept only if they are alphabetic, longer than two characters,
    and not written entirely in upper case.
    """
    return [
        token.text.lower()
        for token in nlp(text)
        if token.is_alpha and len(token.text) > 2 and not token.text.isupper()
    ]
def calculate_spelling_error_density(sentences):
    """Return spelling errors per 100 checked words (SED) over ``sentences``.

    Sentences flagged by has_missing_spaces() are skipped, as are sentences
    with no checkable tokens. Returns 0.0 when no words were checked.

    NOTE(review): errors are counted as ``len(spell.unknown(tokens))``; if
    ``unknown`` returns a set, a misspelling repeated within one sentence is
    counted once while every token counts toward the total -- confirm intended.
    """
    word_total = 0
    error_total = 0
    for text in sentences:
        tokens = [] if has_missing_spaces(text) else normalize_tokens(text)
        if tokens:
            error_total += len(spell.unknown(tokens))
            word_total += len(tokens)
    return error_total / word_total * 100 if word_total else 0.0
| # ======================== Leaderboard Data Processing ======================== | |
def load_leaderboard_data():
    """Load ``leaderboard.json`` into a DataFrame and add a short ``ID`` column.

    Each entry's ID is the first 8 hex characters of the MD5 of its
    ``Benchmark`` name (used only as a stable identifier, not for security).

    Returns:
        pandas.DataFrame: The leaderboard rows. If the file is missing or
        malformed, an empty frame that still has the columns the UI selects,
        so column selection on the result does not raise KeyError.
    """
    json_path = "leaderboard.json"
    # Columns the interface selects from this frame.
    expected_columns = ['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']
    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for item in data:
            digest = hashlib.md5(item['Benchmark'].encode()).hexdigest()
            item['ID'] = digest[:8]
        return pd.DataFrame(data)
    except Exception as e:
        # Best effort: the app should still start without leaderboard data.
        print(f"Error loading leaderboard: {e}")
        return pd.DataFrame(columns=expected_columns)
def filter_leaderboard(df, category_query, version_query):
    """Filter the leaderboard by category and by dataset version.

    Args:
        df (pandas.DataFrame): Leaderboard with ``Category`` and ``Benchmark``.
        category_query (str): Exact category to keep, or "all".
        version_query (str): "original", "deepseek", "wac_gec", or "all";
            any other value applies no version filter.

    Returns:
        pandas.DataFrame: A filtered copy; ``df`` itself is not modified.
    """
    # Version key -> substring searched (case-insensitively) in Benchmark.
    version_markers = (
        ("original", '_original'),
        ("deepseek", 'deepseek_r1_denoising'),
        ("wac_gec", 'wac_gec'),
    )

    result = df.copy()
    if category_query != "all":
        result = result[result['Category'] == category_query]

    marker = next((m for key, m in version_markers if version_query == key), None)
    if marker is not None:
        result = result[result['Benchmark'].str.contains(marker, case=False, na=False)]
    return result
def search_leaderboard(df, query):
    """Return the rows whose ``Benchmark`` contains ``query`` (case-insensitive).

    The query is matched as a literal substring, not a regular expression, so
    user input such as "(" or "c++" cannot raise a regex error.

    Args:
        df (pandas.DataFrame): Leaderboard with a ``Benchmark`` column.
        query (str): Search text; empty/None returns ``df`` unchanged.
    """
    if not query:
        return df
    mask = df['Benchmark'].str.contains(query, case=False, na=False, regex=False)
    return df[mask]
| # ======================== Dataset Denoising Function ======================== | |
def _describe_metric_change(delta):
    """Return the log label for a metric delta (negative means the metric dropped)."""
    if delta < 0:
        return 'β Improved'
    if delta > 0:
        return 'β οΈ Increased'
    return 'No change'


def clean_dataset(file_path, question_column, model_choice, temperature, max_samples, progress=gr.Progress()):
    """Denoise one column of an uploaded Parquet file and report quality metrics.

    The first ``max_samples`` values of ``question_column`` are corrected with
    the chosen model (with retries), WAR/SED are computed before and after,
    and the processed rows are written to a new Parquet file in the temp
    directory with an extra ``<question_column>_cleaned`` column.

    Args:
        file_path: Path of the uploaded Parquet file.
        question_column (str): Name of the column to denoise.
        model_choice (str): "WAC-GEC" or the DeepSeek model name.
        temperature: Sampling temperature (used by the DeepSeek path only).
        max_samples: Maximum number of rows to process.
        progress: Gradio progress callback.

    Returns:
        tuple: (log text, output file path or None, preview HTML). Errors are
        reported through the log text rather than raised.
    """
    try:
        # check_api_key only raises for the DeepSeek model.
        try:
            check_api_key(model_choice)
        except ValueError as e:
            if model_choice == "deepseek-r1-distill-llama-8b":
                return str(e), None, ""

        use_wac_gec = model_choice == "WAC-GEC"
        if use_wac_gec and not WAC_GEC_AVAILABLE:
            return "β WAC-GEC model not installed! Please install whitespace_correction package.", None, ""

        # Clicking the button without an upload should give a clear message.
        if not file_path:
            return "Please upload a Parquet file first.", None, ""

        progress(0.05, desc="π Reading data file...")
        df = pd.read_parquet(file_path)
        if question_column not in df.columns:
            available_columns = ", ".join(df.columns.tolist())
            return f"β Column '{question_column}' not found!\nAvailable columns: {available_columns}", None, ""

        data_ori = df[question_column].tolist()[:int(max_samples)]
        total = len(data_ori)

        progress(0.08, desc="π Calculating original metrics...")
        original_sentences = [str(item) for item in data_ori]
        war_original = calculate_whitespace_anomaly_rate(original_sentences)
        sed_original = calculate_spelling_error_density(original_sentences)

        progress(0.1, desc=f"π Starting denoising of {total} samples (model: {model_choice})...")
        # Only the DeepSeek path uses the "___" placeholder pre-processing.
        if use_wac_gec:
            data_corrupt = [str(item) for item in data_ori]
        else:
            data_corrupt = [process_sentence(str(item)) for item in data_ori]

        results = []
        max_retries = 5 if model_choice == "deepseek-r1-distill-llama-8b" else 3
        log_text = f"π Processing {total} samples...\n"
        log_text += f"π Using model: {model_choice}\n\n"

        for idx in range(total):
            progress((0.1 + 0.7 * idx / total), desc=f"Processing: {idx+1}/{total}")
            unprocess_text = str(data_ori[idx])
            original_text = data_corrupt[idx]

            for attempt in range(1, max_retries + 1):
                try:
                    if use_wac_gec:
                        response_content = call_wac_gec(original_text)
                        accepted = response_content.startswith('[output]:')
                    else:
                        response_content = call_deepseek_api(
                            PROMPT_TEMPLATE + original_text,
                            model=model_choice,
                            temperature=float(temperature)
                        )
                        accepted = is_valid_output(response_content, original_text, unprocess_text)
                except Exception as e:
                    log_text += f"β οΈ Sample {idx+1} error, retry {attempt}/{max_retries}: {str(e)}\n"
                    continue
                if accepted:
                    results.append(response_content)
                    break
            else:
                # Every attempt failed or was rejected: keep an error marker so
                # results stays aligned with data_ori.
                results.append(f"[ERROR] Failed to process: {original_text}")
                log_text += f"β Sample {idx+1} processing failed\n"

        progress(0.85, desc="π Post-processing...")
        lst_extracted = []
        error_count = 0
        unknown_count = 0
        for i, item in enumerate(results):
            extracted = extract_output_content(item)
            if extracted is None:
                # Unrecognized format: fall back to the untouched original.
                lst_extracted.append(str(data_ori[i]))
                unknown_count += 1
            else:
                lst_extracted.append(extracted)
                if item.startswith('[ERROR]'):
                    error_count += 1

        # For multi-line inputs only the last line was denoised (DeepSeek
        # path); put it back under the original leading lines.
        lst_final = []
        for i, raw in enumerate(data_ori):
            item = str(raw)
            tmp_lines = []
            if '\n' in item and not use_wac_gec:
                tmp_lines = [line.strip() for line in item.strip().split('\n') if line.strip()]
            if tmp_lines:
                tmp_lines[-1] = lst_extracted[i]
                lst_final.append('\n'.join(tmp_lines))
            else:
                # Single-line input, WAC-GEC, or a whitespace-only multi-line
                # cell (which has no last line to replace).
                lst_final.append(lst_extracted[i])

        progress(0.90, desc="π Calculating denoised metrics...")
        cleaned_sentences = [str(item) for item in lst_final]
        war_cleaned = calculate_whitespace_anomaly_rate(cleaned_sentences)
        sed_cleaned = calculate_spelling_error_density(cleaned_sentences)
        delta_war = war_cleaned - war_original
        delta_sed = sed_cleaned - sed_original

        progress(0.95, desc="πΎ Saving results...")
        # Only the first `total` rows were processed. Assigning a shorter list
        # to the full frame raises ValueError, so the output is limited to the
        # processed rows.
        df_cleaned = df.iloc[:total].copy()
        df_cleaned[question_column + '_cleaned'] = lst_final
        original_filename = os.path.basename(file_path)
        base_name = original_filename.replace('.parquet', '')
        model_suffix = "WAC-GEC" if use_wac_gec else "DeepSeek"
        output_filename = f"{base_name}-Denoising-{model_suffix}.parquet"
        output_path = os.path.join(tempfile.gettempdir(), output_filename)
        df_cleaned.to_parquet(output_path, index=False)

        log_text += f"\n\nπ Processing Complete!\n"
        log_text += f"{'='*50}\n"
        log_text += f"γBasic Statisticsγ\n"
        log_text += f"- Model used: {model_choice}\n"
        log_text += f"- Total samples: {total}\n"
        log_text += f"- Successfully processed: {total - error_count - unknown_count}\n"
        log_text += f"- Failed samples: {error_count}\n"
        log_text += f"- Unknown format: {unknown_count}\n"
        log_text += f"- Output file: {output_filename}\n\n"
        log_text += f"γQuality Metricsγ\n"
        log_text += f"π Whitespace Anomaly Rate (WAR):\n"
        log_text += f" Original: {war_original:.2f}% β Denoised: {war_cleaned:.2f}%\n"
        log_text += f" Change: {delta_war:+.2f}% {_describe_metric_change(delta_war)}\n\n"
        log_text += f"π Spelling Error Density (SED):\n"
        log_text += f" Original: {sed_original:.2f}% β Denoised: {sed_cleaned:.2f}%\n"
        log_text += f" Change: {delta_sed:+.2f}% {_describe_metric_change(delta_sed)}\n"
        if use_wac_gec:
            log_text += f"\nπ‘ Note: WAC-GEC uses two-step correction (GEC grammar + WAC whitespace)\n"
        log_text += f"{'='*50}\n"

        preview_html = create_comparison_html(data_ori[:5], lst_final[:5])
        progress(1.0, desc="β Complete!")
        return log_text, output_path, preview_html
    except Exception as e:
        # UI boundary: report the failure in the log box instead of raising.
        import traceback
        error_detail = traceback.format_exc()
        return f"β Processing error: {str(e)}\n\nDetailed error:\n{error_detail}", None, ""
| # ======================== Text Content ======================== | |
# Markdown shown on the "About" tab of the interface (rendered via gr.Markdown).
ABOUT_TEXT = """
## Denoising Workflow
### Supported Models
#### 1. DeepSeek-R1 (deepseek-r1-distill-llama-8b)
- **Function**: Comprehensive grammar, spelling, and whitespace error correction
- **Advantages**: Strong comprehensive capability, handles multiple error types
- **Configuration**: Requires DEEPSEEK_API_KEY in Space Settings
#### 2. WAC-GEC (Whitespace + Grammar Error Correction)
- **Function**: Two-step correction workflow
- **Step 1 (GEC)**: Use LLaMA-2-7B fine-tuned model for grammar and spelling correction
- **Step 2 (WAC)**: Use whitespace correction model for spacing issues
- **Advantages**:
- Fully local, no API key required
- Combines two specialized models
- Suitable for offline environments and limited budgets
- **Model Source**:
- GEC: [lllouo/gec_Chat-LLaMa-2-7B-FT](https://huggingface.co/lllouo/gec_Chat-LLaMa-2-7B-FT)
- WAC: whitespace_correction library
### Core Algorithm
1. **Preprocessing (process_sentence)**
- Detect sentence completeness
- Add marker `___` for incomplete sentences (DeepSeek only)
- Preserve multi-line text format
2. **Model Denoising**
- **DeepSeek**: Use API for comprehensive error correction, up to 5 retries
- **WAC-GEC**:
- First use GEC model for grammar and spelling correction
- Then use WAC model for whitespace correction
- Up to 3 retries
3. **Format Validation**
- Verify output format correctness
- Check marker preservation
- Length reasonability check
4. **Post-processing**
- Extract denoised content
- Restore original multi-line format
- Generate Parquet file with model identifier
### Supported Datasets
- **MMLU**: Multiple choice questions across 57 subjects
- **GSM8K**: Math reasoning problems
- **ARC-Challenge**: Science Q&A
- **MedMCQA**: Medical multiple choice
- **CoQA**: Conversational Q&A
- And more...
### Color Annotation Legend
- π΄ **Red**: Errors in original text (spelling, grammar, spacing, etc.)
- π’ **Green**: Corrections after denoising
- β« **Black**: Unchanged correct parts
### Tech Stack
- **LLM**: DeepSeek API (deepseek-r1-distill-llama-8b)
- **Local Models**:
- GEC: LLaMA-2-7B (fine-tuned for grammar correction)
- WAC: Whitespace Correction Model
- **Frontend**: Gradio 4.16.0
- **Data Processing**: Pandas + PyArrow (Parquet)
- **Diff Comparison**: Python difflib
- **NLP Tools**: spaCy, pyspellchecker
- **API Calls**: OpenAI SDK
- **Deployment**: Hugging Face Spaces
### Quality Metrics
- **WAR (Whitespace Anomaly Rate)**: Whitespace anomaly rate
- **SED (Spelling Error Density)**: Spelling error density
### Model Selection Guide
- **Need comprehensive denoising + API budget**: Choose DeepSeek-R1
- **Local deployment + complete correction**: Choose WAC-GEC (Recommended)
- **Only need spacing correction**: Use WAC module alone
- **Fastest speed**: Use GPU-accelerated WAC-GEC
---
**Graduate Thesis Research Showcase** | Powered by DeepSeek API & WAC-GEC
"""
| # ======================== Gradio Interface ======================== | |
# Top-level Gradio app: a leaderboard tab, a denoising demo tab, and an
# About tab. NOTE(review): the source had lost its indentation, so the
# container nesting below (which components sit in which Row/Column) is
# reconstructed from the call order -- confirm against the intended layout.
demo = gr.Blocks(title="Dataset Denoising Framework Demo System", css="""
.markdown-text { font-size: 16px; line-height: 1.6; }
""")

# Columns shown in the visible leaderboard table, in display order.
LEADERBOARD_COLUMNS = ['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']

with demo:
    gr.Markdown(
        """<div style="text-align: center;"><h1>β <span style='color: #e6b800;'>Denoising Factory</span> Based on Benchmark Denoising Framework</h1></div>
<br>
<p>This system demonstrates the denoising effects of DeepSeek-R1 and WAC-GEC methods on mainstream benchmark datasets based on <a href="https://github.com/LLLoUo/bd-toolkit" target="_blank">BD-toolkit</a>. Quality is evaluated using WAR (Whitespace Anomaly Rate) and SED (Spelling Error Density) metrics.</p>
""",
        elem_classes="markdown-text"
    )
    leaderboard_data = load_leaderboard_data()

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # ---------------- Leaderboard tab ----------------
        with gr.TabItem("π BD-benchmarks Leaderboard", id=0):
            with gr.Column():
                gr.Markdown("### Mainstream Benchmark Leaderboard After BD Denoising")
                with gr.Row():
                    search_bar = gr.Textbox(
                        placeholder="π Search benchmark name and press ENTER...",
                        show_label=False,
                        elem_id="search-bar",
                    )
                    filter_categories = gr.Radio(
                        label="π Filter by Benchmark Category",
                        choices=["all", "BT", "RA", "TG", "SU", "ME", "GR"],
                        value="all",
                        elem_id="filter-columns",
                    )
                    filter_versions = gr.Radio(
                        label="π Filter by Dataset Version",
                        choices=[
                            ("All Versions", "all"),
                            ("Original", "original"),
                            ("DeepSeek-R1-denoised", "deepseek"),
                            ("WAC-GEC", "wac_gec")
                        ],
                        value="all",
                        elem_id="filter-versions",
                    )
                leaderboard_table = gr.Dataframe(
                    value=leaderboard_data[LEADERBOARD_COLUMNS],
                    headers=['ID', 'Category', 'Benchmark', 'WAR (%)', 'SED', 'Download'],
                    # ID is an 8-character hex digest (see load_leaderboard_data),
                    # i.e. text, so it must not be declared as a number.
                    datatype=['str', 'str', 'str', 'number', 'number', 'markdown'],
                    elem_id="leaderboard-table",
                    interactive=False,
                )
                # Invisible copy of the full data that the callbacks filter from.
                hidden_leaderboard = gr.Dataframe(
                    value=leaderboard_data,
                    visible=False
                )
                search_bar.submit(
                    lambda df, query: search_leaderboard(df, query)[LEADERBOARD_COLUMNS],
                    [hidden_leaderboard, search_bar],
                    leaderboard_table
                )

                def combined_filter(df, category, version):
                    """Apply the category and version filters, then select display columns."""
                    filtered = filter_leaderboard(df, category, version)
                    return filtered[LEADERBOARD_COLUMNS]

                filter_categories.change(
                    combined_filter,
                    [hidden_leaderboard, filter_categories, filter_versions],
                    leaderboard_table
                )
                filter_versions.change(
                    combined_filter,
                    [hidden_leaderboard, filter_categories, filter_versions],
                    leaderboard_table
                )
                gr.Markdown("""
**Legend:**
- **Category**: BT=Basic Tasks, RA=Reasoning Abilities, TG=Text Generation, SU=Speech Understanding, ME=Medical, GR=Grammar
- **Version**: Original=Unprocessed dataset, DeepSeek-R1=DeepSeek denoised version, WAC-GEC=WAC-GEC denoised version
- **WAR**: Whitespace Anomaly Rate (lower is better)
- **SED**: Spelling Error Density (lower is better)
""", elem_classes="markdown-text")

        # ---------------- Denoising demo tab ----------------
        with gr.TabItem("π BD-toolkit Demo", id=2):
            gr.Markdown("## BD-toolkit Lightweight Demo")
            model_status = "β WAC-GEC: " + ("Available" if WAC_GEC_AVAILABLE else "Not Installed")
            model_status += " | β DeepSeek-R1: " + ("Configured" if DEEPSEEK_API_KEY else "API Key Not Configured")
            gr.Markdown(f"**Model Status**: {model_status}")
            with gr.Row():
                with gr.Column():
                    file_input = gr.File(
                        label="π Upload Parquet File",
                        file_types=[".parquet"]
                    )
                    question_column = gr.Textbox(
                        label="π Question Column Name",
                        value="question",
                        placeholder="e.g., question, input_text, prompt"
                    )
                    model_choice = gr.Dropdown(
                        choices=["WAC-GEC", "deepseek-r1-distill-llama-8b"],
                        value="WAC-GEC",
                        label="π€ Select Model",
                        info="DeepSeek: Comprehensive correction | WAC-GEC: Grammar + whitespace (local model)"
                    )
                    # Starts disabled because the default model is WAC-GEC.
                    temperature = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.1,
                        step=0.1,
                        label="π‘οΈ Temperature",
                        info="Only effective for DeepSeek",
                        interactive=False
                    )
                    max_samples = gr.Slider(
                        minimum=1,
                        maximum=100,
                        value=5,
                        step=1,
                        label="π Number of Samples to Process (Demo Limit)"
                    )
                    clean_btn = gr.Button("π Start Denoising", variant="primary", size="lg")
                with gr.Column():
                    output_text = gr.Textbox(
                        label="β³ Processing Progress",
                        lines=10,
                        max_lines=15
                    )
                    download_file = gr.File(label="π₯ Download Denoised Dataset")

            def update_temperature_interactive(model):
                """Enable the temperature slider only for the DeepSeek model."""
                if model == "deepseek-r1-distill-llama-8b":
                    return gr.update(interactive=True, info="Adjust generation randomness")
                else:
                    return gr.update(interactive=False, info="WAC-GEC model does not support temperature parameter")

            model_choice.change(
                fn=update_temperature_interactive,
                inputs=[model_choice],
                outputs=[temperature]
            )
            gr.Markdown("### π¨ Denoising Effect Comparison Preview")
            gr.Markdown("""
**Color Legend**:
- π΄ <span style="color: #dc3545;">Red</span> = Errors in original text
- π’ <span style="color: #28a745;">Green</span> = Corrections after denoising
- β« Black = Unchanged correct parts
""")
            colored_preview = gr.HTML(label="")
            clean_btn.click(
                fn=clean_dataset,
                inputs=[file_input, question_column, model_choice, temperature, max_samples],
                outputs=[output_text, download_file, colored_preview]
            )

        # ---------------- About tab ----------------
        with gr.TabItem("π About", id=3):
            gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")
if __name__ == "__main__":
    # Try to load the local models before serving; the return value is
    # ignored here, so a failed preload does not stop the app from starting.
    print("π Preloading WAC-GEC models...")
    initialize_wac_gec()

    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "ssr_mode": False,
    }
    demo.launch(**launch_options)