Spaces:

lllouo
/

BD_framework_test

Sleeping

App Files Files Community

BD_framework_test / app.py

lllouo

update filter

9ddd1ab about 2 months ago

raw

history blame contribute delete

35 kB

	import gradio as gr
	import json
	import pandas as pd
	import os
	from typing import Optional
	import tempfile
	import requests
	from openai import OpenAI
	import re
	import spacy
	from spellchecker import SpellChecker
	import difflib
	import torch
	from transformers import AutoTokenizer, AutoModelForCausalLM
	import hashlib

	# ======================== WAC-GEC Import ========================
	# whitespace_correction is an optional dependency: the flag below records
	# whether it could be imported so callers can check before using it.
	wac_corrector = None  # WhitespaceCorrector instance, created lazily
	try:
	    from whitespace_correction import WhitespaceCorrector
	except ImportError:
	    WAC_GEC_AVAILABLE = False
	    print("⚠️ whitespace_correction not installed, WAC-GEC functionality unavailable")
	else:
	    WAC_GEC_AVAILABLE = True

	# GEC model handles (lazy loading): both stay None until the model is loaded.
	gec_tokenizer = gec_model = None
	GEC_MODEL_NAME = "lllouo/gec_Chat-LLaMa-2-7B-FT"

	# ======================== API Configuration ========================
	# The key comes from the environment; an empty string means "not configured".
	DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
	DEEPSEEK_BASE_URL = "https://dashscope.aliyuncs.com/compatible-mode/v1"

	# ======================== NLP Tools Initialization ========================
	# Load the small English spaCy pipeline. spacy.load raises OSError when the
	# model package is missing, in which case it is downloaded once and reloaded.
	try:
	    nlp = spacy.load("en_core_web_sm")
	except OSError:
	    import subprocess
	    import sys

	    # Run the download with the interpreter that is executing this app, so the
	    # model lands in the same environment; check=True surfaces a failed
	    # download instead of failing later with a less helpful load error.
	    subprocess.run(
	        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
	        check=True,
	    )
	    nlp = spacy.load("en_core_web_sm")

	# Dictionary-based spell checker used by the spelling-error metric.
	spell = SpellChecker()

	# Regexes that each flag one kind of whitespace anomaly in a sentence.
	WHITESPACE_PATTERNS = [
	    re.compile(r'[ \t]{2,}'),          # run of two or more spaces/tabs
	    # Zero-width space or word joiner. A character class is used because an
	    # escaped pipe (\|) would match a literal "|" instead of acting as "or".
	    re.compile(r'[\u200B\u2060]'),
	    re.compile(r'\s+([.,!?;:])'),      # whitespace before punctuation
	    re.compile(r'([.,!?;:])\s{2,}'),   # two or more whitespace chars after punctuation
	]

	# ======================== Prompt Template ========================
	# Instruction prompt for the API-based denoising path. clean_dataset appends the
	# sentence to correct directly after the trailing "[input]: " marker, and
	# is_valid_output later requires the reply to start with "[output]:".
	# The text is sent to the model verbatim, so it must not be reformatted.
	PROMPT_TEMPLATE = """## Positioning
	You are a LANGUAGE grammatical error correction tool that can identify and correct grammatical errors in a text.
	Reply with a corrected version of the input sentence with all grammatical, spelling and whitespace errors fixed, making only necessary changes.
	If there are no errors, reply with a copy of the original sentence.

	## Formatting requirements
	- [Input]: The sentence should start with the identifier [input], followed by the sentence provided by the user.
	- [Output]: The sentence should start with the identifier [output], followed by the corrected sentence.
	- Just format the output as required, no need to give too much explanation.
	- You only need to output [output]: corrected sentence.

	## Input and Output Examples
	Example 1: Extra spaces and Missing spaces and Spelling errors
	[input]: This is anexample sentence with in correct spa ces and spelling erorrs.
	[output]: This is an example sentence with incorrect spaces and spelling errors.

	Example 2: No errors, reply with a copy of the original sentence, don't fill in the contents of ___.
	[input]: _______ such as bitcoin are becoming increasingly mainstream and have a whole host of associated ethical implications, for example, they are______ and more ______. However, they have also been used to engage in _______.
	[output]: _______ such as bitcoin are becoming increasingly mainstream and have a whole host of associated ethical implications, for example, they are______ and more ______. However, they have also been used to engage in _______.

	## Task
	Next, please correct the following sentence according to the above requirements.
	If there are no errors, reply with a copy of the original sentence. Don't fill in the contents of ___.
	Remember: You only need to output [output]: Corrected sentence.

	[input]: """

	# ======================== Initialize WAC + GEC ========================
	def initialize_wac_gec():
	    """Load the WAC and GEC models on first use and cache them in module globals.

	    Anything that is already loaded is left untouched, so repeated calls only
	    pay for what is still missing.

	    Returns:
	        bool: True when the whitespace corrector, the grammar model and its
	        tokenizer are all ready; False when the whitespace_correction package
	        is unavailable or either model fails to load.
	    """
	    global wac_corrector, gec_tokenizer, gec_model

	    # The WAC package is a prerequisite for the whole pipeline.
	    if not WAC_GEC_AVAILABLE:
	        print("❌ WAC module not installed")
	        return False

	    # Step 1: whitespace corrector.
	    if wac_corrector is None:
	        try:
	            wac_device = "cuda" if torch.cuda.is_available() else "cpu"
	            wac_corrector = WhitespaceCorrector.from_pretrained(
	                model="eo_larger_byte",
	                device=wac_device,
	                download_dir="./models",
	            )
	            print(f"✅ WAC whitespace correction model loaded (device: {wac_device})")
	        except Exception as load_error:
	            print(f"❌ WAC model loading failed: {load_error}")
	            return False

	    # Step 2: grammar model and tokenizer.
	    if gec_tokenizer is None or gec_model is None:
	        try:
	            on_gpu = torch.cuda.is_available()
	            gec_device = "cuda" if on_gpu else "cpu"

	            print(f"📥 Downloading GEC model from HuggingFace: {GEC_MODEL_NAME}")
	            gec_tokenizer = AutoTokenizer.from_pretrained(
	                GEC_MODEL_NAME,
	                trust_remote_code=True,
	            )
	            gec_model = AutoModelForCausalLM.from_pretrained(
	                GEC_MODEL_NAME,
	                device_map="auto" if on_gpu else None,
	                torch_dtype=torch.bfloat16 if on_gpu else torch.float32,
	                trust_remote_code=True,
	            )

	            # Without a device map the model has to be placed explicitly.
	            if not on_gpu:
	                gec_model = gec_model.to(gec_device)

	            # Reuse EOS as the padding token and pad on the left.
	            gec_tokenizer.pad_token_id = gec_tokenizer.eos_token_id
	            gec_tokenizer.padding_side = "left"

	            print(f"✅ GEC grammar correction model loaded (device: {gec_device})")

	        except Exception as load_error:
	            print(f"❌ GEC model loading failed: {load_error}")
	            return False

	    return True

	# ======================== GEC Grammar Correction Function ========================
	def correct_sentence_gec(input_sentence):
	    """Correct the grammar of one sentence with the local GEC model.

	    Args:
	        input_sentence (str): Sentence to be corrected.

	    Returns:
	        str: The text generated after the prompt, stripped of surrounding
	        whitespace and of a leading "Corrected:" label if the model repeats it.

	    Raises:
	        ValueError: If the GEC model or tokenizer has not been loaded yet.
	    """
	    if gec_model is None or gec_tokenizer is None:
	        raise ValueError("GEC model not initialized")

	    prompt = f"""Rewrite the following sentence to correct grammatical errors. Return ONLY the corrected sentence.
	Original: {input_sentence}
	Corrected:"""

	    inputs = gec_tokenizer(prompt, return_tensors="pt").to(gec_model.device)
	    prompt_token_count = inputs["input_ids"].shape[1]

	    # Use a smaller generation budget when running without a GPU.
	    is_cpu = str(gec_model.device) == "cpu" or not torch.cuda.is_available()
	    max_tokens, beams = (256, 2) if is_cpu else (512, 4)

	    with torch.no_grad():
	        outputs = gec_model.generate(
	            **inputs,
	            max_new_tokens=max_tokens,
	            num_beams=beams,
	            do_sample=False,
	            temperature=None,
	            top_p=None,
	        )

	    # The generated sequence starts with the prompt tokens. Drop them by
	    # position rather than by string replacement: the decoded text is not
	    # guaranteed to reproduce the prompt character-for-character, and a
	    # mismatch would leave the whole prompt in the "corrected" sentence.
	    generated_ids = outputs[0][prompt_token_count:]
	    corrected_text = gec_tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

	    if corrected_text.startswith("Corrected:"):
	        corrected_text = corrected_text[len("Corrected:"):].strip()

	    return corrected_text

	# ======================== WAC-GEC Combined Processing ========================
	def call_wac_gec(text):
	    """Run the two-step local correction and return it in "[output]: ..." form.

	    The text first goes through the GEC model (grammar and spelling) and the
	    result is then passed to the WAC model (whitespace).

	    Raises:
	        ValueError: If the models are not installed or fail to load.
	        Exception: If either correction step fails; the message wraps the cause.
	    """
	    models_ready = initialize_wac_gec()
	    if not models_ready:
	        raise ValueError("⚠️ WAC-GEC models not installed or failed to load")

	    try:
	        # Step 1: grammar and spelling.
	        print(f"🔍 GEC processing: {text[:50]}...")
	        grammar_fixed = correct_sentence_gec(text)
	        print(f"✅ GEC result: {grammar_fixed[:50]}...")

	        # Step 2: whitespace.
	        print(f"🔍 WAC processing: {grammar_fixed[:50]}...")
	        spacing_fixed = wac_corrector.correct_text(grammar_fixed)
	        print(f"✅ WAC result: {spacing_fixed[:50]}...")

	        return f"[output]: {spacing_fixed}"

	    except Exception as e:
	        raise Exception(f"WAC-GEC processing error: {str(e)}")

	# ======================== Color Diff Functions ========================
	def generate_colored_diff(original, cleaned):
	    """Build a word-level HTML diff of two texts.

	    Words only present in the original are rendered red (bold when replaced,
	    struck through when deleted); words only present in the cleaned text are
	    rendered green and bold; unchanged words are emitted without markup.

	    Args:
	        original (str): Text before denoising.
	        cleaned (str): Text after denoising.

	    Returns:
	        tuple[str, str]: HTML for the original text and for the cleaned text.
	    """
	    from html import escape  # local import keeps the block self-contained

	    red_bold = '<span style="color: #dc3545; font-weight: bold;">{}</span>'
	    red_strike = '<span style="color: #dc3545; text-decoration: line-through;">{}</span>'
	    green_bold = '<span style="color: #28a745; font-weight: bold;">{}</span>'

	    def render(words, template=None):
	        # The words come from dataset content, so they are always escaped
	        # before being placed into markup.
	        if template is None:
	            return [escape(word) for word in words]
	        return [template.format(escape(word)) for word in words]

	    original_words = original.split()
	    cleaned_words = cleaned.split()

	    matcher = difflib.SequenceMatcher(None, original_words, cleaned_words)

	    original_html = []
	    cleaned_html = []

	    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
	        if tag == 'equal':
	            original_html.extend(render(original_words[i1:i2]))
	            cleaned_html.extend(render(cleaned_words[j1:j2]))
	        elif tag == 'replace':
	            original_html.extend(render(original_words[i1:i2], red_bold))
	            cleaned_html.extend(render(cleaned_words[j1:j2], green_bold))
	        elif tag == 'delete':
	            original_html.extend(render(original_words[i1:i2], red_strike))
	        elif tag == 'insert':
	            cleaned_html.extend(render(cleaned_words[j1:j2], green_bold))

	    return ' '.join(original_html), ' '.join(cleaned_html)

	def create_comparison_html(original_list, cleaned_list):
	    """Render a numbered side-by-side table of original and denoised texts.

	    The two lists are paired positionally (extra items of the longer one are
	    ignored) and every pair becomes one table row holding its colored diff.

	    Returns:
	        str: An HTML fragment containing the styles and the table.
	    """
	    table_open = """
	<div style="font-family: 'Times New Roman', serif; max-width: 100%; overflow-x: auto;">
	<style>
	.comparison-table {
	width: 100%;
	border-collapse: collapse;
	margin: 20px 0;
	border: 1px solid #000;
	}
	.comparison-table th {
	background-color: #f2f2f2;
	color: #000;
	padding: 8px;
	text-align: left;
	font-weight: bold;
	border-bottom: 2px solid #000;
	}
	.comparison-table td {
	padding: 8px;
	border-bottom: 1px solid #ccc;
	line-height: 1.5;
	vertical-align: top;
	}
	.index-col {
	width: 50px;
	text-align: center;
	font-weight: bold;
	color: #555;
	}
	</style>
	<table class="comparison-table">
	<thead>
	<tr>
	<th class="index-col">#</th>
	<th>Original Question</th>
	<th>Denoised Question</th>
	</tr>
	</thead>
	<tbody>
	"""

	    table_close = """
	</tbody>
	</table>
	</div>
	"""

	    # Collect the rows and join once instead of growing one string.
	    row_chunks = []
	    for idx, (orig, clean) in enumerate(zip(original_list, cleaned_list), start=1):
	        orig_colored, clean_colored = generate_colored_diff(str(orig), str(clean))
	        row_chunks.append(f"""
	<tr>
	<td class="index-col">{idx}</td>
	<td class="original-col">{orig_colored}</td>
	<td class="cleaned-col">{clean_colored}</td>
	</tr>
	""")

	    return table_open + "".join(row_chunks) + table_close

	# ======================== Utility Functions ========================
	def check_api_key(model_choice):
	    """Ensure an API key is configured when the DeepSeek model is selected.

	    Raises:
	        ValueError: If the DeepSeek model is chosen and DEEPSEEK_API_KEY is empty.
	    """
	    needs_key = model_choice == "deepseek-r1-distill-llama-8b"
	    if needs_key and not DEEPSEEK_API_KEY:
	        raise ValueError("⚠️ Please configure DEEPSEEK_API_KEY in Space Settings!")

	def call_deepseek_api(prompt, model="deepseek-r1-distill-llama-8b", temperature=0.1, stream=True):
	    """Send a single-turn chat request and return the reply text.

	    Args:
	        prompt: Text sent as the only user message.
	        model: Model name passed to the API.
	        temperature: Sampling temperature passed to the API.
	        stream: When true, the reply is read chunk by chunk and concatenated.

	    Returns:
	        The reply content.

	    Raises:
	        ValueError: Propagated from check_api_key when the key is missing.
	    """
	    check_api_key(model)
	    client = OpenAI(api_key=DEEPSEEK_API_KEY, base_url=DEEPSEEK_BASE_URL)
	    request = {
	        "model": model,
	        "messages": [{"role": "user", "content": prompt}],
	        "temperature": temperature,
	        "stream": stream,
	    }
	    completion = client.chat.completions.create(**request)

	    if not stream:
	        return completion.choices[0].message.content

	    # Streaming: keep only chunks that actually carry text.
	    pieces = [
	        chunk.choices[0].delta.content
	        for chunk in completion
	        if chunk.choices and chunk.choices[0].delta.content
	    ]
	    return "".join(pieces)

	def process_sentence(sentence):
	    """Pick the line to denoise and mark it as incomplete when needed.

	    For text with several non-empty lines only the last one is used. A target
	    that does not end in one of . ? ! ; , gets " ___." appended.
	    """
	    stripped = sentence.strip()
	    non_empty_lines = [part.strip() for part in stripped.split('\n') if part.strip()]
	    target_line = non_empty_lines[-1] if len(non_empty_lines) > 1 else stripped

	    if target_line.endswith(('.', '?', '!', ';', ',')):
	        return target_line
	    return target_line + " ___."

	def is_valid_output(content_2, content_1, content_0):
	    """Check that a model reply is a plausible correction of its input.

	    Args:
	        content_2: Raw model reply.
	        content_1: Text that was sent to the model.
	        content_0: Text before preprocessing.

	    Returns:
	        bool: True when the reply is a single line starting with "[output]:",
	        keeps the "___" marker if either input contained it, and its corrected
	        text is within a factor of two of the input length.
	    """
	    prefix = '[output]:'
	    if not content_2.startswith(prefix) or '\n' in content_2:
	        return False
	    if ('___' in content_0 or '___' in content_1) and '___' not in content_2:
	        return False

	    # Compare the corrected text itself, not the reply including its prefix:
	    # counting the prefix made every input shorter than the prefix overhead
	    # fail the "no more than twice as long" rule.
	    corrected = content_2[len(prefix):].strip()
	    if len(corrected) > 2 * len(content_1) or len(content_1) > 2 * len(corrected):
	        return False
	    return True

	def extract_output_content(item):
	    """Return the payload of an "[output]:" or "[ERROR] Failed to process:" entry.

	    The payload is stripped, and one pair of enclosing double quotes is removed
	    when present. Entries with neither prefix yield None.
	    """
	    for prefix in ('[output]:', '[ERROR] Failed to process:'):
	        if item.startswith(prefix):
	            payload = item[len(prefix):].strip()
	            if payload.startswith('"') and payload.endswith('"'):
	                return payload[1:-1]
	            return payload
	    return None

	def has_missing_spaces(sentence):
	    """Tell whether a space-free sentence still contains several words.

	    Returns False as soon as the sentence contains a space; otherwise returns
	    True when the spaCy pipeline finds at least two alphabetic tokens in it.
	    """
	    if ' ' in sentence:
	        return False
	    alpha_count = sum(1 for token in nlp(sentence) if token.is_alpha)
	    return alpha_count >= 2

	def calculate_whitespace_anomaly_rate(sentences):
	    """Return the percentage of sentences that show a whitespace anomaly.

	    A sentence counts once if it has missing spaces or matches any pattern in
	    WHITESPACE_PATTERNS. An empty input yields 0.0.
	    """
	    if not sentences:
	        return 0.0

	    def is_anomalous(sent):
	        # The missing-space check runs first; patterns are only tried if it fails.
	        return has_missing_spaces(sent) or any(
	            pattern.search(sent) for pattern in WHITESPACE_PATTERNS
	        )

	    anomaly_count = sum(1 for sent in sentences if is_anomalous(sent))
	    return anomaly_count / len(sentences) * 100

	def normalize_tokens(text):
	    """Return the lowercased tokens of *text* that are worth spell-checking.

	    Only alphabetic tokens longer than two characters that are not written
	    entirely in upper case are kept.
	    """
	    return [
	        token.text.lower()
	        for token in nlp(text)
	        if token.is_alpha and len(token.text) > 2 and not token.text.isupper()
	    ]

	def calculate_spelling_error_density(sentences):
	    """Return the percentage of checked tokens that are misspelled.

	    Sentences flagged by has_missing_spaces are skipped, as are sentences that
	    leave no tokens after normalize_tokens. Returns 0.0 when nothing was
	    checked.
	    """
	    total_words = 0
	    total_errors = 0
	    for sent in sentences:
	        if has_missing_spaces(sent):
	            continue
	        tokens = normalize_tokens(sent)
	        if not tokens:
	            continue
	        unknown_words = spell.unknown(tokens)
	        # spell.unknown() returns the distinct unknown words. Count every
	        # occurrence so the numerator uses the same unit (tokens) as the
	        # denominator; otherwise repeated misspellings are under-counted.
	        total_errors += sum(1 for token in tokens if token in unknown_words)
	        total_words += len(tokens)
	    if total_words == 0:
	        return 0.0
	    return total_errors / total_words * 100

	# ======================== Leaderboard Data Processing ========================
	def load_leaderboard_data(json_path="leaderboard.json"):
	    """Load the leaderboard entries into a DataFrame.

	    Every entry gets an 'ID' column holding the first eight hex digits of the
	    MD5 digest of its 'Benchmark' name.

	    Args:
	        json_path: Path of the JSON file holding a list of entry objects.

	    Returns:
	        pandas.DataFrame: The entries. The columns ID, Category, Benchmark,
	        WAR, SED and Download are always present, so column selection on the
	        result cannot fail; when the file cannot be read or parsed the error
	        is printed and an empty frame with those columns is returned.
	    """
	    display_columns = ['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']
	    try:
	        with open(json_path, 'r', encoding='utf-8') as f:
	            data = json.load(f)

	        for item in data:
	            benchmark = item['Benchmark']
	            hash_object = hashlib.md5(benchmark.encode())
	            item['ID'] = hash_object.hexdigest()[:8]

	        frame = pd.DataFrame(data)
	    except Exception as e:
	        print(f"Error loading leaderboard: {e}")
	        return pd.DataFrame(columns=display_columns)

	    # An empty list or entries lacking a field would otherwise produce a
	    # frame without the expected columns.
	    for column in display_columns:
	        if column not in frame.columns:
	            frame[column] = None
	    return frame

	def filter_leaderboard(df, category_query, version_query):
	    """Filter leaderboard rows by category and by dataset version.

	    Args:
	        df: Leaderboard frame with 'Category' and 'Benchmark' columns.
	        category_query: Exact category to keep, or "all" for no category filter.
	        version_query: "original", "deepseek" or "wac_gec" to keep benchmarks
	            whose name contains the matching marker (case-insensitive); any
	            other value, including "all", applies no version filter.

	    Returns:
	        A filtered copy of *df*.
	    """
	    # Marker looked for inside the benchmark name for each version choice.
	    version_markers = {
	        "original": '_original',
	        "deepseek": 'deepseek_r1_denoising',
	        "wac_gec": 'wac_gec',
	    }

	    result = df.copy()

	    if category_query != "all":
	        result = result[result['Category'] == category_query]

	    marker = version_markers.get(version_query)
	    if marker is not None:
	        result = result[result['Benchmark'].str.contains(marker, case=False, na=False)]

	    return result

	def search_leaderboard(df, query):
	    """Return the rows whose 'Benchmark' name contains *query*.

	    The match is case-insensitive and literal: the query comes straight from a
	    search box, so characters such as "(" or "[" must not be interpreted as a
	    regular expression (they would raise a pattern error). An empty query
	    returns *df* unchanged.
	    """
	    if not query:
	        return df
	    matches = df['Benchmark'].str.contains(query, case=False, na=False, regex=False)
	    return df[matches]

	# ======================== Dataset Denoising Function ========================
	def clean_dataset(file_path, question_column, model_choice, temperature, max_samples, progress=gr.Progress()):
	    """Denoise one text column of a Parquet file and report quality metrics.

	    Args:
	        file_path: Path of the uploaded Parquet file.
	        question_column: Name of the column whose text is denoised.
	        model_choice: "WAC-GEC" for the local pipeline; any other value is
	            passed to the chat API as the model name.
	        temperature: Sampling temperature forwarded to the API (not used by
	            the WAC-GEC path).
	        max_samples: Upper bound on the number of leading rows to process.
	        progress: Progress callback, called with a fraction and a description.

	    Returns:
	        tuple: (log text, path of the written Parquet file, HTML preview of
	        the first five rows). On any failure the path is None, the preview is
	        "" and the log text carries the error; nothing is raised.
	    """
	    uses_deepseek = model_choice == "deepseek-r1-distill-llama-8b"
	    uses_wac_gec = model_choice == "WAC-GEC"

	    def describe_change(delta):
	        # A delta of exactly zero is neither an improvement nor an increase.
	        if delta < 0:
	            return '✅ Improved'
	        if delta > 0:
	            return '⚠️ Increased'
	        return '➖ Unchanged'

	    try:
	        # Report configuration problems before touching the file.
	        try:
	            check_api_key(model_choice)
	        except ValueError as e:
	            if uses_deepseek:
	                return str(e), None, ""

	        if uses_wac_gec and not WAC_GEC_AVAILABLE:
	            return "❌ WAC-GEC model not installed! Please install whitespace_correction package.", None, ""

	        if not file_path:
	            return "❌ Please upload a Parquet file first.", None, ""

	        progress(0.05, desc="📁 Reading data file...")
	        df = pd.read_parquet(file_path)

	        if question_column not in df.columns:
	            available_columns = ", ".join(str(col) for col in df.columns.tolist())
	            return f"❌ Column '{question_column}' not found!\nAvailable columns: {available_columns}", None, ""

	        data_ori = df[question_column].tolist()[:int(max_samples)]
	        total = len(data_ori)

	        progress(0.08, desc="📊 Calculating original metrics...")
	        original_sentences = [str(item) for item in data_ori]
	        war_original = calculate_whitespace_anomaly_rate(original_sentences)
	        sed_original = calculate_spelling_error_density(original_sentences)

	        progress(0.1, desc=f"🚀 Starting denoising of {total} samples (model: {model_choice})...")

	        # WAC-GEC receives the raw text; the API path receives the last line,
	        # with a blank marker appended when it looks incomplete.
	        if uses_wac_gec:
	            data_corrupt = list(original_sentences)
	        else:
	            data_corrupt = [process_sentence(text) for text in original_sentences]

	        results = []
	        max_retries = 5 if uses_deepseek else 3
	        log_text = f"🚀 Processing {total} samples...\n"
	        log_text += f"📌 Using model: {model_choice}\n\n"

	        for idx, (unprocess_text, original_text) in enumerate(zip(original_sentences, data_corrupt)):
	            progress((0.1 + 0.7 * idx / total), desc=f"Processing: {idx+1}/{total}")

	            for attempt in range(1, max_retries + 1):
	                try:
	                    if uses_wac_gec:
	                        response_content = call_wac_gec(original_text)
	                        accepted = response_content.startswith('[output]:')
	                    else:
	                        response_content = call_deepseek_api(
	                            PROMPT_TEMPLATE + original_text,
	                            model=model_choice,
	                            temperature=float(temperature)
	                        )
	                        accepted = is_valid_output(response_content, original_text, unprocess_text)
	                except Exception as e:
	                    log_text += f"⚠️ Sample {idx+1} error, retry {attempt}/{max_retries}: {str(e)}\n"
	                    continue

	                if accepted:
	                    results.append(response_content)
	                    break
	            else:
	                # Every attempt failed or was rejected: record a placeholder so
	                # results stays aligned with the input rows.
	                results.append(f"[ERROR] Failed to process: {original_text}")
	                log_text += f"❌ Sample {idx+1} processing failed\n"

	        progress(0.85, desc="📊 Post-processing...")

	        lst_extracted = []
	        error_count = 0
	        unknown_count = 0

	        for raw_text, item in zip(original_sentences, results):
	            extracted = extract_output_content(item)
	            if extracted is None:
	                # Unrecognised reply format: fall back to the original text.
	                lst_extracted.append(raw_text)
	                unknown_count += 1
	            else:
	                lst_extracted.append(extracted)
	                if item.startswith('[ERROR]'):
	                    error_count += 1

	        # On the API path only the last line was denoised, so put it back under
	        # the untouched leading lines of multi-line inputs.
	        lst_final = []
	        for raw_text, extracted in zip(original_sentences, lst_extracted):
	            if '\n' in raw_text and not uses_wac_gec:
	                tmp_lines = [line.strip() for line in raw_text.strip().split('\n') if line.strip()]
	                if tmp_lines:
	                    tmp_lines[-1] = extracted
	                else:
	                    # Whitespace-only input has no line to replace.
	                    tmp_lines = [extracted]
	                lst_final.append('\n'.join(tmp_lines))
	            else:
	                lst_final.append(extracted)

	        progress(0.90, desc="📊 Calculating denoised metrics...")
	        cleaned_sentences = [str(item) for item in lst_final]
	        war_cleaned = calculate_whitespace_anomaly_rate(cleaned_sentences)
	        sed_cleaned = calculate_spelling_error_density(cleaned_sentences)

	        delta_war = war_cleaned - war_original
	        delta_sed = sed_cleaned - sed_original

	        progress(0.95, desc="💾 Saving results...")

	        # Only the first `total` rows were processed. Restrict the output to
	        # them so the new column has exactly one value per row; assigning a
	        # shorter list to the full frame raises a length-mismatch error.
	        df_cleaned = df.iloc[:total].copy()
	        df_cleaned[question_column + '_cleaned'] = lst_final

	        original_filename = os.path.basename(file_path)
	        # splitext removes only the trailing extension, never a match mid-name.
	        base_name = os.path.splitext(original_filename)[0]
	        model_suffix = "WAC-GEC" if uses_wac_gec else "DeepSeek"
	        output_filename = f"{base_name}-Denoising-{model_suffix}.parquet"
	        output_path = os.path.join(tempfile.gettempdir(), output_filename)

	        df_cleaned.to_parquet(output_path, index=False)

	        log_text += f"\n\n📊 Processing Complete!\n"
	        log_text += f"{'='*50}\n"
	        log_text += f"【Basic Statistics】\n"
	        log_text += f"- Model used: {model_choice}\n"
	        log_text += f"- Total samples: {total}\n"
	        log_text += f"- Successfully processed: {total - error_count - unknown_count}\n"
	        log_text += f"- Failed samples: {error_count}\n"
	        log_text += f"- Unknown format: {unknown_count}\n"
	        log_text += f"- Output file: {output_filename}\n\n"

	        log_text += f"【Quality Metrics】\n"
	        log_text += f"📍 Whitespace Anomaly Rate (WAR):\n"
	        log_text += f" Original: {war_original:.2f}% → Denoised: {war_cleaned:.2f}%\n"
	        log_text += f" Change: {delta_war:+.2f}% {describe_change(delta_war)}\n\n"

	        log_text += f"📍 Spelling Error Density (SED):\n"
	        log_text += f" Original: {sed_original:.2f}% → Denoised: {sed_cleaned:.2f}%\n"
	        log_text += f" Change: {delta_sed:+.2f}% {describe_change(delta_sed)}\n"

	        if uses_wac_gec:
	            log_text += f"\n💡 Note: WAC-GEC uses two-step correction (GEC grammar + WAC whitespace)\n"

	        log_text += f"{'='*50}\n"

	        preview_html = create_comparison_html(data_ori[:5], lst_final[:5])

	        progress(1.0, desc="✅ Complete!")

	        return log_text, output_path, preview_html

	    except Exception as e:
	        # Top-level boundary for the UI callback: show the failure in the log
	        # box instead of letting the exception escape.
	        import traceback
	        error_detail = traceback.format_exc()
	        return f"❌ Processing error: {str(e)}\n\nDetailed error:\n{error_detail}", None, ""

	# ======================== Text Content ========================
	# Markdown shown on the "About" tab. The footer uses a plain "|" separator:
	# "\|" is not a valid escape sequence in a regular Python string literal.
	ABOUT_TEXT = """
	## Denoising Workflow

	### Supported Models

	#### 1. DeepSeek-R1 (deepseek-r1-distill-llama-8b)
	- Function: Comprehensive grammar, spelling, and whitespace error correction
	- Advantages: Strong comprehensive capability, handles multiple error types
	- Configuration: Requires DEEPSEEK_API_KEY in Space Settings

	#### 2. WAC-GEC (Whitespace + Grammar Error Correction)
	- Function: Two-step correction workflow
	- Step 1 (GEC): Use LLaMA-2-7B fine-tuned model for grammar and spelling correction
	- Step 2 (WAC): Use whitespace correction model for spacing issues
	- Advantages:
	- Fully local, no API key required
	- Combines two specialized models
	- Suitable for offline environments and limited budgets
	- Model Source:
	- GEC: [lllouo/gec_Chat-LLaMa-2-7B-FT](https://huggingface.co/lllouo/gec_Chat-LLaMa-2-7B-FT)
	- WAC: whitespace_correction library

	### Core Algorithm

	1. Preprocessing (process_sentence)
	- Detect sentence completeness
	- Add marker `___` for incomplete sentences (DeepSeek only)
	- Preserve multi-line text format

	2. Model Denoising
	- DeepSeek: Use API for comprehensive error correction, up to 5 retries
	- WAC-GEC:
	- First use GEC model for grammar and spelling correction
	- Then use WAC model for whitespace correction
	- Up to 3 retries

	3. Format Validation
	- Verify output format correctness
	- Check marker preservation
	- Length reasonability check

	4. Post-processing
	- Extract denoised content
	- Restore original multi-line format
	- Generate Parquet file with model identifier

	### Supported Datasets

	- MMLU: Multiple choice questions across 57 subjects
	- GSM8K: Math reasoning problems
	- ARC-Challenge: Science Q&A
	- MedMCQA: Medical multiple choice
	- CoQA: Conversational Q&A
	- And more...

	### Color Annotation Legend

	- 🔴 Red: Errors in original text (spelling, grammar, spacing, etc.)
	- 🟢 Green: Corrections after denoising
	- ⚫ Black: Unchanged correct parts

	### Tech Stack

	- LLM: DeepSeek API (deepseek-r1-distill-llama-8b)
	- Local Models:
	- GEC: LLaMA-2-7B (fine-tuned for grammar correction)
	- WAC: Whitespace Correction Model
	- Frontend: Gradio 4.16.0
	- Data Processing: Pandas + PyArrow (Parquet)
	- Diff Comparison: Python difflib
	- NLP Tools: spaCy, pyspellchecker
	- API Calls: OpenAI SDK
	- Deployment: Hugging Face Spaces

	### Quality Metrics

	- WAR (Whitespace Anomaly Rate): Whitespace anomaly rate
	- SED (Spelling Error Density): Spelling error density

	### Model Selection Guide

	- Need comprehensive denoising + API budget: Choose DeepSeek-R1
	- Local deployment + complete correction: Choose WAC-GEC (Recommended)
	- Only need spacing correction: Use WAC module alone
	- Fastest speed: Use GPU-accelerated WAC-GEC

	---

	Graduate Thesis Research Showcase | Powered by DeepSeek API & WAC-GEC
	"""

	# ======================== Gradio Interface ========================
	# Top-level Blocks container; the CSS rule applies to components created with
	# elem_classes="markdown-text".
	demo = gr.Blocks(title="Dataset Denoising Framework Demo System", css="""
	.markdown-text { font-size: 16px; line-height: 1.6; }
	""")

	# Everything below builds the UI inside `demo`: a leaderboard tab, a denoising
	# demo tab and an about tab.
	with demo:
	gr.Markdown(
	"""<div style="text-align: center;"><h1>⭐ <span style='color: #e6b800;'>Denoising Factory</span> Based on Benchmark Denoising Framework</h1></div>
	<br>
	<p>This system demonstrates the denoising effects of DeepSeek-R1 and WAC-GEC methods on mainstream benchmark datasets based on <a href="https://github.com/LLLoUo/bd-toolkit" target="_blank">BD-toolkit</a>. Quality is evaluated using WAR (Whitespace Anomaly Rate) and SED (Spelling Error Density) metrics.</p>
	""",
	elem_classes="markdown-text"
	)

	# The leaderboard file is read once, while the interface is being built.
	leaderboard_data = load_leaderboard_data()

	with gr.Tabs(elem_classes="tab-buttons") as tabs:
	# ---- Tab: leaderboard ----
	with gr.TabItem("📊 BD-benchmarks Leaderboard", id=0):
	with gr.Column():
	gr.Markdown("### Mainstream Benchmark Leaderboard After BD Denoising")

	with gr.Row():
	search_bar = gr.Textbox(
	placeholder="🔍 Search benchmark name and press ENTER...",
	show_label=False,
	elem_id="search-bar",
	)
	filter_categories = gr.Radio(
	label="📂 Filter by Benchmark Category",
	choices=["all", "BT", "RA", "TG", "SU", "ME", "GR"],
	value="all",
	elem_id="filter-columns",
	)
	# Choices are (label, value) pairs; the values are the version_query
	# strings that filter_leaderboard compares against.
	filter_versions = gr.Radio(
	label="🔖 Filter by Dataset Version",
	choices=[
	("All Versions", "all"),
	("Original", "original"),
	("DeepSeek-R1-denoised", "deepseek"),
	("WAC-GEC", "wac_gec")
	],
	value="all",
	elem_id="filter-versions",
	)

	# Visible table showing a fixed subset of columns.
	# NOTE(review): this selection requires the loaded frame to contain all six
	# columns — confirm the failure path of load_leaderboard_data provides them.
	# NOTE(review): 'ID' is declared as 'number' here, while load_leaderboard_data
	# fills it with a hex string — confirm the intended datatype.
	leaderboard_table = gr.Dataframe(
	value=leaderboard_data[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
	headers=['ID', 'Category', 'Benchmark', 'WAR (%)', 'SED', 'Download'],
	datatype=['number', 'str', 'str', 'number', 'number', 'markdown'],
	elem_id="leaderboard-table",
	interactive=False,
	)

	# Invisible copy of the full data; the callbacks below take it as input so
	# every search/filter starts from the complete set.
	hidden_leaderboard = gr.Dataframe(
	value=leaderboard_data,
	visible=False
	)

	# Pressing ENTER in the search box applies the text query only; the radio
	# selections are not inputs of this callback.
	search_bar.submit(
	lambda df, query: search_leaderboard(df, query)[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']],
	[hidden_leaderboard, search_bar],
	leaderboard_table
	)

	# Shared callback for both radios: applies category and version together and
	# returns the displayed column subset. The search text is not an input.
	def combined_filter(df, category, version):
	filtered = filter_leaderboard(df, category, version)
	return filtered[['ID', 'Category', 'Benchmark', 'WAR', 'SED', 'Download']]

	filter_categories.change(
	combined_filter,
	[hidden_leaderboard, filter_categories, filter_versions],
	leaderboard_table
	)

	filter_versions.change(
	combined_filter,
	[hidden_leaderboard, filter_categories, filter_versions],
	leaderboard_table
	)

	gr.Markdown("""
	Legend:
	- Category: BT=Basic Tasks, RA=Reasoning Abilities, TG=Text Generation, SU=Speech Understanding, ME=Medical, GR=Grammar
	- Version: Original=Unprocessed dataset, DeepSeek-R1=DeepSeek denoised version, WAC-GEC=WAC-GEC denoised version
	- WAR: Whitespace Anomaly Rate (lower is better)
	- SED: Spelling Error Density (lower is better)
	""", elem_classes="markdown-text")



	# ---- Tab: denoising demo ----
	with gr.TabItem("🚀 BD-toolkit Demo", id=2):
	gr.Markdown("## BD-toolkit Lightweight Demo")

	# Status line computed once at build time from the import flag and the
	# environment key.
	# NOTE(review): "\|" is not a recognised Python escape, so the backslash
	# stays in the string — confirm this is intended.
	model_status = "✅ WAC-GEC: " + ("Available" if WAC_GEC_AVAILABLE else "Not Installed")
	model_status += " \| ✅ DeepSeek-R1: " + ("Configured" if DEEPSEEK_API_KEY else "API Key Not Configured")
	gr.Markdown(f"Model Status: {model_status}")

	with gr.Row():
	with gr.Column():
	file_input = gr.File(
	label="📁 Upload Parquet File",
	file_types=[".parquet"]
	)

	question_column = gr.Textbox(
	label="📝 Question Column Name",
	value="question",
	placeholder="e.g., question, input_text, prompt"
	)

	# NOTE(review): same "\|" sequence as above inside the info text — confirm.
	model_choice = gr.Dropdown(
	choices=["WAC-GEC", "deepseek-r1-distill-llama-8b"],
	value="WAC-GEC",
	label="🤖 Select Model",
	info="DeepSeek: Comprehensive correction \| WAC-GEC: Grammar + whitespace (local model)"
	)

	# Starts disabled because the default model is WAC-GEC; the change handler
	# further down enables it when the DeepSeek model is selected.
	temperature = gr.Slider(
	minimum=0.0,
	maximum=1.0,
	value=0.1,
	step=0.1,
	label="🌡️ Temperature",
	info="Only effective for DeepSeek",
	interactive=False
	)

	max_samples = gr.Slider(
	minimum=1,
	maximum=100,
	value=5,
	step=1,
	label="📊 Number of Samples to Process (Demo Limit)"
	)

	clean_btn = gr.Button("🚀 Start Denoising", variant="primary", size="lg")

	with gr.Column():
	output_text = gr.Textbox(
	label="⏳ Processing Progress",
	lines=10,
	max_lines=15
	)

	download_file = gr.File(label="📥 Download Denoised Dataset")

	# Enable the temperature slider only for the DeepSeek model and swap its
	# help text accordingly.
	def update_temperature_interactive(model):
	if model == "deepseek-r1-distill-llama-8b":
	return gr.update(interactive=True, info="Adjust generation randomness")
	else:
	return gr.update(interactive=False, info="WAC-GEC model does not support temperature parameter")

	model_choice.change(
	fn=update_temperature_interactive,
	inputs=[model_choice],
	outputs=[temperature]
	)

	gr.Markdown("### 🎨 Denoising Effect Comparison Preview")
	gr.Markdown("""
	Color Legend:
	- 🔴 <span style="color: #dc3545;">Red</span> = Errors in original text
	- 🟢 <span style="color: #28a745;">Green</span> = Corrections after denoising
	- ⚫ Black = Unchanged correct parts
	""")

	colored_preview = gr.HTML(label="")

	# The three outputs receive clean_dataset's return tuple in order:
	# log text, output file path, preview HTML.
	clean_btn.click(
	fn=clean_dataset,
	inputs=[file_input, question_column, model_choice, temperature, max_samples],
	outputs=[output_text, download_file, colored_preview]
	)

	# ---- Tab: about ----
	with gr.TabItem("📝 About", id=3):
	gr.Markdown(ABOUT_TEXT, elem_classes="markdown-text")

	if __name__ == "__main__":
	    # Load the local models before serving so the first request does not pay
	    # for it; the result is not checked here, so the app starts either way.
	    print("🚀 Preloading WAC-GEC models...")
	    initialize_wac_gec()

	    # Listen on all interfaces on port 7860.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "ssr_mode": False,
	    }
	    demo.launch(**launch_options)