Spaces:
Sleeping
Sleeping
Switch to requests library for API calls (stable version)
Browse files- app.py +51 -33
- requirements.txt +1 -1
app.py
CHANGED
|
@@ -1,24 +1,44 @@
|
|
| 1 |
-
# app.py -
|
| 2 |
import gradio as gr
|
| 3 |
import json
|
| 4 |
import pandas as pd
|
| 5 |
import os
|
| 6 |
from typing import Optional
|
| 7 |
import tempfile
|
|
|
|
| 8 |
|
| 9 |
-
#
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
if not
|
| 16 |
raise ValueError("⚠️ 请在 Space Settings 中配置 DEEPSEEK_API_KEY!\n\n前往:Settings → Repository secrets → New secret")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
# 预置的Leaderboard数据
|
| 24 |
LEADERBOARD_DATA = [
|
|
@@ -134,9 +154,9 @@ def extract_output_content(item):
|
|
| 134 |
def clean_dataset(file_path, question_column, model_choice, temperature, max_samples, progress=gr.Progress()):
|
| 135 |
"""清洗数据集的核心函数"""
|
| 136 |
try:
|
| 137 |
-
#
|
| 138 |
try:
|
| 139 |
-
|
| 140 |
except ValueError as e:
|
| 141 |
return str(e), None, None
|
| 142 |
|
|
@@ -156,7 +176,7 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
|
|
| 156 |
progress(0.1, desc=f"🚀 开始清洗 {total} 个样本...")
|
| 157 |
|
| 158 |
# 预处理:添加标记
|
| 159 |
-
data_corrupt = [process_sentence(item) for item in data_ori]
|
| 160 |
|
| 161 |
# 清洗结果
|
| 162 |
results = []
|
|
@@ -167,25 +187,20 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
|
|
| 167 |
for idx in range(total):
|
| 168 |
progress((0.1 + 0.8 * idx / total), desc=f"处理中: {idx+1}/{total}")
|
| 169 |
|
| 170 |
-
unprocess_text = data_ori[idx]
|
| 171 |
original_text = data_corrupt[idx]
|
| 172 |
response_content = ""
|
| 173 |
retry_count = 0
|
| 174 |
|
| 175 |
while retry_count < max_retries:
|
| 176 |
try:
|
| 177 |
-
|
|
|
|
|
|
|
| 178 |
model=model_choice,
|
| 179 |
-
messages=[{"role": "user", "content": PROMPT_TEMPLATE + original_text}],
|
| 180 |
-
stream=True,
|
| 181 |
temperature=float(temperature)
|
| 182 |
)
|
| 183 |
|
| 184 |
-
response_content = ""
|
| 185 |
-
for chunk in completion:
|
| 186 |
-
if chunk.choices and chunk.choices[0].delta.content:
|
| 187 |
-
response_content += chunk.choices[0].delta.content
|
| 188 |
-
|
| 189 |
# 验证输出格式
|
| 190 |
if is_valid_output(response_content, original_text, unprocess_text):
|
| 191 |
results.append(response_content)
|
|
@@ -193,10 +208,11 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
|
|
| 193 |
break
|
| 194 |
else:
|
| 195 |
retry_count += 1
|
|
|
|
| 196 |
|
| 197 |
except Exception as e:
|
| 198 |
retry_count += 1
|
| 199 |
-
log_text += f"⚠️ 样本 {idx+1} 重试 {retry_count}/{max_retries}: {str(e)}\n"
|
| 200 |
else:
|
| 201 |
# 重试次数用尽
|
| 202 |
results.append(f"[ERROR] Failed to process: {original_text}")
|
|
@@ -212,7 +228,7 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
|
|
| 212 |
for i, item in enumerate(results):
|
| 213 |
extracted = extract_output_content(item)
|
| 214 |
if extracted is None:
|
| 215 |
-
lst_extracted.append(data_ori[i])
|
| 216 |
unknown_count += 1
|
| 217 |
else:
|
| 218 |
lst_extracted.append(extracted)
|
|
@@ -222,7 +238,7 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
|
|
| 222 |
# 恢复多行格式
|
| 223 |
lst_final = []
|
| 224 |
for i in range(len(data_ori)):
|
| 225 |
-
item = data_ori[i]
|
| 226 |
if '\n' in item:
|
| 227 |
tmp_lines = [line.strip() for line in item.strip().split('\n') if line.strip()]
|
| 228 |
tmp_lines[-1] = lst_extracted[i]
|
|
@@ -255,8 +271,8 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
|
|
| 255 |
|
| 256 |
# 生成预览数据
|
| 257 |
preview_df = pd.DataFrame({
|
| 258 |
-
'原始问题': data_ori[:10],
|
| 259 |
-
'清洗后问题': lst_final[:10]
|
| 260 |
})
|
| 261 |
|
| 262 |
progress(1.0, desc="✅ 完成!")
|
|
@@ -264,7 +280,9 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
|
|
| 264 |
return log_text, output_path, preview_df
|
| 265 |
|
| 266 |
except Exception as e:
|
| 267 |
-
|
|
|
|
|
|
|
| 268 |
|
| 269 |
def show_leaderboard():
|
| 270 |
"""显示Leaderboard"""
|
|
@@ -392,7 +410,7 @@ with gr.Blocks(title="数据集清洗框架展示系统") as demo:
|
|
| 392 |
2. **LLM清洗**
|
| 393 |
- 使用 DeepSeek API 进行语法、拼写、空格错误修正
|
| 394 |
- 重试机制:最多重试3次
|
| 395 |
-
-
|
| 396 |
|
| 397 |
3. **格式验证 (is_valid_output)**
|
| 398 |
- 验证输出格式正确性
|
|
@@ -418,6 +436,7 @@ with gr.Blocks(title="数据集清洗框架展示系统") as demo:
|
|
| 418 |
- **LLM**: DeepSeek API (deepseek-chat / deepseek-coder)
|
| 419 |
- **前端**: Gradio 4.16.0
|
| 420 |
- **数据处理**: Pandas + PyArrow (Parquet)
|
|
|
|
| 421 |
- **部署**: Hugging Face Spaces
|
| 422 |
|
| 423 |
### 研究成果
|
|
@@ -438,12 +457,11 @@ with gr.Blocks(title="数据集清洗框架展示系统") as demo:
|
|
| 438 |
- Demo版本限制最多处理100个样本
|
| 439 |
- 完整版本可处理数万样本
|
| 440 |
- 建议 temperature=0.1 以获得稳定输出
|
|
|
|
| 441 |
|
| 442 |
---
|
| 443 |
|
| 444 |
**研究生毕业论文成果展示** | Powered by DeepSeek API
|
| 445 |
-
|
| 446 |
-
GitHub: [添加你的项目链接]
|
| 447 |
""")
|
| 448 |
|
| 449 |
# 启动应用
|
|
|
|
| 1 |
+
# app.py - 使用 requests 调用 DeepSeek API(稳定版本)
|
| 2 |
import gradio as gr
|
| 3 |
import json
|
| 4 |
import pandas as pd
|
| 5 |
import os
|
| 6 |
from typing import Optional
|
| 7 |
import tempfile
|
| 8 |
+
import requests
|
| 9 |
|
| 10 |
+
# DeepSeek API configuration.
# The key is read once, at import time, from the DEEPSEEK_API_KEY environment
# variable (empty string when unset). Surrounding whitespace is stripped
# because the value is interpolated verbatim into the "Authorization" header;
# a secret pasted with a trailing newline or space would otherwise corrupt it.
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "").strip()
# Full URL of the chat-completions endpoint that requests are POSTed to.
DEEPSEEK_BASE_URL = "https://api.deepseek.com/v1/chat/completions"
|
| 13 |
+
|
| 14 |
+
def check_api_key():
    """Ensure a DeepSeek API key is configured.

    Reads the module-level ``DEEPSEEK_API_KEY`` and returns ``None`` when it
    holds a usable value.

    Raises:
        ValueError: If ``DEEPSEEK_API_KEY`` is empty or contains only
            whitespace. The message tells the user where to add the secret
            in the Space settings.
    """
    # A whitespace-only value is truthy but can never authenticate, so it is
    # treated the same as a missing key.
    if not DEEPSEEK_API_KEY or not DEEPSEEK_API_KEY.strip():
        raise ValueError("⚠️ 请在 Space Settings 中配置 DEEPSEEK_API_KEY!\n\n前往:Settings → Repository secrets → New secret")
|
| 18 |
+
|
| 19 |
+
def call_deepseek_api(prompt, model="deepseek-chat", temperature=0.1):
    """Send one single-turn chat-completion request to DeepSeek via ``requests``.

    Args:
        prompt: Text sent as the only ``user`` message.
        model: Model identifier placed in the request body.
        temperature: Sampling temperature placed in the request body.

    Returns:
        The ``content`` string of the first choice's message.

    Raises:
        ValueError: Propagated from ``check_api_key()`` when no key is set.
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status (via ``raise_for_status``).
        RuntimeError: If the response body is not valid JSON or does not
            have the expected ``choices[0].message.content`` string.
    """
    check_api_key()

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
    }
    payload = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": temperature,
        # Non-streaming keeps parsing simple: the whole reply is one JSON body.
        "stream": False,
    }

    response = requests.post(
        DEEPSEEK_BASE_URL, headers=headers, json=payload, timeout=60
    )
    response.raise_for_status()

    try:
        result = response.json()
    except ValueError as exc:
        # Catches JSON decode errors, which are ValueError subclasses.
        raise RuntimeError("DeepSeek API returned a non-JSON response") from exc

    return _extract_message_content(result)


def _extract_message_content(result):
    """Return ``result['choices'][0]['message']['content']`` or raise RuntimeError."""
    try:
        content = result["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        # Include a bounded preview so the caller's retry log shows what came
        # back instead of a bare missing-key name.
        raise RuntimeError(
            f"Unexpected DeepSeek API response format: {str(result)[:200]}"
        ) from exc
    if not isinstance(content, str):
        raise RuntimeError(
            f"DeepSeek API returned non-text content: {content!r}"
        )
    return content
|
| 42 |
|
| 43 |
# 预置的Leaderboard数据
|
| 44 |
LEADERBOARD_DATA = [
|
|
|
|
| 154 |
def clean_dataset(file_path, question_column, model_choice, temperature, max_samples, progress=gr.Progress()):
|
| 155 |
"""清洗数据集的核心函数"""
|
| 156 |
try:
|
| 157 |
+
# 检查 API Key
|
| 158 |
try:
|
| 159 |
+
check_api_key()
|
| 160 |
except ValueError as e:
|
| 161 |
return str(e), None, None
|
| 162 |
|
|
|
|
| 176 |
progress(0.1, desc=f"🚀 开始清洗 {total} 个样本...")
|
| 177 |
|
| 178 |
# 预处理:添加标记
|
| 179 |
+
data_corrupt = [process_sentence(str(item)) for item in data_ori]
|
| 180 |
|
| 181 |
# 清洗结果
|
| 182 |
results = []
|
|
|
|
| 187 |
for idx in range(total):
|
| 188 |
progress((0.1 + 0.8 * idx / total), desc=f"处理中: {idx+1}/{total}")
|
| 189 |
|
| 190 |
+
unprocess_text = str(data_ori[idx])
|
| 191 |
original_text = data_corrupt[idx]
|
| 192 |
response_content = ""
|
| 193 |
retry_count = 0
|
| 194 |
|
| 195 |
while retry_count < max_retries:
|
| 196 |
try:
|
| 197 |
+
# 调用 DeepSeek API
|
| 198 |
+
response_content = call_deepseek_api(
|
| 199 |
+
PROMPT_TEMPLATE + original_text,
|
| 200 |
model=model_choice,
|
|
|
|
|
|
|
| 201 |
temperature=float(temperature)
|
| 202 |
)
|
| 203 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
# 验证输出格式
|
| 205 |
if is_valid_output(response_content, original_text, unprocess_text):
|
| 206 |
results.append(response_content)
|
|
|
|
| 208 |
break
|
| 209 |
else:
|
| 210 |
retry_count += 1
|
| 211 |
+
log_text += f"⚠️ 样本 {idx+1} 格式验证失败,重试 {retry_count}/{max_retries}\n"
|
| 212 |
|
| 213 |
except Exception as e:
|
| 214 |
retry_count += 1
|
| 215 |
+
log_text += f"⚠️ 样本 {idx+1} API错误,重试 {retry_count}/{max_retries}: {str(e)}\n"
|
| 216 |
else:
|
| 217 |
# 重试次数用尽
|
| 218 |
results.append(f"[ERROR] Failed to process: {original_text}")
|
|
|
|
| 228 |
for i, item in enumerate(results):
|
| 229 |
extracted = extract_output_content(item)
|
| 230 |
if extracted is None:
|
| 231 |
+
lst_extracted.append(str(data_ori[i]))
|
| 232 |
unknown_count += 1
|
| 233 |
else:
|
| 234 |
lst_extracted.append(extracted)
|
|
|
|
| 238 |
# 恢复多行格式
|
| 239 |
lst_final = []
|
| 240 |
for i in range(len(data_ori)):
|
| 241 |
+
item = str(data_ori[i])
|
| 242 |
if '\n' in item:
|
| 243 |
tmp_lines = [line.strip() for line in item.strip().split('\n') if line.strip()]
|
| 244 |
tmp_lines[-1] = lst_extracted[i]
|
|
|
|
| 271 |
|
| 272 |
# 生成预览数据
|
| 273 |
preview_df = pd.DataFrame({
|
| 274 |
+
'原始问题': [str(x)[:100] for x in data_ori[:10]],
|
| 275 |
+
'清洗后问题': [str(x)[:100] for x in lst_final[:10]]
|
| 276 |
})
|
| 277 |
|
| 278 |
progress(1.0, desc="✅ 完成!")
|
|
|
|
| 280 |
return log_text, output_path, preview_df
|
| 281 |
|
| 282 |
except Exception as e:
|
| 283 |
+
import traceback
|
| 284 |
+
error_detail = traceback.format_exc()
|
| 285 |
+
return f"❌ 处理出错: {str(e)}\n\n详细错误:\n{error_detail}", None, None
|
| 286 |
|
| 287 |
def show_leaderboard():
|
| 288 |
"""显示Leaderboard"""
|
|
|
|
| 410 |
2. **LLM清洗**
|
| 411 |
- 使用 DeepSeek API 进行语法、拼写、空格错误修正
|
| 412 |
- 重试机制:最多重试3次
|
| 413 |
+
- 稳定的 REST API 调用
|
| 414 |
|
| 415 |
3. **格式验证 (is_valid_output)**
|
| 416 |
- 验证输出格式正确性
|
|
|
|
| 436 |
- **LLM**: DeepSeek API (deepseek-chat / deepseek-coder)
|
| 437 |
- **前端**: Gradio 4.16.0
|
| 438 |
- **数据处理**: Pandas + PyArrow (Parquet)
|
| 439 |
+
- **API调用**: Requests (稳定版本)
|
| 440 |
- **部署**: Hugging Face Spaces
|
| 441 |
|
| 442 |
### 研究成果
|
|
|
|
| 457 |
- Demo版本限制最多处理100个样本
|
| 458 |
- 完整版本可处理数万样本
|
| 459 |
- 建议 temperature=0.1 以获得稳定输出
|
| 460 |
+
- 使用稳定的 REST API,避免兼容性问题
|
| 461 |
|
| 462 |
---
|
| 463 |
|
| 464 |
**研究生毕业论文成果展示** | Powered by DeepSeek API
|
|
|
|
|
|
|
| 465 |
""")
|
| 466 |
|
| 467 |
# 启动应用
|
requirements.txt
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
gradio==4.16.0
|
| 2 |
-
|
| 3 |
pandas==2.0.3
|
| 4 |
pyarrow==14.0.1
|
|
|
|
| 1 |
gradio==4.16.0
|
| 2 |
+
requests==2.31.0
|
| 3 |
pandas==2.0.3
|
| 4 |
pyarrow==14.0.1
|