lllouo committed on
Commit
b335dbb
·
1 Parent(s): 66b1f5b

Switch to requests library for API calls (stable version)

Browse files
Files changed (2) hide show
  1. app.py +51 -33
  2. requirements.txt +1 -1
app.py CHANGED
@@ -1,24 +1,44 @@
1
- # app.py - 基于真实清洗逻辑的 Gradio 版本
2
  import gradio as gr
3
  import json
4
  import pandas as pd
5
  import os
6
  from typing import Optional
7
  import tempfile
 
8
 
9
- # 延迟导入 OpenAI,避免启动时就要求 API key
10
- def get_client():
11
- """延迟初始化OpenAI客户端"""
12
- from openai import OpenAI
13
-
14
- api_key = os.getenv("DEEPSEEK_API_KEY", "")
15
- if not api_key:
16
  raise ValueError("⚠️ 请在 Space Settings 中配置 DEEPSEEK_API_KEY!\n\n前往:Settings → Repository secrets → New secret")
 
 
 
 
 
 
 
 
 
17
 
18
- return OpenAI(
19
- api_key=api_key,
20
- base_url="https://api.deepseek.com/v1"
21
- )
 
 
 
 
 
 
 
 
 
 
22
 
23
  # 预置的Leaderboard数据
24
  LEADERBOARD_DATA = [
@@ -134,9 +154,9 @@ def extract_output_content(item):
134
  def clean_dataset(file_path, question_column, model_choice, temperature, max_samples, progress=gr.Progress()):
135
  """清洗数据集的核心函数"""
136
  try:
137
- # 初始化客户端
138
  try:
139
- client = get_client()
140
  except ValueError as e:
141
  return str(e), None, None
142
 
@@ -156,7 +176,7 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
156
  progress(0.1, desc=f"🚀 开始清洗 {total} 个样本...")
157
 
158
  # 预处理:添加标记
159
- data_corrupt = [process_sentence(item) for item in data_ori]
160
 
161
  # 清洗结果
162
  results = []
@@ -167,25 +187,20 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
167
  for idx in range(total):
168
  progress((0.1 + 0.8 * idx / total), desc=f"处理中: {idx+1}/{total}")
169
 
170
- unprocess_text = data_ori[idx]
171
  original_text = data_corrupt[idx]
172
  response_content = ""
173
  retry_count = 0
174
 
175
  while retry_count < max_retries:
176
  try:
177
- completion = client.chat.completions.create(
 
 
178
  model=model_choice,
179
- messages=[{"role": "user", "content": PROMPT_TEMPLATE + original_text}],
180
- stream=True,
181
  temperature=float(temperature)
182
  )
183
 
184
- response_content = ""
185
- for chunk in completion:
186
- if chunk.choices and chunk.choices[0].delta.content:
187
- response_content += chunk.choices[0].delta.content
188
-
189
  # 验证输出格式
190
  if is_valid_output(response_content, original_text, unprocess_text):
191
  results.append(response_content)
@@ -193,10 +208,11 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
193
  break
194
  else:
195
  retry_count += 1
 
196
 
197
  except Exception as e:
198
  retry_count += 1
199
- log_text += f"⚠️ 样本 {idx+1} 重试 {retry_count}/{max_retries}: {str(e)}\n"
200
  else:
201
  # 重试次数用尽
202
  results.append(f"[ERROR] Failed to process: {original_text}")
@@ -212,7 +228,7 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
212
  for i, item in enumerate(results):
213
  extracted = extract_output_content(item)
214
  if extracted is None:
215
- lst_extracted.append(data_ori[i])
216
  unknown_count += 1
217
  else:
218
  lst_extracted.append(extracted)
@@ -222,7 +238,7 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
222
  # 恢复多行格式
223
  lst_final = []
224
  for i in range(len(data_ori)):
225
- item = data_ori[i]
226
  if '\n' in item:
227
  tmp_lines = [line.strip() for line in item.strip().split('\n') if line.strip()]
228
  tmp_lines[-1] = lst_extracted[i]
@@ -255,8 +271,8 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
255
 
256
  # 生成预览数据
257
  preview_df = pd.DataFrame({
258
- '原始问题': data_ori[:10],
259
- '清洗后问题': lst_final[:10]
260
  })
261
 
262
  progress(1.0, desc="✅ 完成!")
@@ -264,7 +280,9 @@ def clean_dataset(file_path, question_column, model_choice, temperature, max_sam
264
  return log_text, output_path, preview_df
265
 
266
  except Exception as e:
267
- return f"❌ 处理出错: {str(e)}", None, None
 
 
268
 
269
  def show_leaderboard():
270
  """显示Leaderboard"""
@@ -392,7 +410,7 @@ with gr.Blocks(title="数据集清洗框架展示系统") as demo:
392
  2. **LLM清洗**
393
  - 使用 DeepSeek API 进行语法、拼写、空格错误修正
394
  - 重试机制:最多重试3次
395
- - 流式响应处理
396
 
397
  3. **格式验证 (is_valid_output)**
398
  - 验证输出格式正确性
@@ -418,6 +436,7 @@ with gr.Blocks(title="数据集清洗框架展示系统") as demo:
418
  - **LLM**: DeepSeek API (deepseek-chat / deepseek-coder)
419
  - **前端**: Gradio 4.16.0
420
  - **数据处理**: Pandas + PyArrow (Parquet)
 
421
  - **部署**: Hugging Face Spaces
422
 
423
  ### 研究成果
@@ -438,12 +457,11 @@ with gr.Blocks(title="数据集清洗框架展示系统") as demo:
438
  - Demo版本限制最多处理100个样本
439
  - 完整版本可处理数万样本
440
  - 建议 temperature=0.1 以获得稳定输出
 
441
 
442
  ---
443
 
444
  **研究生毕业论文成果展示** | Powered by DeepSeek API
445
-
446
- GitHub: [添加你的项目链接]
447
  """)
448
 
449
  # 启动应用
 
1
+ # app.py - 使用 requests 调用 DeepSeek API(稳定版本)
2
  import gradio as gr
3
  import json
4
  import pandas as pd
5
  import os
6
  from typing import Optional
7
  import tempfile
8
+ import requests
9
 
10
# DeepSeek API configuration.
# The key is read once at import time. Surrounding whitespace is stripped
# because a secret pasted with a trailing newline would otherwise end up
# verbatim inside the "Authorization: Bearer ..." header.
DEEPSEEK_API_KEY = os.getenv("DEEPSEEK_API_KEY", "").strip()
# Full URL of the chat-completions endpoint (despite the "BASE_URL" name).
DEEPSEEK_BASE_URL = "https://api.deepseek.com/v1/chat/completions"
13
+
14
def check_api_key():
    """Verify that the DeepSeek API key has been configured.

    Raises:
        ValueError: If the module-level DEEPSEEK_API_KEY is empty. The
            message is user-facing and tells the operator where to set it.
    """
    if DEEPSEEK_API_KEY:
        return
    raise ValueError(
        "⚠️ 请在 Space Settings 中配置 DEEPSEEK_API_KEY!\n\n"
        "前往:Settings → Repository secrets → New secret"
    )
18
+
19
def call_deepseek_api(prompt, model="deepseek-chat", temperature=0.1):
    """Send a single-turn chat-completion request to DeepSeek using requests.

    Args:
        prompt: Text sent as the only ``user`` message.
        model: Model identifier placed in the request payload.
        temperature: Sampling temperature placed in the request payload.

    Returns:
        The ``content`` string of the first choice's message.

    Raises:
        ValueError: If DEEPSEEK_API_KEY is not configured, or if the response
            body does not contain a string at ``choices[0].message.content``.
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status (via ``raise_for_status``).
    """
    check_api_key()

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
    }

    payload = {
        "model": model,
        "messages": [
            {"role": "user", "content": prompt},
        ],
        "temperature": temperature,
        # Non-streaming keeps response handling to a single JSON document.
        "stream": False,
    }

    response = requests.post(DEEPSEEK_BASE_URL, headers=headers, json=payload, timeout=60)
    response.raise_for_status()

    result = response.json()
    # Turn a malformed or empty body into a descriptive error instead of an
    # opaque KeyError/IndexError, so the caller's retry log is actionable.
    try:
        content = result["choices"][0]["message"]["content"]
    except (KeyError, IndexError, TypeError) as exc:
        raise ValueError(
            f"Unexpected DeepSeek API response structure: {str(result)[:200]}"
        ) from exc

    if not isinstance(content, str):
        raise ValueError(
            f"DeepSeek API returned no text content: {str(result)[:200]}"
        )
    return content
42
 
43
  # 预置的Leaderboard数据
44
  LEADERBOARD_DATA = [
 
154
  def clean_dataset(file_path, question_column, model_choice, temperature, max_samples, progress=gr.Progress()):
155
  """清洗数据集的核心函数"""
156
  try:
157
+ # 检查 API Key
158
  try:
159
+ check_api_key()
160
  except ValueError as e:
161
  return str(e), None, None
162
 
 
176
  progress(0.1, desc=f"🚀 开始清洗 {total} 个样本...")
177
 
178
  # 预处理:添加标记
179
+ data_corrupt = [process_sentence(str(item)) for item in data_ori]
180
 
181
  # 清洗结果
182
  results = []
 
187
  for idx in range(total):
188
  progress((0.1 + 0.8 * idx / total), desc=f"处理中: {idx+1}/{total}")
189
 
190
+ unprocess_text = str(data_ori[idx])
191
  original_text = data_corrupt[idx]
192
  response_content = ""
193
  retry_count = 0
194
 
195
  while retry_count < max_retries:
196
  try:
197
+ # 调用 DeepSeek API
198
+ response_content = call_deepseek_api(
199
+ PROMPT_TEMPLATE + original_text,
200
  model=model_choice,
 
 
201
  temperature=float(temperature)
202
  )
203
 
 
 
 
 
 
204
  # 验证输出格式
205
  if is_valid_output(response_content, original_text, unprocess_text):
206
  results.append(response_content)
 
208
  break
209
  else:
210
  retry_count += 1
211
+ log_text += f"⚠️ 样本 {idx+1} 格式验证失败,重试 {retry_count}/{max_retries}\n"
212
 
213
  except Exception as e:
214
  retry_count += 1
215
+ log_text += f"⚠️ 样本 {idx+1} API错误,重试 {retry_count}/{max_retries}: {str(e)}\n"
216
  else:
217
  # 重试次数用尽
218
  results.append(f"[ERROR] Failed to process: {original_text}")
 
228
  for i, item in enumerate(results):
229
  extracted = extract_output_content(item)
230
  if extracted is None:
231
+ lst_extracted.append(str(data_ori[i]))
232
  unknown_count += 1
233
  else:
234
  lst_extracted.append(extracted)
 
238
  # 恢复多行格式
239
  lst_final = []
240
  for i in range(len(data_ori)):
241
+ item = str(data_ori[i])
242
  if '\n' in item:
243
  tmp_lines = [line.strip() for line in item.strip().split('\n') if line.strip()]
244
  tmp_lines[-1] = lst_extracted[i]
 
271
 
272
  # 生成预览数据
273
  preview_df = pd.DataFrame({
274
+ '原始问题': [str(x)[:100] for x in data_ori[:10]],
275
+ '清洗后问题': [str(x)[:100] for x in lst_final[:10]]
276
  })
277
 
278
  progress(1.0, desc="✅ 完成!")
 
280
  return log_text, output_path, preview_df
281
 
282
  except Exception as e:
283
+ import traceback
284
+ error_detail = traceback.format_exc()
285
+ return f"❌ 处理出错: {str(e)}\n\n详细错误:\n{error_detail}", None, None
286
 
287
  def show_leaderboard():
288
  """显示Leaderboard"""
 
410
  2. **LLM清洗**
411
  - 使用 DeepSeek API 进行语法、拼写、空格错误修正
412
  - 重试机制:最多重试3次
413
+ - 稳定的 REST API 调用
414
 
415
  3. **格式验证 (is_valid_output)**
416
  - 验证输出格式正确性
 
436
  - **LLM**: DeepSeek API (deepseek-chat / deepseek-coder)
437
  - **前端**: Gradio 4.16.0
438
  - **数据处理**: Pandas + PyArrow (Parquet)
439
+ - **API调用**: Requests (稳定版本)
440
  - **部署**: Hugging Face Spaces
441
 
442
  ### 研究成果
 
457
  - Demo版本限制最多处理100个样本
458
  - 完整版本可处理数万样本
459
  - 建议 temperature=0.1 以获得稳定输出
460
+ - 使用稳定的 REST API,避免兼容性问题
461
 
462
  ---
463
 
464
  **研究生毕业论文成果展示** | Powered by DeepSeek API
 
 
465
  """)
466
 
467
  # 启动应用
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
  gradio==4.16.0
2
- openai==1.54.3
3
  pandas==2.0.3
4
  pyarrow==14.0.1
 
1
  gradio==4.16.0
2
+ requests==2.31.0
3
  pandas==2.0.3
4
  pyarrow==14.0.1