Paul720810 committed on
Commit
0481392
·
verified ·
1 Parent(s): f327d97

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -34
app.py CHANGED
@@ -136,8 +136,22 @@ class CompleteDataLoader:
136
  if 'messages' in item:
137
  user_content = item['messages'][0]['content']
138
  assistant_content = item['messages'][1]['content']
139
- print(f"User: {user_content[:100]}...")
140
- print(f"Assistant: {assistant_content[:100]}...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
  # 檢查是否為JSON格式
143
  if assistant_content.strip().startswith('{'):
@@ -148,9 +162,46 @@ class CompleteDataLoader:
148
  print("JSON解析失敗")
149
  else:
150
  print(f"無messages字段: {list(item.keys())}")
 
 
151
  except Exception as e:
152
  print(f"預覽失敗: {e}")
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  def load_complete_dataset(self) -> bool:
155
  try:
156
  print(f"[{get_current_time()}] 正在加載完整數據集 '{DATASET_REPO_ID}'...")
@@ -214,74 +265,112 @@ class CompleteDataLoader:
214
 
215
  # SQL提取邏輯(如果還沒從JSON中獲得)
216
  if not sql_query:
217
- # 策略1: 標準「SQL查詢:」格式
218
- sql_match = re.search(r'SQL查詢:\s*(.*?)(?:\n|$)', assistant_content, re.DOTALL)
219
- if sql_match:
220
- sql_query = sql_match.group(1).strip()
221
 
222
- # 策略2: SQL代碼塊格式
223
  if not sql_query:
224
- sql_block_match = re.search(r'```sql\s*(.*?)\s*```', assistant_content, re.DOTALL)
225
- if sql_block_match:
226
- sql_query = sql_block_match.group(1).strip()
 
 
227
 
228
- # 策略3: 查找任何包含 SELECT 的行
229
  if not sql_query:
230
- for line in assistant_content.split('\n'):
231
- if 'SELECT' in line.upper():
232
- # 從這行開始提取到最後或到下個非SQL行
233
- sql_lines = []
234
- found_start = False
235
- for l in assistant_content.split('\n'):
236
- if 'SELECT' in l.upper():
237
- found_start = True
238
- if found_start:
239
- if l.strip() and not l.strip().startswith('```'):
240
- sql_lines.append(l)
241
- elif l.strip() == '' and sql_lines:
242
- continue
243
- elif found_start and len(sql_lines) > 0:
244
- break
245
- if sql_lines:
246
- sql_query = '\n'.join(sql_lines).strip()
 
247
  break
 
 
248
 
249
- # 策略4: 如果還是沒找到,使用整個assistant內容
 
 
 
250
  if not sql_query:
251
- sql_query = assistant_content.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
252
 
253
  # 清理SQL查詢
254
  if sql_query:
 
255
  sql_query = re.sub(r'```sql|```', '', sql_query).strip()
256
  sql_query = re.sub(r'^思考過程:.*?\n', '', sql_query, flags=re.MULTILINE).strip()
257
  sql_query = re.sub(r'^SQL查詢:\s*', '', sql_query, flags=re.MULTILINE).strip()
258
 
 
 
 
 
 
 
 
 
 
259
  # 清理問題文本
260
  if question:
261
  question = re.sub(r'^###\s*', '', question).strip()
262
  question = re.sub(r'Your JSON Response.*', '', question).strip()
 
 
263
 
264
  # 數據質量驗證(降低標準以提高利用率)
265
  if not question or len(question.strip()) < 3:
266
  skipped_reasons["empty_question"] += 1
267
  continue
268
 
269
- if not sql_query or len(sql_query.strip()) < 5: # 降低最小長度要求
270
  skipped_reasons["empty_sql"] += 1
 
 
271
  continue
272
 
273
  # 更寬鬆的SQL驗證
274
  sql_upper = sql_query.upper()
275
- if "SELECT" not in sql_upper and "WITH" not in sql_upper:
276
  skipped_reasons["invalid_format"] += 1
 
 
277
  continue
278
 
279
  self.questions.append(question)
280
  self.sql_answers.append(sql_query)
281
  successful_loads += 1
282
 
283
- # 調試:顯示前幾個成功案例
284
- if successful_loads <= 3:
285
  print(f"✅ 成功案例 {successful_loads}:")
286
  print(f" 問題: {question[:80]}...")
287
  print(f" SQL: {sql_query[:80]}...")
@@ -377,12 +466,50 @@ class CompleteTextToSQLSystem:
377
  self.retrieval_system = RetrievalSystem()
378
  self.initialize_system()
379
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
380
  def initialize_system(self):
381
  print("正在初始化完整數據系統...")
382
 
383
  # 首先預覽數據結構
384
  self.data_loader.preview_dataset_structure(3)
385
 
 
 
 
386
  # 然後加載數據
387
  self.data_loader.load_complete_dataset()
388
  self.data_loader.load_schema()
 
136
  if 'messages' in item:
137
  user_content = item['messages'][0]['content']
138
  assistant_content = item['messages'][1]['content']
139
+ print(f"User: {user_content[:120]}...")
140
+ print(f"Assistant: {assistant_content[:120]}...")
141
+
142
+ # 檢查SQL代碼塊
143
+ sql_block_match = re.search(r'```sql\s*(.*?)\s*```', assistant_content, re.DOTALL)
144
+ if sql_block_match:
145
+ sql_content = sql_block_match.group(1).strip()
146
+ print(f"✅ 找到SQL代碼塊: {sql_content[:60]}...")
147
+ else:
148
+ print("❌ 未找到SQL代碼塊")
149
+
150
+ # 檢查是否有其他SQL格式
151
+ if 'SELECT' in assistant_content.upper():
152
+ print("⚠️ 但包含SELECT關鍵字")
153
+ if 'SQL查詢:' in assistant_content:
154
+ print("⚠️ 但包含'SQL查詢:'標記")
155
 
156
  # 檢查是否為JSON格式
157
  if assistant_content.strip().startswith('{'):
 
162
  print("JSON解析失敗")
163
  else:
164
  print(f"無messages字段: {list(item.keys())}")
165
+
166
+ print(f"\n總數據量: {len(raw_dataset)} 項")
167
  except Exception as e:
168
  print(f"預覽失敗: {e}")
169
 
170
def diagnose_data_issues(self, sample_size: int = 20) -> None:
    """Print a summary of extraction problems in the first ``sample_size`` items.

    Loads the ``train`` split of ``DATASET_REPO_ID`` (authenticating with
    ``self.hf_token``) and sorts every inspected item into exactly one bucket:

    * ``other``           - no ``messages`` field, or fewer than two messages;
    * ``empty_assistant`` - the second message's content is ``None``, empty
      or whitespace only;
    * ``no_sql_block``    - content is present but contains no fenced
      ``sql`` code block;
    * ``parsing_error``   - inspecting the item raised an exception.

    Items whose reply contains a fenced ``sql`` block are not counted. The
    first three ``no_sql_block`` items and the first two ``parsing_error``
    items are echoed so they can be examined by eye.

    Args:
        sample_size: Maximum number of leading dataset items to inspect.

    Returns:
        None. Everything is reported through ``print``; any failure
        (including a failed dataset download) is printed, never raised.
    """
    try:
        print(f"🔍 診斷數據問題 (檢查前 {sample_size} 個可能有問題的項目)...")
        raw_dataset = load_dataset(DATASET_REPO_ID, token=self.hf_token)['train']

        issues_found = {"no_sql_block": 0, "empty_assistant": 0, "parsing_error": 0, "other": 0}
        # Compiled once: the same pattern is applied to every sampled item.
        sql_block_re = re.compile(r'```sql\s*(.*?)\s*```', re.DOTALL)

        for i in range(min(sample_size, len(raw_dataset))):
            item = raw_dataset[i]
            try:
                # Items without a user/assistant pair used to be skipped
                # silently, leaving the "other" bucket permanently at zero.
                if 'messages' not in item or len(item['messages']) < 2:
                    issues_found["other"] += 1
                    continue

                # A missing (None) reply is an empty reply, not a parsing error.
                assistant_content = item['messages'][1]['content'] or ""

                # Classify empty replies first: a "no SQL block" preview of an
                # empty string carries no information.
                if not assistant_content.strip():
                    issues_found["empty_assistant"] += 1
                    continue

                if not sql_block_re.search(assistant_content):
                    issues_found["no_sql_block"] += 1
                    # Only echo the first few offenders to keep the log short.
                    if issues_found["no_sql_block"] <= 3:
                        print(f"\n❌ 無SQL代碼塊 #{i}: {assistant_content[:200]}...")

            except Exception as e:
                # Malformed items must not abort the whole diagnosis.
                issues_found["parsing_error"] += 1
                if issues_found["parsing_error"] <= 2:
                    print(f"\n💥 解析錯誤 #{i}: {e}")

        print(f"\n📊 診斷結果:")
        for issue, count in issues_found.items():
            print(f" {issue}: {count}")
    except Exception as e:
        # Best-effort diagnostic: report the failure and carry on.
        print(f"診斷失敗: {e}")
204
+
205
  def load_complete_dataset(self) -> bool:
206
  try:
207
  print(f"[{get_current_time()}] 正在加載完整數據集 '{DATASET_REPO_ID}'...")
 
265
 
266
  # SQL提取邏輯(如果還沒從JSON中獲得)
267
  if not sql_query:
268
+ # 策略1: SQL代碼塊格式(最常見)
269
+ sql_block_match = re.search(r'```sql\s*(.*?)\s*```', assistant_content, re.DOTALL)
270
+ if sql_block_match:
271
+ sql_query = sql_block_match.group(1).strip()
272
 
273
+ # 策略2: 標準「SQL查詢:」格式
274
  if not sql_query:
275
+ sql_match = re.search(r'SQL查詢:\s*(.*?)(?:\n\n|$)', assistant_content, re.DOTALL)
276
+ if sql_match:
277
+ sql_query = sql_match.group(1).strip()
278
+ # 清理可能的代碼塊標記
279
+ sql_query = re.sub(r'```sql|```', '', sql_query).strip()
280
 
281
+ # 策略3: 查找任何包含 SELECT 或 WITH 的多行內容
282
  if not sql_query:
283
+ lines = assistant_content.split('\n')
284
+ sql_lines = []
285
+ in_sql_block = False
286
+
287
+ for line in lines:
288
+ line_upper = line.upper().strip()
289
+ # 開始條件:找到SQL關鍵字
290
+ if not in_sql_block and (line_upper.startswith('SELECT') or line_upper.startswith('WITH')):
291
+ in_sql_block = True
292
+ sql_lines.append(line)
293
+ # 繼續條件:在SQL塊中
294
+ elif in_sql_block:
295
+ # 結束條件:空行或看起來不像SQL的行
296
+ if not line.strip():
297
+ break
298
+ elif line.strip().startswith('```') and len(sql_lines) > 0:
299
+ break
300
+ elif line_upper.startswith('思考過程:') or line_upper.startswith('上下文:'):
301
  break
302
+ else:
303
+ sql_lines.append(line)
304
 
305
+ if sql_lines:
306
+ sql_query = '\n'.join(sql_lines).strip()
307
+
308
+ # 策略4: 如果還是沒找到,嘗試更寬鬆的匹配
309
  if not sql_query:
310
+ # 查找所有可能的SQL片段
311
+ sql_patterns = [
312
+ r'(SELECT.*?FROM.*?)(?:\n\n|$)',
313
+ r'(WITH.*?SELECT.*?)(?:\n\n|$)',
314
+ r'SQL查詢:\s*\n(.*?)(?:\n\n|$)'
315
+ ]
316
+
317
+ for pattern in sql_patterns:
318
+ match = re.search(pattern, assistant_content, re.DOTALL | re.IGNORECASE)
319
+ if match:
320
+ candidate = match.group(1).strip()
321
+ # 基本驗證
322
+ if len(candidate) > 10 and ('SELECT' in candidate.upper() or 'WITH' in candidate.upper()):
323
+ sql_query = candidate
324
+ break
325
 
326
  # 清理SQL查詢
327
  if sql_query:
328
+ # 移除各種標記
329
  sql_query = re.sub(r'```sql|```', '', sql_query).strip()
330
  sql_query = re.sub(r'^思考過程:.*?\n', '', sql_query, flags=re.MULTILINE).strip()
331
  sql_query = re.sub(r'^SQL查詢:\s*', '', sql_query, flags=re.MULTILINE).strip()
332
 
333
+ # 移除多餘的空行
334
+ sql_query = re.sub(r'\n\s*\n', '\n', sql_query).strip()
335
+
336
+ # 確保SQL完整性 - 如果以分號結尾且內容合理,保留
337
+ if not sql_query.endswith(';') and len(sql_query) > 20:
338
+ # 檢查是否看起來像完整的SQL
339
+ if 'FROM' in sql_query.upper() and sql_query.count('(') == sql_query.count(')'):
340
+ sql_query += ';'
341
+
342
  # 清理問題文本
343
  if question:
344
  question = re.sub(r'^###\s*', '', question).strip()
345
  question = re.sub(r'Your JSON Response.*', '', question).strip()
346
+ # 移除多餘的上下文信息
347
+ question = re.sub(r'\n上下文:.*', '', question, flags=re.DOTALL).strip()
348
 
349
  # 數據質量驗證(降低標準以提高利用率)
350
  if not question or len(question.strip()) < 3:
351
  skipped_reasons["empty_question"] += 1
352
  continue
353
 
354
+ if not sql_query or len(sql_query.strip()) < 8: # 進一步降低最小長度要求
355
  skipped_reasons["empty_sql"] += 1
356
+ if idx < 10: # 調試:顯示前10個被跳過的SQL為空的案例
357
+ print(f"SQL為空案例 {idx}: 原始助手回應前100字符: {assistant_content[:100]}...")
358
  continue
359
 
360
  # 更寬鬆的SQL驗證
361
  sql_upper = sql_query.upper()
362
+ if "SELECT" not in sql_upper and "WITH" not in sql_upper and "CREATE" not in sql_upper:
363
  skipped_reasons["invalid_format"] += 1
364
+ if idx < 5: # 調試:顯示前5個格式錯誤的案例
365
+ print(f"格式錯誤案例 {idx}: SQL內容: {sql_query[:100]}...")
366
  continue
367
 
368
  self.questions.append(question)
369
  self.sql_answers.append(sql_query)
370
  successful_loads += 1
371
 
372
+ # 調試:顯示前5個成功案例
373
+ if successful_loads <= 5:
374
  print(f"✅ 成功案例 {successful_loads}:")
375
  print(f" 問題: {question[:80]}...")
376
  print(f" SQL: {sql_query[:80]}...")
 
466
  self.retrieval_system = RetrievalSystem()
467
  self.initialize_system()
468
 
469
def diagnose_data_issues(self, sample_size: int = 20) -> None:
    """Run the data loader's dataset diagnosis on the first ``sample_size`` items.

    This method used to be a verbatim copy of the data loader's
    ``diagnose_data_issues`` and read ``self.hf_token`` directly.
    ``initialize_system`` in this class already reaches the diagnosis
    through ``self.data_loader``, so this entry point delegates to the same
    object instead of keeping a second copy of the logic.

    NOTE(review): whether this class defines ``hf_token`` is not visible
    from here; delegating removes that dependency - confirm that the
    loader's token is the one intended.

    Args:
        sample_size: Maximum number of leading dataset items to inspect.

    Returns:
        None. Results are printed by the loader; any failure is printed
        here rather than raised.
    """
    try:
        self.data_loader.diagnose_data_issues(sample_size)
    except Exception as e:
        # Keep the original contract: diagnosis never raises to the caller.
        print(f"診斷失敗: {e}")
503
+
504
  def initialize_system(self):
505
  print("正在初始化完整數據系統...")
506
 
507
  # 首先預覽數據結構
508
  self.data_loader.preview_dataset_structure(3)
509
 
510
+ # 診斷數據問題
511
+ self.data_loader.diagnose_data_issues(10)
512
+
513
  # 然後加載數據
514
  self.data_loader.load_complete_dataset()
515
  self.data_loader.load_schema()