Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -136,8 +136,22 @@ class CompleteDataLoader:
|
|
| 136 |
if 'messages' in item:
|
| 137 |
user_content = item['messages'][0]['content']
|
| 138 |
assistant_content = item['messages'][1]['content']
|
| 139 |
-
print(f"User: {user_content[:
|
| 140 |
-
print(f"Assistant: {assistant_content[:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
|
| 142 |
# 檢查是否為JSON格式
|
| 143 |
if assistant_content.strip().startswith('{'):
|
|
@@ -148,9 +162,46 @@ class CompleteDataLoader:
|
|
| 148 |
print("JSON解析失敗")
|
| 149 |
else:
|
| 150 |
print(f"無messages字段: {list(item.keys())}")
|
|
|
|
|
|
|
| 151 |
except Exception as e:
|
| 152 |
print(f"預覽失敗: {e}")
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
def load_complete_dataset(self) -> bool:
|
| 155 |
try:
|
| 156 |
print(f"[{get_current_time()}] 正在加載完整數據集 '{DATASET_REPO_ID}'...")
|
|
@@ -214,74 +265,112 @@ class CompleteDataLoader:
|
|
| 214 |
|
| 215 |
# SQL提取邏輯(如果還沒從JSON中獲得)
|
| 216 |
if not sql_query:
|
| 217 |
-
# 策略1:
|
| 218 |
-
|
| 219 |
-
if
|
| 220 |
-
sql_query =
|
| 221 |
|
| 222 |
-
# 策略2: SQL
|
| 223 |
if not sql_query:
|
| 224 |
-
|
| 225 |
-
if
|
| 226 |
-
sql_query =
|
|
|
|
|
|
|
| 227 |
|
| 228 |
-
# 策略3: 查找任何包含 SELECT
|
| 229 |
if not sql_query:
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
|
|
|
| 247 |
break
|
|
|
|
|
|
|
| 248 |
|
| 249 |
-
|
|
|
|
|
|
|
|
|
|
| 250 |
if not sql_query:
|
| 251 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
|
| 253 |
# 清理SQL查詢
|
| 254 |
if sql_query:
|
|
|
|
| 255 |
sql_query = re.sub(r'```sql|```', '', sql_query).strip()
|
| 256 |
sql_query = re.sub(r'^思考過程:.*?\n', '', sql_query, flags=re.MULTILINE).strip()
|
| 257 |
sql_query = re.sub(r'^SQL查詢:\s*', '', sql_query, flags=re.MULTILINE).strip()
|
| 258 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 259 |
# 清理問題文本
|
| 260 |
if question:
|
| 261 |
question = re.sub(r'^###\s*', '', question).strip()
|
| 262 |
question = re.sub(r'Your JSON Response.*', '', question).strip()
|
|
|
|
|
|
|
| 263 |
|
| 264 |
# 數據質量驗證(降低標準以提高利用率)
|
| 265 |
if not question or len(question.strip()) < 3:
|
| 266 |
skipped_reasons["empty_question"] += 1
|
| 267 |
continue
|
| 268 |
|
| 269 |
-
if not sql_query or len(sql_query.strip()) <
|
| 270 |
skipped_reasons["empty_sql"] += 1
|
|
|
|
|
|
|
| 271 |
continue
|
| 272 |
|
| 273 |
# 更寬鬆的SQL驗證
|
| 274 |
sql_upper = sql_query.upper()
|
| 275 |
-
if "SELECT" not in sql_upper and "WITH" not in sql_upper:
|
| 276 |
skipped_reasons["invalid_format"] += 1
|
|
|
|
|
|
|
| 277 |
continue
|
| 278 |
|
| 279 |
self.questions.append(question)
|
| 280 |
self.sql_answers.append(sql_query)
|
| 281 |
successful_loads += 1
|
| 282 |
|
| 283 |
-
#
|
| 284 |
-
if successful_loads <=
|
| 285 |
print(f"✅ 成功案例 {successful_loads}:")
|
| 286 |
print(f" 問題: {question[:80]}...")
|
| 287 |
print(f" SQL: {sql_query[:80]}...")
|
|
@@ -377,12 +466,50 @@ class CompleteTextToSQLSystem:
|
|
| 377 |
self.retrieval_system = RetrievalSystem()
|
| 378 |
self.initialize_system()
|
| 379 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
def initialize_system(self):
|
| 381 |
print("正在初始化完整數據系統...")
|
| 382 |
|
| 383 |
# 首先預覽數據結構
|
| 384 |
self.data_loader.preview_dataset_structure(3)
|
| 385 |
|
|
|
|
|
|
|
|
|
|
| 386 |
# 然後加載數據
|
| 387 |
self.data_loader.load_complete_dataset()
|
| 388 |
self.data_loader.load_schema()
|
|
|
|
| 136 |
if 'messages' in item:
|
| 137 |
user_content = item['messages'][0]['content']
|
| 138 |
assistant_content = item['messages'][1]['content']
|
| 139 |
+
print(f"User: {user_content[:120]}...")
|
| 140 |
+
print(f"Assistant: {assistant_content[:120]}...")
|
| 141 |
+
|
| 142 |
+
# 檢查SQL代碼塊
|
| 143 |
+
sql_block_match = re.search(r'```sql\s*(.*?)\s*```', assistant_content, re.DOTALL)
|
| 144 |
+
if sql_block_match:
|
| 145 |
+
sql_content = sql_block_match.group(1).strip()
|
| 146 |
+
print(f"✅ 找到SQL代碼塊: {sql_content[:60]}...")
|
| 147 |
+
else:
|
| 148 |
+
print("❌ 未找到SQL代碼塊")
|
| 149 |
+
|
| 150 |
+
# 檢查是否有其他SQL格式
|
| 151 |
+
if 'SELECT' in assistant_content.upper():
|
| 152 |
+
print("⚠️ 但包含SELECT關鍵字")
|
| 153 |
+
if 'SQL查詢:' in assistant_content:
|
| 154 |
+
print("⚠️ 但包含'SQL查詢:'標記")
|
| 155 |
|
| 156 |
# 檢查是否為JSON格式
|
| 157 |
if assistant_content.strip().startswith('{'):
|
|
|
|
| 162 |
print("JSON解析失敗")
|
| 163 |
else:
|
| 164 |
print(f"無messages字段: {list(item.keys())}")
|
| 165 |
+
|
| 166 |
+
print(f"\n總數據量: {len(raw_dataset)} 項")
|
| 167 |
except Exception as e:
|
| 168 |
print(f"預覽失敗: {e}")
|
| 169 |
|
| 170 |
+
def diagnose_data_issues(self, sample_size: int = 20) -> None:
    """Inspect the first ``sample_size`` training items and print a problem tally.

    Loads the ``train`` split of ``DATASET_REPO_ID`` and counts, per item:

    * ``no_sql_block``    - the assistant reply has no fenced ``sql`` code block
    * ``empty_assistant`` - the assistant reply is empty or whitespace only
    * ``parsing_error``   - inspecting the item raised an exception
    * ``other``           - the item has no ``messages`` field, or fewer than
      two messages, so there is no assistant reply to inspect

    The tally is printed; nothing is returned. Any failure (including a
    failure to load the dataset) is reported with a message rather than raised.

    Args:
        sample_size: Maximum number of leading items to inspect.
    """
    # Deliberately broad: this is a best-effort diagnostic and must never
    # abort the caller.
    try:
        print(f"🔍 診斷數據問題 (檢查前 {sample_size} 個可能有問題的項目)...")
        raw_dataset = load_dataset(DATASET_REPO_ID, token=self.hf_token)['train']

        issues_found = {"no_sql_block": 0, "empty_assistant": 0, "parsing_error": 0, "other": 0}

        for i in range(min(sample_size, len(raw_dataset))):
            item = raw_dataset[i]
            try:
                if 'messages' not in item or len(item['messages']) < 2:
                    # Malformed items used to be skipped silently, which left
                    # the "other" counter permanently at zero.
                    issues_found["other"] += 1
                    continue

                assistant_content = item['messages'][1]['content']

                # Look for a fenced SQL code block in the assistant reply.
                if not re.search(r'```sql\s*(.*?)\s*```', assistant_content, re.DOTALL):
                    issues_found["no_sql_block"] += 1
                    # Only show the first three offenders to keep the log short.
                    if issues_found["no_sql_block"] <= 3:
                        print(f"\n❌ 無SQL代碼塊 #{i}: {assistant_content[:200]}...")

                if not assistant_content.strip():
                    issues_found["empty_assistant"] += 1

            except Exception as e:
                # e.g. non-string content makes re.search / .strip() fail.
                issues_found["parsing_error"] += 1
                if issues_found["parsing_error"] <= 2:
                    print(f"\n💥 解析錯誤 #{i}: {e}")

        print("\n📊 診斷結果:")
        for issue, count in issues_found.items():
            print(f" {issue}: {count}")
    except Exception as e:
        print(f"診斷失敗: {e}")
|
| 204 |
+
|
| 205 |
def load_complete_dataset(self) -> bool:
|
| 206 |
try:
|
| 207 |
print(f"[{get_current_time()}] 正在加載完整數據集 '{DATASET_REPO_ID}'...")
|
|
|
|
| 265 |
|
| 266 |
# SQL提取邏輯(如果還沒從JSON中獲得)
|
| 267 |
if not sql_query:
|
| 268 |
+
# 策略1: SQL代碼塊格式(最常見)
|
| 269 |
+
sql_block_match = re.search(r'```sql\s*(.*?)\s*```', assistant_content, re.DOTALL)
|
| 270 |
+
if sql_block_match:
|
| 271 |
+
sql_query = sql_block_match.group(1).strip()
|
| 272 |
|
| 273 |
+
# 策略2: 標準「SQL查詢:」格式
|
| 274 |
if not sql_query:
|
| 275 |
+
sql_match = re.search(r'SQL查詢:\s*(.*?)(?:\n\n|$)', assistant_content, re.DOTALL)
|
| 276 |
+
if sql_match:
|
| 277 |
+
sql_query = sql_match.group(1).strip()
|
| 278 |
+
# 清理可能的代碼塊標記
|
| 279 |
+
sql_query = re.sub(r'```sql|```', '', sql_query).strip()
|
| 280 |
|
| 281 |
+
# 策略3: 查找任何包含 SELECT 或 WITH 的多行內容
|
| 282 |
if not sql_query:
|
| 283 |
+
lines = assistant_content.split('\n')
|
| 284 |
+
sql_lines = []
|
| 285 |
+
in_sql_block = False
|
| 286 |
+
|
| 287 |
+
for line in lines:
|
| 288 |
+
line_upper = line.upper().strip()
|
| 289 |
+
# 開始條件:找到SQL關鍵字
|
| 290 |
+
if not in_sql_block and (line_upper.startswith('SELECT') or line_upper.startswith('WITH')):
|
| 291 |
+
in_sql_block = True
|
| 292 |
+
sql_lines.append(line)
|
| 293 |
+
# 繼續條件:在SQL塊中
|
| 294 |
+
elif in_sql_block:
|
| 295 |
+
# 結束條件:空行或看起來不像SQL的行
|
| 296 |
+
if not line.strip():
|
| 297 |
+
break
|
| 298 |
+
elif line.strip().startswith('```') and len(sql_lines) > 0:
|
| 299 |
+
break
|
| 300 |
+
elif line_upper.startswith('思考過程:') or line_upper.startswith('上下文:'):
|
| 301 |
break
|
| 302 |
+
else:
|
| 303 |
+
sql_lines.append(line)
|
| 304 |
|
| 305 |
+
if sql_lines:
|
| 306 |
+
sql_query = '\n'.join(sql_lines).strip()
|
| 307 |
+
|
| 308 |
+
# 策略4: 如果還是沒找到,嘗試更寬鬆的匹配
|
| 309 |
if not sql_query:
|
| 310 |
+
# 查找所有可能的SQL片段
|
| 311 |
+
sql_patterns = [
|
| 312 |
+
r'(SELECT.*?FROM.*?)(?:\n\n|$)',
|
| 313 |
+
r'(WITH.*?SELECT.*?)(?:\n\n|$)',
|
| 314 |
+
r'SQL查詢:\s*\n(.*?)(?:\n\n|$)'
|
| 315 |
+
]
|
| 316 |
+
|
| 317 |
+
for pattern in sql_patterns:
|
| 318 |
+
match = re.search(pattern, assistant_content, re.DOTALL | re.IGNORECASE)
|
| 319 |
+
if match:
|
| 320 |
+
candidate = match.group(1).strip()
|
| 321 |
+
# 基本驗證
|
| 322 |
+
if len(candidate) > 10 and ('SELECT' in candidate.upper() or 'WITH' in candidate.upper()):
|
| 323 |
+
sql_query = candidate
|
| 324 |
+
break
|
| 325 |
|
| 326 |
# 清理SQL查詢
|
| 327 |
if sql_query:
|
| 328 |
+
# 移除各種標記
|
| 329 |
sql_query = re.sub(r'```sql|```', '', sql_query).strip()
|
| 330 |
sql_query = re.sub(r'^思考過程:.*?\n', '', sql_query, flags=re.MULTILINE).strip()
|
| 331 |
sql_query = re.sub(r'^SQL查詢:\s*', '', sql_query, flags=re.MULTILINE).strip()
|
| 332 |
|
| 333 |
+
# 移除多餘的空行
|
| 334 |
+
sql_query = re.sub(r'\n\s*\n', '\n', sql_query).strip()
|
| 335 |
+
|
| 336 |
+
# 確保SQL完整性 - 如果以分號結尾且內容合理,保留
|
| 337 |
+
if not sql_query.endswith(';') and len(sql_query) > 20:
|
| 338 |
+
# 檢查是否看起來像完整的SQL
|
| 339 |
+
if 'FROM' in sql_query.upper() and sql_query.count('(') == sql_query.count(')'):
|
| 340 |
+
sql_query += ';'
|
| 341 |
+
|
| 342 |
# 清理問題文本
|
| 343 |
if question:
|
| 344 |
question = re.sub(r'^###\s*', '', question).strip()
|
| 345 |
question = re.sub(r'Your JSON Response.*', '', question).strip()
|
| 346 |
+
# 移除多餘的上下文信息
|
| 347 |
+
question = re.sub(r'\n上下文:.*', '', question, flags=re.DOTALL).strip()
|
| 348 |
|
| 349 |
# 數據質量驗證(降低標準以提高利用率)
|
| 350 |
if not question or len(question.strip()) < 3:
|
| 351 |
skipped_reasons["empty_question"] += 1
|
| 352 |
continue
|
| 353 |
|
| 354 |
+
if not sql_query or len(sql_query.strip()) < 8: # 進一步降低最小長度要求
|
| 355 |
skipped_reasons["empty_sql"] += 1
|
| 356 |
+
if idx < 10: # 調試:顯示前10個被跳過的SQL為空的案例
|
| 357 |
+
print(f"SQL為空案例 {idx}: 原始助手回應前100字符: {assistant_content[:100]}...")
|
| 358 |
continue
|
| 359 |
|
| 360 |
# 更寬鬆的SQL驗證
|
| 361 |
sql_upper = sql_query.upper()
|
| 362 |
+
if "SELECT" not in sql_upper and "WITH" not in sql_upper and "CREATE" not in sql_upper:
|
| 363 |
skipped_reasons["invalid_format"] += 1
|
| 364 |
+
if idx < 5: # 調試:顯示前5個格式錯誤的案例
|
| 365 |
+
print(f"格式錯誤案例 {idx}: SQL內容: {sql_query[:100]}...")
|
| 366 |
continue
|
| 367 |
|
| 368 |
self.questions.append(question)
|
| 369 |
self.sql_answers.append(sql_query)
|
| 370 |
successful_loads += 1
|
| 371 |
|
| 372 |
+
# 調試:顯示前5個成功案例
|
| 373 |
+
if successful_loads <= 5:
|
| 374 |
print(f"✅ 成功案例 {successful_loads}:")
|
| 375 |
print(f" 問題: {question[:80]}...")
|
| 376 |
print(f" SQL: {sql_query[:80]}...")
|
|
|
|
| 466 |
self.retrieval_system = RetrievalSystem()
|
| 467 |
self.initialize_system()
|
| 468 |
|
| 469 |
+
def diagnose_data_issues(self, sample_size: int = 20) -> None:
    """Inspect the first ``sample_size`` training items and print a problem tally.

    Loads the ``train`` split of ``DATASET_REPO_ID`` and counts, per item:

    * ``no_sql_block``    - the assistant reply has no fenced ``sql`` code block
    * ``empty_assistant`` - the assistant reply is empty or whitespace only
    * ``parsing_error``   - inspecting the item raised an exception
    * ``other``           - the item has no ``messages`` field, or fewer than
      two messages, so there is no assistant reply to inspect

    The tally is printed; nothing is returned. Any failure (including a
    failure to load the dataset) is reported with a message rather than raised.

    Args:
        sample_size: Maximum number of leading items to inspect.
    """
    # NOTE(review): initialize_system calls self.data_loader.diagnose_data_issues,
    # not this method, and the logic here mirrors the data loader's version.
    # Confirm this copy is needed and that self.hf_token exists on this class.
    # Deliberately broad: this is a best-effort diagnostic and must never
    # abort the caller.
    try:
        print(f"🔍 診斷數據問題 (檢查前 {sample_size} 個可能有問題的項目)...")
        raw_dataset = load_dataset(DATASET_REPO_ID, token=self.hf_token)['train']

        issues_found = {"no_sql_block": 0, "empty_assistant": 0, "parsing_error": 0, "other": 0}

        for i in range(min(sample_size, len(raw_dataset))):
            item = raw_dataset[i]
            try:
                if 'messages' not in item or len(item['messages']) < 2:
                    # Malformed items used to be skipped silently, which left
                    # the "other" counter permanently at zero.
                    issues_found["other"] += 1
                    continue

                assistant_content = item['messages'][1]['content']

                # Look for a fenced SQL code block in the assistant reply.
                if not re.search(r'```sql\s*(.*?)\s*```', assistant_content, re.DOTALL):
                    issues_found["no_sql_block"] += 1
                    # Only show the first three offenders to keep the log short.
                    if issues_found["no_sql_block"] <= 3:
                        print(f"\n❌ 無SQL代碼塊 #{i}: {assistant_content[:200]}...")

                if not assistant_content.strip():
                    issues_found["empty_assistant"] += 1

            except Exception as e:
                # e.g. non-string content makes re.search / .strip() fail.
                issues_found["parsing_error"] += 1
                if issues_found["parsing_error"] <= 2:
                    print(f"\n💥 解析錯誤 #{i}: {e}")

        print("\n📊 診斷結果:")
        for issue, count in issues_found.items():
            print(f" {issue}: {count}")
    except Exception as e:
        print(f"診斷失敗: {e}")
|
| 503 |
+
|
| 504 |
def initialize_system(self):
|
| 505 |
print("正在初始化完整數據系統...")
|
| 506 |
|
| 507 |
# 首先預覽數據結構
|
| 508 |
self.data_loader.preview_dataset_structure(3)
|
| 509 |
|
| 510 |
+
# 診斷數據問題
|
| 511 |
+
self.data_loader.diagnose_data_issues(10)
|
| 512 |
+
|
| 513 |
# 然後加載數據
|
| 514 |
self.data_loader.load_complete_dataset()
|
| 515 |
self.data_loader.load_schema()
|