Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -414,16 +414,11 @@ class TextToSQLSystem:
|
|
| 414 |
|
| 415 |
return relevant_tables[:3] # 最多返回3個相關表格
|
| 416 |
|
| 417 |
-
# in class TextToSQLSystem:
|
| 418 |
-
|
| 419 |
-
# in class TextToSQLSystem:
|
| 420 |
-
|
| 421 |
-
# in class TextToSQLSystem:
|
| 422 |
-
|
| 423 |
def _validate_and_fix_sql(self, sql: str, question: str) -> str:
|
| 424 |
"""
|
| 425 |
-
(
|
| 426 |
-
|
|
|
|
| 427 |
"""
|
| 428 |
if not sql or not self.schema:
|
| 429 |
self._log("SQL 修正被跳過,因輸入為空或 schema 未載入。", "WARNING")
|
|
@@ -442,41 +437,37 @@ class TextToSQLSystem:
|
|
| 442 |
top_n_pattern = r"(?:top|前|最高|最大|最好)\s*(\d+)?\s*(?:個|名)?\s*([^ ]+?)\s*(?:的)?(?:營收|業績|貢獻|金額|sales|revenue)"
|
| 443 |
top_n_match = re.search(top_n_pattern, question, re.IGNORECASE)
|
| 444 |
|
| 445 |
-
# --- 意圖 2: 特定實體報告數查詢
|
| 446 |
entity_match_data = None
|
| 447 |
ENTITY_TO_COLUMN_MAP = {
|
| 448 |
-
# 關鍵詞 (小寫,按可能的優先順序排列) -> 資料庫欄位
|
| 449 |
'申請廠商': 'T1.ApplicantName', '申請方': 'T1.ApplicantName', 'applicant': 'T1.ApplicantName',
|
| 450 |
'付款廠商': 'T1.InvoiceToName', '付款方': 'T1.InvoiceToName', 'invoiceto': 'T1.InvoiceToName',
|
| 451 |
'代理商': 'T1.AgentName', 'agent': 'T1.AgentName',
|
| 452 |
-
'買家': 'T1.BuyerName', 'buyer': 'T1.BuyerName', '客戶': 'T1.BuyerName',
|
| 453 |
}
|
| 454 |
-
|
| 455 |
-
# **新的分步匹配邏輯**
|
| 456 |
for keyword, column in ENTITY_TO_COLUMN_MAP.items():
|
| 457 |
-
# 步驟 1: 在問題中尋找關鍵詞
|
| 458 |
keyword_pos = q_lower.find(keyword)
|
| 459 |
if keyword_pos != -1:
|
| 460 |
-
# 步驟 2: 從關鍵詞之後的文本中提取實體名稱
|
| 461 |
-
# 正則: 捕獲關鍵詞後面跟著的、由字母/數字/&/./-組成的第一個詞組
|
| 462 |
pattern = fr"{re.escape(keyword)}[\s:;\'\"-]*([a-zA-Z0-9&.\s-]+?)(?:\s*的|\s+|$)"
|
| 463 |
match = re.search(pattern, question, re.IGNORECASE)
|
| 464 |
if match:
|
| 465 |
entity_match_data = {
|
| 466 |
-
"type": keyword,
|
| 467 |
-
"name": match.group(1).strip(),
|
| 468 |
-
"column": column
|
| 469 |
}
|
| 470 |
-
break
|
| 471 |
-
|
| 472 |
# --- 意圖 3: TAT 查詢 ---
|
| 473 |
is_tat_query = any(k in q_lower for k in ['平均', 'average']) and any(k in q_lower for k in ['時間', '時長', '多久', '天', 'tat', 'turnaround'])
|
| 474 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 475 |
# --- 判斷邏輯: 依優先級進入對應的模板 ---
|
| 476 |
if top_n_match:
|
| 477 |
limit = top_n_match.group(1) or '10'
|
| 478 |
entity_keyword = top_n_match.group(2).lower()
|
| 479 |
-
|
| 480 |
column_name = next((v for k, v in ENTITY_TO_COLUMN_MAP.items() if k in entity_keyword), None)
|
| 481 |
|
| 482 |
if column_name:
|
|
@@ -496,10 +487,8 @@ LIMIT {limit};
|
|
| 496 |
entity_type = entity_match_data["type"]
|
| 497 |
entity_name = entity_match_data["name"]
|
| 498 |
column_name = entity_match_data["column"]
|
| 499 |
-
|
| 500 |
year_match = re.search(r'(\d{4})\s*年?', question)
|
| 501 |
year = year_match.group(1) if year_match else '2024'
|
| 502 |
-
|
| 503 |
status_condition = ""
|
| 504 |
if 'fail' in q_lower or '失敗' in q_lower:
|
| 505 |
status_condition = "AND T1.OverallRating = 'Fail'"
|
|
@@ -517,6 +506,30 @@ WHERE {column_name} = '{entity_name}'
|
|
| 517 |
"""
|
| 518 |
fixes_applied.append(f"模板覆寫: 查詢 {entity_type}='{entity_name}' ({year}年) 的報告數")
|
| 519 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 520 |
elif is_tat_query:
|
| 521 |
start_col, end_col = 'T2.LabIn', 'T2.ReportAuthorization'
|
| 522 |
log_msg = "總流程平均時長 (天)"
|
|
@@ -595,46 +608,6 @@ WHERE {start_col} IS NOT NULL AND {end_col} IS NOT NULL AND {end_col} > {start_c
|
|
| 595 |
|
| 596 |
return fixed_sql
|
| 597 |
|
| 598 |
-
def _format_relevant_schema(self, table_names: List[str]) -> str:
    """Render a compact plain-text schema description for the given tables.

    Each requested name is matched case-insensitively against the keys of
    ``self.schema``. Names that cannot be resolved are ignored, and a table
    requested more than once (including under different casing) is rendered
    only once, in first-seen order. When nothing resolves, a warning is
    logged through ``self._log`` and a hard-coded list of default tables is
    used instead; defaults missing from the schema are silently skipped.

    Args:
        table_names: Candidate table names; casing need not match the schema.

    Returns:
        For every rendered table, a ``Table: <name>`` line followed by a
        ``Columns: ...`` line, with blocks separated by one blank line and
        surrounding whitespace stripped. Only the first 10 columns of each
        table are listed. If ``self.schema`` is empty or unset, the fixed
        "No schema available." line (newline-terminated) is returned.
    """
    if not self.schema:
        return "No schema available.\n"

    # Map lower-cased names back to the exact spelling used by the schema.
    canonical_by_lower = {name.lower(): name for name in self.schema.keys()}

    # Resolve requested names, dropping unknown ones and repeats so the same
    # table is never emitted twice into the prompt.
    resolved = []
    seen = set()
    for requested in table_names:
        canonical = canonical_by_lower.get(requested.lower())
        if canonical is None or canonical in seen:
            continue
        seen.add(canonical)
        resolved.append(canonical)

    if not resolved:
        self._log("未識別到相關表格,使用預設核心表格。", "WARNING")
        resolved = ['TSR53SampleDescription', 'JobTimeline', 'JobsInProgress']

    blocks = []
    for table in resolved:
        # The fallback defaults are not guaranteed to exist in this schema.
        if table not in self.schema:
            continue

        columns = []
        # Only the first 10 columns are shown to keep the description short.
        for col in self.schema[table][:10]:
            col_name = col['name']
            col_type = col['type']
            # Flatten multi-line descriptions so each column stays on one line.
            col_desc = col.get('description', '').replace('\n', ' ')
            if col_desc:
                columns.append(f"{col_name} ({col_type}, {col_desc})")
            else:
                columns.append(f"{col_name} ({col_type})")

        blocks.append(f"Table: {table}\nColumns: {', '.join(columns)}")

    return "\n\n".join(blocks).strip()
|
| 637 |
-
|
| 638 |
def find_most_similar(self, question: str, top_k: int) -> List[Dict]:
|
| 639 |
"""使用 FAISS 快速檢索相似問題"""
|
| 640 |
if self.faiss_index is None or self.dataset is None:
|
|
|
|
| 414 |
|
| 415 |
return relevant_tables[:3] # 最多返回3個相關表格
|
| 416 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
def _validate_and_fix_sql(self, sql: str, question: str) -> str:
|
| 418 |
"""
|
| 419 |
+
(V10 / 組別查詢版)
|
| 420 |
+
一個全面、多層次的 SQL 驗證與生成引擎。
|
| 421 |
+
新增了對按「組別」(LabGroup) 進行查詢的意圖識別與模板覆寫。
|
| 422 |
"""
|
| 423 |
if not sql or not self.schema:
|
| 424 |
self._log("SQL 修正被跳過,因輸入為空或 schema 未載入。", "WARNING")
|
|
|
|
| 437 |
top_n_pattern = r"(?:top|前|最高|最大|最好)\s*(\d+)?\s*(?:個|名)?\s*([^ ]+?)\s*(?:的)?(?:營收|業績|貢獻|金額|sales|revenue)"
|
| 438 |
top_n_match = re.search(top_n_pattern, question, re.IGNORECASE)
|
| 439 |
|
| 440 |
+
# --- 意圖 2: 特定實體報告數查詢 ---
|
| 441 |
entity_match_data = None
|
| 442 |
ENTITY_TO_COLUMN_MAP = {
|
|
|
|
| 443 |
'申請廠商': 'T1.ApplicantName', '申請方': 'T1.ApplicantName', 'applicant': 'T1.ApplicantName',
|
| 444 |
'付款廠商': 'T1.InvoiceToName', '付款方': 'T1.InvoiceToName', 'invoiceto': 'T1.InvoiceToName',
|
| 445 |
'代理商': 'T1.AgentName', 'agent': 'T1.AgentName',
|
| 446 |
+
'買家': 'T1.BuyerName', 'buyer': 'T1.BuyerName', '客戶': 'T1.BuyerName',
|
| 447 |
}
|
|
|
|
|
|
|
| 448 |
for keyword, column in ENTITY_TO_COLUMN_MAP.items():
|
|
|
|
| 449 |
keyword_pos = q_lower.find(keyword)
|
| 450 |
if keyword_pos != -1:
|
|
|
|
|
|
|
| 451 |
pattern = fr"{re.escape(keyword)}[\s:;\'\"-]*([a-zA-Z0-9&.\s-]+?)(?:\s*的|\s+|$)"
|
| 452 |
match = re.search(pattern, question, re.IGNORECASE)
|
| 453 |
if match:
|
| 454 |
entity_match_data = {
|
| 455 |
+
"type": keyword, "name": match.group(1).strip(), "column": column
|
|
|
|
|
|
|
| 456 |
}
|
| 457 |
+
break
|
| 458 |
+
|
| 459 |
# --- 意圖 3: TAT 查詢 ---
|
| 460 |
is_tat_query = any(k in q_lower for k in ['平均', 'average']) and any(k in q_lower for k in ['時間', '時長', '多久', '天', 'tat', 'turnaround'])
|
| 461 |
|
| 462 |
+
# --- 意圖 4: 按特定組別查詢報告數 ---
|
| 463 |
+
lab_group_pattern = r"([A-Z]{1,2})\s*組"
|
| 464 |
+
lab_group_match = re.search(lab_group_pattern, question, re.IGNORECASE)
|
| 465 |
+
|
| 466 |
+
|
| 467 |
# --- 判斷邏輯: 依優先級進入對應的模板 ---
|
| 468 |
if top_n_match:
|
| 469 |
limit = top_n_match.group(1) or '10'
|
| 470 |
entity_keyword = top_n_match.group(2).lower()
|
|
|
|
| 471 |
column_name = next((v for k, v in ENTITY_TO_COLUMN_MAP.items() if k in entity_keyword), None)
|
| 472 |
|
| 473 |
if column_name:
|
|
|
|
| 487 |
entity_type = entity_match_data["type"]
|
| 488 |
entity_name = entity_match_data["name"]
|
| 489 |
column_name = entity_match_data["column"]
|
|
|
|
| 490 |
year_match = re.search(r'(\d{4})\s*年?', question)
|
| 491 |
year = year_match.group(1) if year_match else '2024'
|
|
|
|
| 492 |
status_condition = ""
|
| 493 |
if 'fail' in q_lower or '失敗' in q_lower:
|
| 494 |
status_condition = "AND T1.OverallRating = 'Fail'"
|
|
|
|
| 506 |
"""
|
| 507 |
fixes_applied.append(f"模板覆寫: 查詢 {entity_type}='{entity_name}' ({year}年) 的報告數")
|
| 508 |
|
| 509 |
+
elif lab_group_match and any(kw in q_lower for kw in ['多少', '幾份', '數量', 'count']):
|
| 510 |
+
lab_group = lab_group_match.group(1).upper()
|
| 511 |
+
year_match = re.search(r'(\d{4})\s*年', question)
|
| 512 |
+
year = year_match.group(1) if year_match else datetime.now().strftime('%Y')
|
| 513 |
+
month_match = re.search(r'(\d{1,2})\s*月', question)
|
| 514 |
+
month_condition = ""
|
| 515 |
+
month_str = ""
|
| 516 |
+
if month_match:
|
| 517 |
+
month = month_match.group(1).zfill(2)
|
| 518 |
+
month_condition = f"AND strftime('%m', T2.ReportAuthorization) = '{month}'"
|
| 519 |
+
month_str = f"{month}月"
|
| 520 |
+
|
| 521 |
+
self._log(f"🔄 檢測到查詢【{lab_group}組】報告數量的意圖,啟用模板。", "INFO")
|
| 522 |
+
fixed_sql = f"""
|
| 523 |
+
SELECT COUNT(DISTINCT T1.JobNo) AS report_count
|
| 524 |
+
FROM JobItemsInProgress AS T1
|
| 525 |
+
JOIN JobTimeline AS T2 ON T1.JobNo = T2.JobNo
|
| 526 |
+
WHERE T1.LabGroup = '{lab_group}'
|
| 527 |
+
AND T2.ReportAuthorization IS NOT NULL
|
| 528 |
+
AND strftime('%Y', T2.ReportAuthorization) = '{year}'
|
| 529 |
+
{month_condition};
|
| 530 |
+
"""
|
| 531 |
+
fixes_applied.append(f"模板覆寫: 查詢 {lab_group}組 在 {year}年{month_str} 的報告數")
|
| 532 |
+
|
| 533 |
elif is_tat_query:
|
| 534 |
start_col, end_col = 'T2.LabIn', 'T2.ReportAuthorization'
|
| 535 |
log_msg = "總流程平均時長 (天)"
|
|
|
|
| 608 |
|
| 609 |
return fixed_sql
|
| 610 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 611 |
def find_most_similar(self, question: str, top_k: int) -> List[Dict]:
|
| 612 |
"""使用 FAISS 快速檢索相似問題"""
|
| 613 |
if self.faiss_index is None or self.dataset is None:
|