Paul720810 committed on
Commit
7b554e2
·
verified ·
1 Parent(s): 31a9f3d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -64
app.py CHANGED
@@ -414,16 +414,11 @@ class TextToSQLSystem:
414
 
415
  return relevant_tables[:3] # 最多返回3個相關表格
416
 
417
- # in class TextToSQLSystem:
418
-
419
- # in class TextToSQLSystem:
420
-
421
- # in class TextToSQLSystem:
422
-
423
  def _validate_and_fix_sql(self, sql: str, question: str) -> str:
424
  """
425
- (V9 / 最終模式匹配版)
426
- 採用更穩健的分步正則匹配邏輯,確保意圖模板能被可靠觸發。
 
427
  """
428
  if not sql or not self.schema:
429
  self._log("SQL 修正被跳過,因輸入為空或 schema 未載入。", "WARNING")
@@ -442,41 +437,37 @@ class TextToSQLSystem:
442
  top_n_pattern = r"(?:top|前|最高|最大|最好)\s*(\d+)?\s*(?:個|名)?\s*([^ ]+?)\s*(?:的)?(?:營收|業績|貢獻|金額|sales|revenue)"
443
  top_n_match = re.search(top_n_pattern, question, re.IGNORECASE)
444
 
445
- # --- 意圖 2: 特定實體報告數查詢 (採用新的、更穩健的匹配邏輯) ---
446
  entity_match_data = None
447
  ENTITY_TO_COLUMN_MAP = {
448
- # 關鍵詞 (小寫,按可能的優先順序排列) -> 資料庫欄位
449
  '申請廠商': 'T1.ApplicantName', '申請方': 'T1.ApplicantName', 'applicant': 'T1.ApplicantName',
450
  '付款廠商': 'T1.InvoiceToName', '付款方': 'T1.InvoiceToName', 'invoiceto': 'T1.InvoiceToName',
451
  '代理商': 'T1.AgentName', 'agent': 'T1.AgentName',
452
- '買家': 'T1.BuyerName', 'buyer': 'T1.BuyerName', '客戶': 'T1.BuyerName', # 將通用詞放在後面
453
  }
454
-
455
- # **新的分步匹配邏輯**
456
  for keyword, column in ENTITY_TO_COLUMN_MAP.items():
457
- # 步驟 1: 在問題中尋找關鍵詞
458
  keyword_pos = q_lower.find(keyword)
459
  if keyword_pos != -1:
460
- # 步驟 2: 從關鍵詞之後的文本中提取實體名稱
461
- # 正則: 捕獲關鍵詞後面跟著的、由字母/數字/&/./-組成的第一個詞組
462
  pattern = fr"{re.escape(keyword)}[\s:;\'\"-]*([a-zA-Z0-9&.\s-]+?)(?:\s*的|\s+|$)"
463
  match = re.search(pattern, question, re.IGNORECASE)
464
  if match:
465
  entity_match_data = {
466
- "type": keyword,
467
- "name": match.group(1).strip(),
468
- "column": column
469
  }
470
- break # 找到第一個匹配的關鍵詞就停止,避免 "客戶" 覆蓋 "買家"
471
-
472
  # --- 意圖 3: TAT 查詢 ---
473
  is_tat_query = any(k in q_lower for k in ['平均', 'average']) and any(k in q_lower for k in ['時間', '時長', '多久', '天', 'tat', 'turnaround'])
474
 
 
 
 
 
 
475
  # --- 判斷邏輯: 依優先級進入對應的模板 ---
476
  if top_n_match:
477
  limit = top_n_match.group(1) or '10'
478
  entity_keyword = top_n_match.group(2).lower()
479
-
480
  column_name = next((v for k, v in ENTITY_TO_COLUMN_MAP.items() if k in entity_keyword), None)
481
 
482
  if column_name:
@@ -496,10 +487,8 @@ LIMIT {limit};
496
  entity_type = entity_match_data["type"]
497
  entity_name = entity_match_data["name"]
498
  column_name = entity_match_data["column"]
499
-
500
  year_match = re.search(r'(\d{4})\s*年?', question)
501
  year = year_match.group(1) if year_match else '2024'
502
-
503
  status_condition = ""
504
  if 'fail' in q_lower or '失敗' in q_lower:
505
  status_condition = "AND T1.OverallRating = 'Fail'"
@@ -517,6 +506,30 @@ WHERE {column_name} = '{entity_name}'
517
  """
518
  fixes_applied.append(f"模板覆寫: 查詢 {entity_type}='{entity_name}' ({year}年) 的報告數")
519
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
520
  elif is_tat_query:
521
  start_col, end_col = 'T2.LabIn', 'T2.ReportAuthorization'
522
  log_msg = "總流程平均時長 (天)"
@@ -595,46 +608,6 @@ WHERE {start_col} IS NOT NULL AND {end_col} IS NOT NULL AND {end_col} > {start_c
595
 
596
  return fixed_sql
597
 
598
- def _format_relevant_schema(self, table_names: List[str]) -> str:
599
- """
600
- 生成一個簡化的、不易被模型錯誤模仿的 Schema 字符串。
601
- """
602
- if not self.schema:
603
- return "No schema available.\n"
604
-
605
- actual_table_names_map = {name.lower(): name for name in self.schema.keys()}
606
- real_table_names = []
607
- for table in table_names:
608
- actual_name = actual_table_names_map.get(table.lower())
609
- if actual_name:
610
- real_table_names.append(actual_name)
611
- elif table in self.schema:
612
- real_table_names.append(table)
613
-
614
- if not real_table_names:
615
- self._log("未識別到相關表格,使用預設核心表格。", "WARNING")
616
- real_table_names = ['TSR53SampleDescription', 'JobTimeline', 'JobsInProgress']
617
-
618
- formatted = ""
619
- for table in real_table_names:
620
- if table in self.schema:
621
- # 使用簡單的 "Table: ..." 和 "Columns: ..." 格式
622
- formatted += f"Table: {table}\n"
623
- cols_str = []
624
- # 只顯示前 10 個關鍵欄位
625
- for col in self.schema[table][:10]:
626
- col_name = col['name']
627
- col_type = col['type']
628
- col_desc = col.get('description', '').replace('\n', ' ')
629
- # 將描述信息放在括號裡
630
- if col_desc:
631
- cols_str.append(f"{col_name} ({col_type}, {col_desc})")
632
- else:
633
- cols_str.append(f"{col_name} ({col_type})")
634
- formatted += f"Columns: {', '.join(cols_str)}\n\n"
635
-
636
- return formatted.strip()
637
-
638
  def find_most_similar(self, question: str, top_k: int) -> List[Dict]:
639
  """使用 FAISS 快速檢索相似問題"""
640
  if self.faiss_index is None or self.dataset is None:
 
414
 
415
  return relevant_tables[:3] # 最多返回3個相關表格
416
 
 
 
 
 
 
 
417
  def _validate_and_fix_sql(self, sql: str, question: str) -> str:
418
  """
419
+ (V10 / 組別查詢版)
420
+ 一個全面、多層次的 SQL 驗證與生成引擎。
421
+ 新增了對按「組別」(LabGroup) 進行查詢的意圖識別與模板覆寫。
422
  """
423
  if not sql or not self.schema:
424
  self._log("SQL 修正被跳過,因輸入為空或 schema 未載入。", "WARNING")
 
437
  top_n_pattern = r"(?:top|前|最高|最大|最好)\s*(\d+)?\s*(?:個|名)?\s*([^ ]+?)\s*(?:的)?(?:營收|業績|貢獻|金額|sales|revenue)"
438
  top_n_match = re.search(top_n_pattern, question, re.IGNORECASE)
439
 
440
+ # --- 意圖 2: 特定實體報告數查詢 ---
441
  entity_match_data = None
442
  ENTITY_TO_COLUMN_MAP = {
 
443
  '申請廠商': 'T1.ApplicantName', '申請方': 'T1.ApplicantName', 'applicant': 'T1.ApplicantName',
444
  '付款廠商': 'T1.InvoiceToName', '付款方': 'T1.InvoiceToName', 'invoiceto': 'T1.InvoiceToName',
445
  '代理商': 'T1.AgentName', 'agent': 'T1.AgentName',
446
+ '買家': 'T1.BuyerName', 'buyer': 'T1.BuyerName', '客戶': 'T1.BuyerName',
447
  }
 
 
448
  for keyword, column in ENTITY_TO_COLUMN_MAP.items():
 
449
  keyword_pos = q_lower.find(keyword)
450
  if keyword_pos != -1:
 
 
451
  pattern = fr"{re.escape(keyword)}[\s:;\'\"-]*([a-zA-Z0-9&.\s-]+?)(?:\s*的|\s+|$)"
452
  match = re.search(pattern, question, re.IGNORECASE)
453
  if match:
454
  entity_match_data = {
455
+ "type": keyword, "name": match.group(1).strip(), "column": column
 
 
456
  }
457
+ break
458
+
459
  # --- 意圖 3: TAT 查詢 ---
460
  is_tat_query = any(k in q_lower for k in ['平均', 'average']) and any(k in q_lower for k in ['時間', '時長', '多久', '天', 'tat', 'turnaround'])
461
 
462
+ # --- 意圖 4: 按特定組別查詢報告數 ---
463
+ lab_group_pattern = r"([A-Z]{1,2})\s*組"
464
+ lab_group_match = re.search(lab_group_pattern, question, re.IGNORECASE)
465
+
466
+
467
  # --- 判斷邏輯: 依優先級進入對應的模板 ---
468
  if top_n_match:
469
  limit = top_n_match.group(1) or '10'
470
  entity_keyword = top_n_match.group(2).lower()
 
471
  column_name = next((v for k, v in ENTITY_TO_COLUMN_MAP.items() if k in entity_keyword), None)
472
 
473
  if column_name:
 
487
  entity_type = entity_match_data["type"]
488
  entity_name = entity_match_data["name"]
489
  column_name = entity_match_data["column"]
 
490
  year_match = re.search(r'(\d{4})\s*年?', question)
491
  year = year_match.group(1) if year_match else '2024'
 
492
  status_condition = ""
493
  if 'fail' in q_lower or '失敗' in q_lower:
494
  status_condition = "AND T1.OverallRating = 'Fail'"
 
506
  """
507
  fixes_applied.append(f"模板覆寫: 查詢 {entity_type}='{entity_name}' ({year}年) 的報告數")
508
 
509
+ elif lab_group_match and any(kw in q_lower for kw in ['多少', '幾份', '數量', 'count']):
510
+ lab_group = lab_group_match.group(1).upper()
511
+ year_match = re.search(r'(\d{4})\s*年', question)
512
+ year = year_match.group(1) if year_match else datetime.now().strftime('%Y')
513
+ month_match = re.search(r'(\d{1,2})\s*月', question)
514
+ month_condition = ""
515
+ month_str = ""
516
+ if month_match:
517
+ month = month_match.group(1).zfill(2)
518
+ month_condition = f"AND strftime('%m', T2.ReportAuthorization) = '{month}'"
519
+ month_str = f"{month}月"
520
+
521
+ self._log(f"🔄 檢測到查詢【{lab_group}組】報告數量的意圖,啟用模板。", "INFO")
522
+ fixed_sql = f"""
523
+ SELECT COUNT(DISTINCT T1.JobNo) AS report_count
524
+ FROM JobItemsInProgress AS T1
525
+ JOIN JobTimeline AS T2 ON T1.JobNo = T2.JobNo
526
+ WHERE T1.LabGroup = '{lab_group}'
527
+ AND T2.ReportAuthorization IS NOT NULL
528
+ AND strftime('%Y', T2.ReportAuthorization) = '{year}'
529
+ {month_condition};
530
+ """
531
+ fixes_applied.append(f"模板覆寫: 查詢 {lab_group}組 在 {year}年{month_str} 的報告數")
532
+
533
  elif is_tat_query:
534
  start_col, end_col = 'T2.LabIn', 'T2.ReportAuthorization'
535
  log_msg = "總流程平均時長 (天)"
 
608
 
609
  return fixed_sql
610
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
611
  def find_most_similar(self, question: str, top_k: int) -> List[Dict]:
612
  """使用 FAISS 快速檢索相似問題"""
613
  if self.faiss_index is None or self.dataset is None: