Spaces:

Paul720810
/

Softline-SQL-Assistant

Sleeping

App Files Community

Paul720810 committed on Sep 5, 2025

Commit

7b554e2

verified ·

1 Parent(s): 31a9f3d

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -64

app.py CHANGED Viewed

@@ -414,16 +414,11 @@ class TextToSQLSystem:
         return relevant_tables[:3]  # 最多返回3個相關表格
-    # in class TextToSQLSystem:
-    # in class TextToSQLSystem:
-    # in class TextToSQLSystem:
     def _validate_and_fix_sql(self, sql: str, question: str) -> str:
         """
-        (V9 / 最終模式匹配版)
-        採用更穩健的分步正則匹配邏輯，確保意圖模板能被可靠觸發。
         """
         if not sql or not self.schema:
             self._log("SQL 修正被跳過，因輸入為空或 schema 未載入。", "WARNING")
@@ -442,41 +437,37 @@ class TextToSQLSystem:
         top_n_pattern = r"(?:top|前|最高|最大|最好)\s*(\d+)?\s*(?:個|名)?\s*([^ ]+?)\s*(?:的)?(?:營收|業績|貢獻|金額|sales|revenue)"
         top_n_match = re.search(top_n_pattern, question, re.IGNORECASE)
-        # --- 意圖 2: 特定實體報告數查詢 (採用新的、更穩健的匹配邏輯) ---
         entity_match_data = None
         ENTITY_TO_COLUMN_MAP = {
-            # 關鍵詞 (小寫，按可能的優先順序排列) -> 資料庫欄位
             '申請廠商': 'T1.ApplicantName', '申請方': 'T1.ApplicantName', 'applicant': 'T1.ApplicantName',
             '付款廠商': 'T1.InvoiceToName', '付款方': 'T1.InvoiceToName', 'invoiceto': 'T1.InvoiceToName',
             '代理商': 'T1.AgentName', 'agent': 'T1.AgentName',
-            '買家': 'T1.BuyerName', 'buyer': 'T1.BuyerName', '客戶': 'T1.BuyerName', # 將通用詞放在後面
         }
-        # **新的分步匹配邏輯**
         for keyword, column in ENTITY_TO_COLUMN_MAP.items():
-            # 步驟 1: 在問題中尋找關鍵詞
             keyword_pos = q_lower.find(keyword)
             if keyword_pos != -1:
-                # 步驟 2: 從關鍵詞之後的文本中提取實體名稱
-                # 正則: 捕獲關鍵詞後面跟著的、由字母/數字/&/./-組成的第一個詞組
                 pattern = fr"{re.escape(keyword)}[\s:;\'\"-]*([a-zA-Z0-9&.\s-]+?)(?:\s*的|\s+|$)"
                 match = re.search(pattern, question, re.IGNORECASE)
                 if match:
                     entity_match_data = {
-                        "type": keyword,
-                        "name": match.group(1).strip(),
-                        "column": column
                     }
-                    break # 找到第一個匹配的關鍵詞就停止，避免 "客戶" 覆蓋 "買家"
         # --- 意圖 3: TAT 查詢 ---
         is_tat_query = any(k in q_lower for k in ['平均', 'average']) and any(k in q_lower for k in ['時間', '時長', '多久', '天', 'tat', 'turnaround'])
         # --- 判斷邏輯: 依優先級進入對應的模板 ---
         if top_n_match:
             limit = top_n_match.group(1) or '10'
             entity_keyword = top_n_match.group(2).lower()
             column_name = next((v for k, v in ENTITY_TO_COLUMN_MAP.items() if k in entity_keyword), None)
             if column_name:
@@ -496,10 +487,8 @@ LIMIT {limit};
             entity_type = entity_match_data["type"]
             entity_name = entity_match_data["name"]
             column_name = entity_match_data["column"]
             year_match = re.search(r'(\d{4})\s*年?', question)
             year = year_match.group(1) if year_match else '2024'
             status_condition = ""
             if 'fail' in q_lower or '失敗' in q_lower:
                 status_condition = "AND T1.OverallRating = 'Fail'"
@@ -517,6 +506,30 @@ WHERE {column_name} = '{entity_name}'
 """
             fixes_applied.append(f"模板覆寫: 查詢 {entity_type}='{entity_name}' ({year}年) 的報告數")
         elif is_tat_query:
             start_col, end_col = 'T2.LabIn', 'T2.ReportAuthorization'
             log_msg = "總流程平均時長 (天)"
@@ -595,46 +608,6 @@ WHERE {start_col} IS NOT NULL AND {end_col} IS NOT NULL AND {end_col} > {start_c
         return fixed_sql
-    def _format_relevant_schema(self, table_names: List[str]) -> str:
-        """
-        生成一個簡化的、不易被模型錯誤模仿的 Schema 字符串。
-        """
-        if not self.schema:
-            return "No schema available.\n"
-        actual_table_names_map = {name.lower(): name for name in self.schema.keys()}
-        real_table_names = []
-        for table in table_names:
-            actual_name = actual_table_names_map.get(table.lower())
-            if actual_name:
-                real_table_names.append(actual_name)
-            elif table in self.schema:
-                real_table_names.append(table)
-        if not real_table_names:
-            self._log("未識別到相關表格，使用預設核心表格。", "WARNING")
-            real_table_names = ['TSR53SampleDescription', 'JobTimeline', 'JobsInProgress']
-        formatted = ""
-        for table in real_table_names:
-            if table in self.schema:
-                # 使用簡單的 "Table: ..." 和 "Columns: ..." 格式
-                formatted += f"Table: {table}\n"
-                cols_str = []
-                # 只顯示前 10 個關鍵欄位
-                for col in self.schema[table][:10]:
-                    col_name = col['name']
-                    col_type = col['type']
-                    col_desc = col.get('description', '').replace('\n', ' ')
-                    # 將描述信息放在括號裡
-                    if col_desc:
-                        cols_str.append(f"{col_name} ({col_type}, {col_desc})")
-                    else:
-                        cols_str.append(f"{col_name} ({col_type})")
-                formatted += f"Columns: {', '.join(cols_str)}\n\n"
-        return formatted.strip()
     def find_most_similar(self, question: str, top_k: int) -> List[Dict]:
         """使用 FAISS 快速檢索相似問題"""
         if self.faiss_index is None or self.dataset is None:

         return relevant_tables[:3]  # 最多返回3個相關表格
     def _validate_and_fix_sql(self, sql: str, question: str) -> str:
         """
+        (V10 / 組別查詢版)
+        一個全面、多層次的 SQL 驗證與生成引擎。
+        新增了對按「組別」(LabGroup) 進行查詢的意圖識別與模板覆寫。
         """
         if not sql or not self.schema:
             self._log("SQL 修正被跳過，因輸入為空或 schema 未載入。", "WARNING")
         top_n_pattern = r"(?:top|前|最高|最大|最好)\s*(\d+)?\s*(?:個|名)?\s*([^ ]+?)\s*(?:的)?(?:營收|業績|貢獻|金額|sales|revenue)"
         top_n_match = re.search(top_n_pattern, question, re.IGNORECASE)
+        # --- 意圖 2: 特定實體報告數查詢 ---
         entity_match_data = None
         ENTITY_TO_COLUMN_MAP = {
             '申請廠商': 'T1.ApplicantName', '申請方': 'T1.ApplicantName', 'applicant': 'T1.ApplicantName',
             '付款廠商': 'T1.InvoiceToName', '付款方': 'T1.InvoiceToName', 'invoiceto': 'T1.InvoiceToName',
             '代理商': 'T1.AgentName', 'agent': 'T1.AgentName',
+            '買家': 'T1.BuyerName', 'buyer': 'T1.BuyerName', '客戶': 'T1.BuyerName',
         }
         for keyword, column in ENTITY_TO_COLUMN_MAP.items():
             keyword_pos = q_lower.find(keyword)
             if keyword_pos != -1:
                 pattern = fr"{re.escape(keyword)}[\s:;\'\"-]*([a-zA-Z0-9&.\s-]+?)(?:\s*的|\s+|$)"
                 match = re.search(pattern, question, re.IGNORECASE)
                 if match:
                     entity_match_data = {
+                        "type": keyword, "name": match.group(1).strip(), "column": column
                     }
+                    break
         # --- 意圖 3: TAT 查詢 ---
         is_tat_query = any(k in q_lower for k in ['平均', 'average']) and any(k in q_lower for k in ['時間', '時長', '多久', '天', 'tat', 'turnaround'])
+        # --- 意圖 4: 按特定組別查詢報告數 ---
+        lab_group_pattern = r"([A-Z]{1,2})\s*組"
+        lab_group_match = re.search(lab_group_pattern, question, re.IGNORECASE)
         # --- 判斷邏輯: 依優先級進入對應的模板 ---
         if top_n_match:
             limit = top_n_match.group(1) or '10'
             entity_keyword = top_n_match.group(2).lower()
             column_name = next((v for k, v in ENTITY_TO_COLUMN_MAP.items() if k in entity_keyword), None)
             if column_name:
             entity_type = entity_match_data["type"]
             entity_name = entity_match_data["name"]
             column_name = entity_match_data["column"]
             year_match = re.search(r'(\d{4})\s*年?', question)
             year = year_match.group(1) if year_match else '2024'
             status_condition = ""
             if 'fail' in q_lower or '失敗' in q_lower:
                 status_condition = "AND T1.OverallRating = 'Fail'"
 """
             fixes_applied.append(f"模板覆寫: 查詢 {entity_type}='{entity_name}' ({year}年) 的報告數")
+        elif lab_group_match and any(kw in q_lower for kw in ['多少', '幾份', '數量', 'count']):
+            lab_group = lab_group_match.group(1).upper()
+            year_match = re.search(r'(\d{4})\s*年', question)
+            year = year_match.group(1) if year_match else datetime.now().strftime('%Y')
+            month_match = re.search(r'(\d{1,2})\s*月', question)
+            month_condition = ""
+            month_str = ""
+            if month_match:
+                month = month_match.group(1).zfill(2)
+                month_condition = f"AND strftime('%m', T2.ReportAuthorization) = '{month}'"
+                month_str = f"{month}月"
+            self._log(f"🔄 檢測到查詢【{lab_group}組】報告數量的意圖，啟用模板。", "INFO")
+            fixed_sql = f"""
+SELECT COUNT(DISTINCT T1.JobNo) AS report_count
+FROM JobItemsInProgress AS T1
+JOIN JobTimeline AS T2 ON T1.JobNo = T2.JobNo
+WHERE T1.LabGroup = '{lab_group}'
+  AND T2.ReportAuthorization IS NOT NULL
+  AND strftime('%Y', T2.ReportAuthorization) = '{year}'
+  {month_condition};
+"""
+            fixes_applied.append(f"模板覆寫: 查詢 {lab_group}組 在 {year}年{month_str} 的報告數")
         elif is_tat_query:
             start_col, end_col = 'T2.LabIn', 'T2.ReportAuthorization'
             log_msg = "總流程平均時長 (天)"
         return fixed_sql
     def find_most_similar(self, question: str, top_k: int) -> List[Dict]:
         """使用 FAISS 快速檢索相似問題"""
         if self.faiss_index is None or self.dataset is None: