Spaces:

Paul720810
/

Softline-SQL-Assistant

Sleeping

App Files Files Community

Paul720810 committed on Sep 11, 2025

Commit

1021a18

verified ·

1 Parent(s): f452661

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -66

app.py CHANGED Viewed

@@ -11,6 +11,7 @@ from llama_cpp import Llama
 from typing import List, Dict, Tuple, Optional
 import faiss
 from functools import lru_cache
 # 使用 transformers 替代 sentence-transformers
 from transformers import AutoModel, AutoTokenizer
@@ -44,48 +45,56 @@ def format_log(message: str, level: str = "INFO") -> str:
     return f"[{get_current_time()}] [{level.upper()}] {message}"
 def parse_sql_from_response(response_text: str) -> Optional[str]:
-    """從模型輸出提取 SQL，增強版"""
     if not response_text:
         return None
-    # 清理回應文本
-    response_text = response_text.strip()
-    # 1. 先找 ```sql ... ```
-    match = re.search(r"```sql\s*\n(.*?)\n```", response_text, re.DOTALL | re.IGNORECASE)
-    if match:
-        return match.group(1).strip()
-    # 2. 找任何 ``` 包圍的內容
-    match = re.search(r"```\s*\n?(.*?)\n?```", response_text, re.DOTALL)
-    if match:
-        sql_candidate = match.group(1).strip()
-        if sql_candidate.upper().startswith('SELECT'):
-            return sql_candidate
-    # 3. 找 SQL 語句（更寬鬆的匹配）
-    match = re.search(r"(SELECT\s+.*?;)", response_text, re.DOTALL | re.IGNORECASE)
-    if match:
-        return match.group(1).strip()
-    # 4. 找沒有分號的 SQL
-    match = re.search(r"(SELECT\s+.*?)(?=\n\n|\n```|$|\n[^,\s])", response_text, re.DOTALL | re.IGNORECASE)
-    if match:
-        sql = match.group(1).strip()
-        if not sql.endswith(';'):
-            sql += ';'
-        return sql
-    # 5. 如果包含 SELECT，嘗試提取整行
-    if 'SELECT' in response_text.upper():
-        lines = response_text.split('\n')
-        for line in lines:
-            line = line.strip()
-            if line.upper().startswith('SELECT'):
-                if not line.endswith(';'):
-                    line += ';'
-                return line
     return None
 # ==================== Text-to-SQL 核心類 ====================
@@ -228,14 +237,14 @@ class TextToSQLSystem:
             return self._generate_fallback_sql(prompt)
         try:
             output = self.llm(
                 prompt,
                 max_tokens=350,
                 temperature=0.05,
                 top_p=0.9,
                 echo=False,
-                # --- 將 stop 參數加回來 ---
-                stop=["```", ";", "\n\n", "</s>"],
             )
             self._log(f"🧠 模型原始輸出 (Raw Output): {output}", "DEBUG")
@@ -258,7 +267,7 @@ class TextToSQLSystem:
                 # --- 清理邏輯結束 ---
             else:
                 self._log("❌ 模型的原始輸出格式不正確或為空。", "ERROR")
-                return ""
         except Exception as e:
             self._log(f"❌ 模型生成過程中發生嚴重錯誤: {e}", "CRITICAL")
@@ -505,7 +514,7 @@ class TextToSQLSystem:
         - 根據問題是關於「報告」還是「測試項目」來智能地決定計數目標。
         """
         q_lower = question.lower()
         # ==============================================================================
         #  第零層：統一實體識別引擎 (Unified Entity Recognition Engine)
         # ==============================================================================
@@ -517,40 +526,40 @@ class TextToSQLSystem:
             {'pattern': r"(申請方|申请方|申請廠商|申请厂商|applicant)\s*(?:id|代號|代碼|代号|代码)\s*'\"?\b([A-Z]\d{4}[A-Z])\b'\"?", 'column': 'sd.ApplicantID', 'type': '申請方ID'},
             {'pattern': r"(付款方|付款厂商|invoiceto)\s*(?:id|代號|代碼|代号|代码)\s*'\"?\b([A-Z]\d{4}[A-Z])\b'\"?", 'column': 'sd.InvoiceToID', 'type': '付款方ID'},
             {'pattern': r"(代理商|agent)\s*(?:id|代號|代碼|代号|代码)\s*'\"?\b([A-Z]\d{4}[A-Z])\b'\"?", 'column': 'sd.AgentID', 'type': '代理商ID'},
             # 模式2: 匹配 "類型 + 名稱" - (簡化了模式，使其更穩健)
             {'pattern': r"(買家|买家|buyer|客戶)\s+([a-zA-Z0-9&.-]+)", 'column': 'sd.BuyerName', 'type': '買家'},
             {'pattern': r"(申請方|申请方|申請廠商|申请厂商|applicant)\s+([a-zA-Z0-9&.-]+)", 'column': 'sd.ApplicantName', 'type': '申請方'},
             {'pattern': r"(付款方|付款厂商|invoiceto)\s+([a-zA-Z0-9&.-]+)", 'column': 'sd.InvoiceToName', 'type': '付款方'},
             {'pattern': r"(代理商|agent)\s+([a-zA-Z0-9&.-]+)", 'column': 'sd.AgentName', 'type': '代理商'},
             # 模式3: 单独匹配一个 ID - (保持不變)
             {'pattern': r"\b([A-Z]\d{4}[A-Z])\b", 'column': 'sd.ApplicantID', 'type': 'ID'}
         ]
         for p in entity_patterns:
             match = re.search(p['pattern'], question, re.IGNORECASE)
             if match:
                 entity_value = match.group(2) if len(match.groups()) > 1 else match.group(1)
                 entity_match_data = {
-                    "type": p['type'],
                     "name": entity_value.strip().upper(),
                     "column": p['column']
                 }
                 break
         # ==============================================================================
         #  第一層：模組化意圖偵測與動態SQL組合
         # ==============================================================================
         intents = {}
         sql_components = {
             'select': [], 'from': "", 'joins': [], 'where': [],
             'group_by': [], 'order_by': [], 'log_parts': []
         }
         # --- 運行一系列獨立的意圖偵測器 ---
         # 偵測器 2.1: 核心動作意圖
         if any(kw in q_lower for kw in ['幾份', '多少', '數量', '總數', 'how many', 'count']):
             intents['action'] = 'count'
@@ -566,7 +575,7 @@ class TextToSQLSystem:
             sql_components['select'].append("jt.JobNo, jt.ReportAuthorization")
             sql_components['order_by'].append("jt.ReportAuthorization DESC")
             sql_components['log_parts'].append("報告列表")
         # 偵測器 2.2: 時間意圖
         year_match = re.search(r'(\d{4})\s*年?', question)
         month_match = re.search(r'(\d{1,2})\s*月', question)
@@ -578,7 +587,7 @@ class TextToSQLSystem:
             month = month_match.group(1).zfill(2)
             sql_components['where'].append(f"strftime('%m', jt.ReportAuthorization) = '{month}'")
             sql_components['log_parts'].append(f"{month}月")
         # 偵測器 2.3: 實體意圖
         if entity_match_data:
             if "TSR53SampleDescription" not in " ".join(sql_components['joins']):
@@ -590,7 +599,7 @@ class TextToSQLSystem:
             sql_components['log_parts'].append(entity_match_data["type"] + ":" + entity_name)
             if intents.get('action') == 'list':
                 sql_components['select'].append("sd.BuyerName")
         # 偵測器 2.4: 評級意圖
         if 'fail' in q_lower or '失敗' in q_lower:
             if "TSR53SampleDescription" not in " ".join(sql_components['joins']):
@@ -602,7 +611,7 @@ class TextToSQLSystem:
                  sql_components['joins'].append("JOIN TSR53SampleDescription AS sd ON jt.JobNo = sd.JobNo")
             sql_components['where'].append("sd.OverallRating = 'Pass'")
             sql_components['log_parts'].append("Pass")
         # 偵測器 2.5: 實驗組 (LabGroup) 意圖 (帶有別名映射)
         lab_group_mapping = {'A': 'TA', 'B': 'TB', 'C': 'TC', 'D': 'TD', 'E': 'TE', 'Y': 'TY'}
         lab_group_match = re.search(r'([A-Z]{1,2})組', question, re.IGNORECASE)
@@ -612,54 +621,54 @@ class TextToSQLSystem:
             sql_components['joins'].append("JOIN JobItemsInProgress AS jip ON jt.JobNo = jip.JobNo")
             sql_components['where'].append(f"jip.LabGroup = '{db_lab_group}'")
             sql_components['log_parts'].append(f"{user_input_group}組(->{db_lab_group})")
         # --- 3. 判斷是否觸發了模板，並動態組合 SQL ---
         if 'action' in intents:
             sql_components['from'] = "FROM JobTimeline AS jt"
             # 只要有任何篩選條件，就加上報告已授權的基礎限制
             if sql_components['where']:
                  sql_components['where'].insert(0, "jt.ReportAuthorization IS NOT NULL")
             select_clause = "SELECT " + ", ".join(sorted(list(set(sql_components['select']))))
             from_clause = sql_components['from']
             joins_clause = " ".join(sql_components['joins'])
             where_clause = "WHERE " + " AND ".join(sql_components['where']) if sql_components['where'] else ""
             orderby_clause = "ORDER BY " + ", ".join(sql_components['order_by']) if sql_components['order_by'] else ""
             template_sql = f"{select_clause} {from_clause} {joins_clause} {where_clause} {orderby_clause};"
             query_log = " ".join(sql_components['log_parts'])
             self._log(f"🔄 偵測到組合意圖【{query_log}】，啟用動態模板。", "INFO")
             return self._finalize_sql(template_sql, f"模板覆寫: {query_log} 查詢")
         # ==============================================================================
         #  第二层：AI 生成修正流程 (Fallback)
         # ==============================================================================
         self._log("未觸發任何模板，嘗試解析並修正 AI 輸出...", "INFO")
         parsed_sql = parse_sql_from_response(raw_response)
         if not parsed_sql:
             self._log(f"❌ 未能從模型回應中解析出任何 SQL。原始回應: {raw_response}", "ERROR")
             return None, f"無法解析SQL。原始回應:\n{raw_response}"
         self._log(f"📊 解析出的原始 SQL: {parsed_sql}", "DEBUG")
         fixed_sql = " " + parsed_sql.strip() + " "
         fixes_applied_fallback = []
         dialect_corrections = {r'YEAR\s*\(([^)]+)\)': r"strftime('%Y', \1)"}
         for pattern, replacement in dialect_corrections.items():
             if re.search(pattern, fixed_sql, re.IGNORECASE):
                 fixed_sql = re.sub(pattern, replacement, fixed_sql, flags=re.IGNORECASE)
                 fixes_applied_fallback.append(f"修正方言: {pattern}")
         schema_corrections = {'TSR53Report':'TSR53SampleDescription', 'TSR53InvoiceReportNo':'JobNo', 'TSR53ReportNo':'JobNo', 'TSR53InvoiceNo':'JobNo', 'TSR53InvoiceCreditNoteNo':'InvoiceCreditNoteNo', 'TSR53InvoiceLocalAmount':'LocalAmount', 'Status':'OverallRating', 'ReportStatus':'OverallRating'}
         for wrong, correct in schema_corrections.items():
             pattern = r'\b' + re.escape(wrong) + r'\b'
             if re.search(pattern, fixed_sql, re.IGNORECASE):
                 fixed_sql = re.sub(pattern, correct, fixed_sql, flags=re.IGNORECASE)
                 fixes_applied_fallback.append(f"映射 Schema: '{wrong}' -> '{correct}'")
         log_msg = "AI 生成並成功修正" if fixes_applied_fallback else "AI 生成且無需修正"
         return self._finalize_sql(fixed_sql, log_msg)
@@ -830,7 +839,7 @@ Your single SQLite query response:
             # 2. 建立提示詞
             self._log("📝 建立 Prompt...")
             prompt = self._build_prompt(question, examples)
             # --- 新增：如果是第二次嘗試，加入修正指令 ---
             if attempt > 0:
                 correction_prompt = "\nYour previous attempt failed because you did not provide a valid SQL query. REMEMBER: ONLY output the SQL code inside a ```sql block. DO NOT write comments or explanations.\nSQL:\n```sql\n"

 from typing import List, Dict, Tuple, Optional
 import faiss
 from functools import lru_cache
+import re
 # 使用 transformers 替代 sentence-transformers
 from transformers import AutoModel, AutoTokenizer
     return f"[{get_current_time()}] [{level.upper()}] {message}"
 def parse_sql_from_response(response_text: str) -> Optional[str]:
     """Extract a single SELECT statement from a model response.

     Fenced code blocks (```sql ... ``` or bare ``` ... ```) that mention
     SELECT are tried first, in order of appearance. When the response has
     no such block, everything from the first ``SELECT`` keyword to the end
     of the text is used as the only candidate.

     Each candidate is normalised by dropping blank lines, whole-line
     ``--`` / ``#`` comments and trailing inline ``--`` comments, joining
     the remaining lines with single spaces, keeping only the first
     statement, and terminating it with exactly one semicolon.

     Args:
         response_text: Raw text produced by the model; may be empty.

     Returns:
         The cleaned one-line statement ending in ``;``, or ``None`` when
         no candidate starts with SELECT after cleaning.
     """
     if not response_text:
         return None
     text = response_text.strip()

     def split_unquoted(fragment: str, marker: str):
         """Split ``fragment`` at the first ``marker`` outside quotes.

         Returns ``(prefix, balanced)``. ``prefix`` is the text before the
         marker, or the whole fragment when no unquoted marker exists.
         ``balanced`` is False when the scan ended inside an open quote.
         """
         quote = None
         for idx, ch in enumerate(fragment):
             if quote:
                 # A doubled quote ('') closes and immediately reopens,
                 # so SQL-style escaping needs no special case here.
                 if ch == quote:
                     quote = None
             elif ch in "'\"":
                 quote = ch
             elif fragment.startswith(marker, idx):
                 return fragment[:idx], True
         return fragment, quote is None

     def clean(sql_raw: str) -> str:
         """Normalise one candidate; return '' if it is not a SELECT."""
         kept = []
         # Leftover fence markers are removed up front so a line holding
         # only ``` is discarded as blank.
         for raw_line in sql_raw.replace('```', '').split('\n'):
             line = raw_line.strip()
             if not line or line.startswith('#'):
                 continue
             # Inline comments must go BEFORE the lines are joined;
             # otherwise "-- note" would comment out every following line
             # once the query is flattened onto a single line.
             line = split_unquoted(line, '--')[0].strip()
             if line:
                 kept.append(line)
         sql_clean = ' '.join(kept).strip()

         # Keep only the first statement, ignoring semicolons that sit
         # inside string literals or quoted identifiers.
         statement, balanced = split_unquoted(sql_clean, ';')
         if not balanced:
             # Unterminated quote (e.g. an apostrophe in trailing prose):
             # fall back to the plain first-semicolon cut.
             statement = sql_clean.split(';')[0]
         statement = statement.strip()

         if not statement.lower().startswith('select'):
             return ''
         return statement + ';'

     # 1) Prefer fenced code blocks that mention SELECT.
     code_blocks = re.findall(r"```(?:sql)?\s*\n([\s\S]*?)```", text, flags=re.IGNORECASE)
     candidates = [block.strip() for block in code_blocks if 'select' in block.lower()]

     # 2) Otherwise take the text from the first SELECT keyword onwards;
     #    clean() decides where the statement actually ends.
     if not candidates:
         found = re.search(r"SELECT\b", text, flags=re.IGNORECASE)
         if found:
             candidates.append(text[found.start():].strip())

     for cand in candidates:
         cleaned = clean(cand)
         if cleaned:
             return cleaned
     return None
 # ==================== Text-to-SQL 核心類 ====================
             return self._generate_fallback_sql(prompt)
         try:
+            # 重要: 移除 ";" 讓模型可輸出完整查詢（包含結尾分號前所有內容）
             output = self.llm(
                 prompt,
                 max_tokens=350,
                 temperature=0.05,
                 top_p=0.9,
                 echo=False,
+                stop=["```", "\n\n", "</s>"]
             )
             self._log(f"🧠 模型原始輸出 (Raw Output): {output}", "DEBUG")
                 # --- 清理邏輯結束 ---
             else:
                 self._log("❌ 模型的原始輸出格式不正確或為空。", "ERROR")
+                return ""
         except Exception as e:
             self._log(f"❌ 模型生成過程中發生嚴重錯誤: {e}", "CRITICAL")
         - 根據問題是關於「報告」還是「測試項目」來智能地決定計數目標。
         """
         q_lower = question.lower()
         # ==============================================================================
         #  第零層：統一實體識別引擎 (Unified Entity Recognition Engine)
         # ==============================================================================
             {'pattern': r"(申請方|申请方|申請廠商|申请厂商|applicant)\s*(?:id|代號|代碼|代号|代码)\s*'\"?\b([A-Z]\d{4}[A-Z])\b'\"?", 'column': 'sd.ApplicantID', 'type': '申請方ID'},
             {'pattern': r"(付款方|付款厂商|invoiceto)\s*(?:id|代號|代碼|代号|代码)\s*'\"?\b([A-Z]\d{4}[A-Z])\b'\"?", 'column': 'sd.InvoiceToID', 'type': '付款方ID'},
             {'pattern': r"(代理商|agent)\s*(?:id|代號|代碼|代号|代码)\s*'\"?\b([A-Z]\d{4}[A-Z])\b'\"?", 'column': 'sd.AgentID', 'type': '代理商ID'},
             # 模式2: 匹配 "類型 + 名稱" - (簡化了模式，使其更穩健)
             {'pattern': r"(買家|买家|buyer|客戶)\s+([a-zA-Z0-9&.-]+)", 'column': 'sd.BuyerName', 'type': '買家'},
             {'pattern': r"(申請方|申请方|申請廠商|申请厂商|applicant)\s+([a-zA-Z0-9&.-]+)", 'column': 'sd.ApplicantName', 'type': '申請方'},
             {'pattern': r"(付款方|付款厂商|invoiceto)\s+([a-zA-Z0-9&.-]+)", 'column': 'sd.InvoiceToName', 'type': '付款方'},
             {'pattern': r"(代理商|agent)\s+([a-zA-Z0-9&.-]+)", 'column': 'sd.AgentName', 'type': '代理商'},
             # 模式3: 单独匹配一个 ID - (保持不變)
             {'pattern': r"\b([A-Z]\d{4}[A-Z])\b", 'column': 'sd.ApplicantID', 'type': 'ID'}
         ]
         for p in entity_patterns:
             match = re.search(p['pattern'], question, re.IGNORECASE)
             if match:
                 entity_value = match.group(2) if len(match.groups()) > 1 else match.group(1)
                 entity_match_data = {
+                    "type": p['type'],
                     "name": entity_value.strip().upper(),
                     "column": p['column']
                 }
                 break
         # ==============================================================================
         #  第一層：模組化意圖偵測與動態SQL組合
         # ==============================================================================
         intents = {}
         sql_components = {
             'select': [], 'from': "", 'joins': [], 'where': [],
             'group_by': [], 'order_by': [], 'log_parts': []
         }
         # --- 運行一系列獨立的意圖偵測器 ---
         # 偵測器 2.1: 核心動作意圖
         if any(kw in q_lower for kw in ['幾份', '多少', '數量', '總數', 'how many', 'count']):
             intents['action'] = 'count'
             sql_components['select'].append("jt.JobNo, jt.ReportAuthorization")
             sql_components['order_by'].append("jt.ReportAuthorization DESC")
             sql_components['log_parts'].append("報告列表")
         # 偵測器 2.2: 時間意圖
         year_match = re.search(r'(\d{4})\s*年?', question)
         month_match = re.search(r'(\d{1,2})\s*月', question)
             month = month_match.group(1).zfill(2)
             sql_components['where'].append(f"strftime('%m', jt.ReportAuthorization) = '{month}'")
             sql_components['log_parts'].append(f"{month}月")
         # 偵測器 2.3: 實體意圖
         if entity_match_data:
             if "TSR53SampleDescription" not in " ".join(sql_components['joins']):
             sql_components['log_parts'].append(entity_match_data["type"] + ":" + entity_name)
             if intents.get('action') == 'list':
                 sql_components['select'].append("sd.BuyerName")
         # 偵測器 2.4: 評級意圖
         if 'fail' in q_lower or '失敗' in q_lower:
             if "TSR53SampleDescription" not in " ".join(sql_components['joins']):
                  sql_components['joins'].append("JOIN TSR53SampleDescription AS sd ON jt.JobNo = sd.JobNo")
             sql_components['where'].append("sd.OverallRating = 'Pass'")
             sql_components['log_parts'].append("Pass")
         # 偵測器 2.5: 實驗組 (LabGroup) 意圖 (帶有別名映射)
         lab_group_mapping = {'A': 'TA', 'B': 'TB', 'C': 'TC', 'D': 'TD', 'E': 'TE', 'Y': 'TY'}
         lab_group_match = re.search(r'([A-Z]{1,2})組', question, re.IGNORECASE)
             sql_components['joins'].append("JOIN JobItemsInProgress AS jip ON jt.JobNo = jip.JobNo")
             sql_components['where'].append(f"jip.LabGroup = '{db_lab_group}'")
             sql_components['log_parts'].append(f"{user_input_group}組(->{db_lab_group})")
         # --- 3. 判斷是否觸發了模板，並動態組合 SQL ---
         if 'action' in intents:
             sql_components['from'] = "FROM JobTimeline AS jt"
             # 只要有任何篩選條件，就加上報告已授權的基礎限制
             if sql_components['where']:
                  sql_components['where'].insert(0, "jt.ReportAuthorization IS NOT NULL")
             select_clause = "SELECT " + ", ".join(sorted(list(set(sql_components['select']))))
             from_clause = sql_components['from']
             joins_clause = " ".join(sql_components['joins'])
             where_clause = "WHERE " + " AND ".join(sql_components['where']) if sql_components['where'] else ""
             orderby_clause = "ORDER BY " + ", ".join(sql_components['order_by']) if sql_components['order_by'] else ""
             template_sql = f"{select_clause} {from_clause} {joins_clause} {where_clause} {orderby_clause};"
             query_log = " ".join(sql_components['log_parts'])
             self._log(f"🔄 偵測到組合意圖【{query_log}】，啟用動態模板。", "INFO")
             return self._finalize_sql(template_sql, f"模板覆寫: {query_log} 查詢")
         # ==============================================================================
         #  第二层：AI 生成修正流程 (Fallback)
         # ==============================================================================
         self._log("未觸發任何模板，嘗試解析並修正 AI 輸出...", "INFO")
         parsed_sql = parse_sql_from_response(raw_response)
         if not parsed_sql:
             self._log(f"❌ 未能從模型回應中解析出任何 SQL。原始回應: {raw_response}", "ERROR")
             return None, f"無法解析SQL。原始回應:\n{raw_response}"
         self._log(f"📊 解析出的原始 SQL: {parsed_sql}", "DEBUG")
         fixed_sql = " " + parsed_sql.strip() + " "
         fixes_applied_fallback = []
         dialect_corrections = {r'YEAR\s*\(([^)]+)\)': r"strftime('%Y', \1)"}
         for pattern, replacement in dialect_corrections.items():
             if re.search(pattern, fixed_sql, re.IGNORECASE):
                 fixed_sql = re.sub(pattern, replacement, fixed_sql, flags=re.IGNORECASE)
                 fixes_applied_fallback.append(f"修正方言: {pattern}")
         schema_corrections = {'TSR53Report':'TSR53SampleDescription', 'TSR53InvoiceReportNo':'JobNo', 'TSR53ReportNo':'JobNo', 'TSR53InvoiceNo':'JobNo', 'TSR53InvoiceCreditNoteNo':'InvoiceCreditNoteNo', 'TSR53InvoiceLocalAmount':'LocalAmount', 'Status':'OverallRating', 'ReportStatus':'OverallRating'}
         for wrong, correct in schema_corrections.items():
             pattern = r'\b' + re.escape(wrong) + r'\b'
             if re.search(pattern, fixed_sql, re.IGNORECASE):
                 fixed_sql = re.sub(pattern, correct, fixed_sql, flags=re.IGNORECASE)
                 fixes_applied_fallback.append(f"映射 Schema: '{wrong}' -> '{correct}'")
         log_msg = "AI 生成並成功修正" if fixes_applied_fallback else "AI 生成且無需修正"
         return self._finalize_sql(fixed_sql, log_msg)
             # 2. 建立提示詞
             self._log("📝 建立 Prompt...")
             prompt = self._build_prompt(question, examples)
             # --- 新增：如果是第二次嘗試，加入修正指令 ---
             if attempt > 0:
                 correction_prompt = "\nYour previous attempt failed because you did not provide a valid SQL query. REMEMBER: ONLY output the SQL code inside a ```sql block. DO NOT write comments or explanations.\nSQL:\n```sql\n"