Spaces:

Paul720810
/

Softline-SQL-Assistant

Sleeping

App Files Files Community

Paul720810 committed on Sep 3, 2025

Commit

7431cad

verified ·

1 Parent(s): 5fc665e

Update app.py

Browse files

Files changed (1) hide show

app.py +137 -217

app.py CHANGED Viewed

@@ -14,14 +14,7 @@ import numpy as np
 # ==================== 配置區 ====================
 HF_TOKEN = os.environ.get("HF_TOKEN", None) # 建議從環境變數讀取
 DATASET_REPO_ID = "Paul720810/Text-to-SQL-Softline"
-SIMILARITY_THRESHOLD = 0.6
-# 多個備用LLM模型 (注意：在當前邏輯中並未使用)
-LLM_MODELS = [
-    "https://api-inference.huggingface.co/models/gpt2",
-    "https://api-inference.huggingface.co/models/distilgpt2",
-    "https://api-inference.huggingface.co/models/microsoft/DialoGPT-small"
-]
 print("=" * 60)
 print("🤖 智能 Text-to-SQL 系統啟動中...")
@@ -45,67 +38,48 @@ def validate_sql(sql_query: str) -> Dict:
     security_issues = []
     sql_upper = sql_clean.upper()
-    # 檢查危險操作
     dangerous_keywords = ['DROP', 'DELETE', 'INSERT', 'UPDATE', 'ALTER', 'TRUNCATE', 'EXEC', 'EXECUTE']
     for keyword in dangerous_keywords:
         if f" {keyword} " in f" {sql_upper} ":
             security_issues.append(f"危險操作: {keyword}")
-    # 檢查基本語法
     if "SELECT" not in sql_upper:
         security_issues.append("缺少SELECT")
     if "FROM" not in sql_upper:
         security_issues.append("缺少FROM")
     is_valid = not security_issues
     is_safe = all('危險' not in issue for issue in security_issues)
-    return {
-        "valid": is_valid,
-        "issues": security_issues,
-        "is_safe": is_safe,
-        "empty": False
-    }
 def analyze_question_type(question: str) -> Dict:
-    """分析問題類型和關鍵詞"""
     question_lower = question.lower()
     analysis = {
         "type": "unknown",
         "keywords": [],
-        "has_count": False,
-        "has_date": False,
-        "has_group": False,
-        "has_comparison": False
     }
-    # 檢測關鍵詞
-    keywords_sets = {
-        "sales": ["銷售", "業績", "金額", "收入", "sale", "revenue"],
-        "customer": ["客戶", "買家", "用戶", "customer", "client"],
-        "product": ["產品", "商品", "項目", "product", "item"],
-        "time": ["時間", "日期", "月份", "年", "月", "最近", "date", "month", "year"],
-        "report": ["報告", "完成", "份", "report", "complete"],
-        "count": ["多少", "幾個", "數量", "count", "how many"],
-        "comparison": ["比較", "vs", " versus", "對比", "相比"]
-    }
-    for category, keywords in keywords_sets.items():
-        if any(keyword in question_lower for keyword in keywords):
-            analysis["keywords"].append(category)
-    # 特殊檢測
-    analysis["has_count"] = "count" in analysis["keywords"]
-    analysis["has_date"] = "time" in analysis["keywords"]
-    analysis["has_group"] = any(word in question_lower for word in ["每", "各", "group", "每個"])
-    analysis["has_comparison"] = "comparison" in analysis["keywords"]
-    # 確定主要類型
-    if analysis["keywords"]:
-        analysis["type"] = analysis["keywords"][0]
     return analysis
 # ==================== 完整數據加載模塊 ====================
@@ -114,77 +88,47 @@ class CompleteDataLoader:
         self.hf_token = hf_token
         self.questions = []
         self.sql_answers = []
-        self.sql_quality = []  # 記錄每個SQL的質量評分
         self.schema_data = {}
     def load_complete_dataset(self) -> bool:
-        """加載完整數據集（包括空白SQL）"""
         try:
             print(f"[{get_current_time()}] 正在加載完整數據集 '{DATASET_REPO_ID}'...")
             raw_dataset = load_dataset(DATASET_REPO_ID, token=self.hf_token)['train']
-            print("解析全部 messages 格式...")
-            total_count, empty_count, valid_count = 0, 0, 0
             for item in raw_dataset:
                 try:
                     if 'messages' in item and len(item['messages']) >= 2:
                         user_content = item['messages'][0]['content']
                         assistant_content = item['messages'][1]['content']
-                        # 提取問題
                         question_match = re.search(r'指令:\s*(.*?)(?:\n|$)', user_content)
                         question = question_match.group(1).strip() if question_match else user_content
-                        # 提取SQL
                         sql_match = re.search(r'SQL查詢:\s*(.*?)(?:\n|$)', assistant_content, re.DOTALL)
-                        if sql_match:
-                            sql_query = sql_match.group(1).strip()
-                            sql_query = re.sub(r'^sql\s*', '', sql_query, flags=re.IGNORECASE)
-                            sql_query = re.sub(r'```sql|```', '', sql_query).strip()
-                        else:
-                            sql_query = assistant_content
-                        # 保存所有數據
                         self.questions.append(question)
                         self.sql_answers.append(sql_query)
-                        # 評估SQL質量
-                        validation = validate_sql(sql_query)
-                        quality_score = 1.0 if validation["valid"] else 0.3
-                        self.sql_quality.append(quality_score)
-                        total_count += 1
-                        if validation["empty"]:
-                            empty_count += 1
-                        if validation["valid"]:
-                            valid_count += 1
                 except Exception:
                     continue
-            print(f"數據加載完成: 總數 {total_count}, 有效 {valid_count}, 空白 {empty_count}")
             return True
         except Exception as e:
             print(f"數據集加載失敗: {e}")
             return False
     def load_schema(self) -> bool:
-        """加載數據庫Schema"""
         try:
-            schema_file_path = hf_hub_download(
-                repo_id=DATASET_REPO_ID,
-                filename="sqlite_schema_FULL.json",
-                repo_type='dataset',
-                token=self.hf_token
-            )
             with open(schema_file_path, 'r', encoding='utf-8') as f:
                 self.schema_data = json.load(f)
             print("Schema加載成功")
             return True
         except Exception as e:
             print(f"Schema加載失敗: {e}")
-            self.schema_data = {}
             return False
 # ==================== 檢索系統 ====================
@@ -197,19 +141,18 @@ class RetrievalSystem:
             print(f"SentenceTransformer 模型加載失敗: {e}")
             self.embedder = None
-    def compute_embeddings(self, questions: List[str]) -> None:
         if self.embedder and questions:
             print(f"正在為 {len(questions)} 個問題計算向量...")
             self.question_embeddings = self.embedder.encode(questions, convert_to_tensor=True, show_progress_bar=True)
             print("向量計算完成")
-    def retrieve_similar(self, user_question: str, top_k: int = 5) -> List[Dict]:
-        if self.embedder is None or self.question_embeddings is None or len(self.question_embeddings) == 0:
-            return []
         try:
             question_embedding = self.embedder.encode(user_question, convert_to_tensor=True)
             hits = util.semantic_search(question_embedding, self.question_embeddings, top_k=top_k)
-            return hits[0] if hits and hits[0] else []
         except Exception as e:
             print(f"檢索錯誤: {e}")
             return []
@@ -223,138 +166,131 @@ class CompleteTextToSQLSystem:
         self.initialize_system()
     def initialize_system(self):
-        """初始化系統組件"""
         print("正在初始化完整數據系統...")
         self.data_loader.load_complete_dataset()
         self.data_loader.load_schema()
-        # 為所有問題計算向量
         if self.data_loader.questions:
             self.retrieval_system.compute_embeddings(self.data_loader.questions)
         print(f"系統初始化完成，載入問題總數: {len(self.data_loader.questions)}")
-    # ===== 輔助函數 (作為類別方法) =====
-    def get_available_tables(self) -> Dict:
-        """從schema中獲取所有可用的表和欄位"""
-        if not self.data_loader.schema_data:
-            return {}
-        tables = {}
-        for table_name, columns_list in self.data_loader.schema_data.items():
-            if isinstance(columns_list, list):
-                column_names = [col["name"] for col in columns_list if "name" in col]
-                tables[table_name] = column_names
-        return tables
-    def extract_number(self, text: str, default: int = 10) -> int:
-        """從文字中提取數字"""
-        numbers = re.findall(r'\d+', text)
-        return int(numbers[0]) if numbers else default
     def generate_sql_from_question(self, question: str, analysis: Dict) -> str:
-        """根據問題分析和真實Schema生成智能SQL"""
-        question_lower = question.lower()
-        available_tables = self.get_available_tables().keys()
-        # 1. 每月/每日完成數量 - 使用 JobTimeline 相關表
-        if any(kw in question_lower for kw in ["每月", "每日", "昨天", "完成"]) and analysis["has_count"]:
-            group_match = re.search(r'([a-z]組)', question_lower)
-            if group_match:
-                group = group_match.group(1).replace('組', '').upper()
-                group_mapping = {'A': 'TA', 'B': 'TB', 'C': 'TC', 'D': 'TD'}
-                table_suffix = group_mapping.get(group, 'TA')
-                table_name = f"JobTimeline_{table_suffix}"
-                if "昨天" in question_lower:
-                    return f"SELECT COUNT(*) as 完成數量 FROM {table_name} WHERE DATE(end_time) = DATE('now','-1 day');"
-                elif "每月" in question_lower:
-                    year_match = re.search(r'(\d{4})年?', question_lower)
-                    year = year_match.group(1) if year_match else datetime.now().strftime('%Y')
-                    return f"""SELECT strftime('%Y-%m', end_time) as 月份, COUNT(*) as 完成數量 FROM {table_name} WHERE strftime('%Y', end_time) = '{year}' AND end_time IS NOT NULL GROUP BY strftime('%Y-%m', end_time) ORDER BY 月份;"""
-            return "SELECT strftime('%Y-%m', jt.end_time) as 月份, COUNT(*) as 完成數量 FROM JobTimeline jt WHERE jt.end_time IS NOT NULL GROUP BY strftime('%Y-%m', jt.end_time) ORDER BY 月份;"
-        # 2. 評級分析 - 使用 TSR53SampleDescription.OverallRating
-        elif any(kw in question_lower for kw in ["評級", "rating", "等級"]) and "TSR53SampleDescription" in available_tables:
-            if any(kw in question_lower for kw in ["分佈", "統計", "多少"]):
-                return "SELECT OverallRating as 評級, COUNT(*) as 數量, ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM TSR53SampleDescription), 2) as 百分比 FROM TSR53SampleDescription WHERE OverallRating IS NOT NULL GROUP BY OverallRating ORDER BY 數量 DESC;"
-            elif "fail" in question_lower or "失敗" in question_lower:
-                return "SELECT JobNo as 工作單號, ApplicantName as 申請方, OverallRating as 評級 FROM TSR53SampleDescription WHERE OverallRating = 'Fail' ORDER BY JobNo;"
-        # 3. 金額相關查詢 - 使用 TSR53Invoice
-        elif any(kw in question_lower for kw in ["金額", "總額", "收入", "invoice"]) and any(kw in question_lower for kw in ["最高", "最大", "top"]):
-            limit_num = self.extract_number(question_lower, default=10)
-            return f"""WITH JobTotalAmount AS (SELECT JobNo, SUM(LocalAmount) AS TotalAmount FROM (SELECT DISTINCT JobNo, InvoiceCreditNoteNo, LocalAmount FROM TSR53Invoice WHERE LocalAmount IS NOT NULL) GROUP BY JobNo) SELECT jta.JobNo as 工作單號, sd.ApplicantName as 申請方, jta.TotalAmount as 總金額 FROM JobTotalAmount jta JOIN TSR53SampleDescription sd ON sd.JobNo = jta.JobNo ORDER BY jta.TotalAmount DESC LIMIT {limit_num};"""
-        # 4. 公司/客戶相關查詢
-        elif any(kw in question_lower for kw in ["公司", "客戶", "申請方", "付款方"]):
-            if any(kw in question_lower for kw in ["最多", "top", "排名"]):
-                return "SELECT ApplicantName as 申請方名稱, COUNT(*) as 工作單數量 FROM TSR53SampleDescription WHERE ApplicantName IS NOT NULL GROUP BY ApplicantName ORDER BY 工作單數量 DESC LIMIT 10;"
-            return "SELECT ApplicantName as 申請方, InvoiceToName as 付款方, COUNT(*) as 工作單數量 FROM TSR53SampleDescription WHERE ApplicantName IS NOT NULL GROUP BY ApplicantName, InvoiceToName ORDER BY 工作單數量 DESC;"
-        # ... 其他規則可以繼續添加 ...
-        # 預設查詢 - 顯示基本工作單資訊
-        return "SELECT JobNo as 工作單號, ApplicantName as 申請方, InvoiceToName as 付款方, OverallRating as 評級 FROM TSR53SampleDescription LIMIT 20;"
-    def repair_empty_sql(self, original_sql: str, user_question: str, similar_question: str) -> str:
-        """修復空白或無效的SQL"""
-        validation = validate_sql(original_sql)
-        if not validation["valid"]:
-            analysis = analyze_question_type(user_question)
-            repaired_sql = self.generate_sql_from_question(user_question, analysis)
-            return f"-- 根據類似問題 '{similar_question}' (原SQL無效) 自動生成的查詢\n{repaired_sql}"
-        return original_sql
     def generate_sql(self, user_question: str) -> Tuple[str, str]:
-        """主流程：生成SQL查詢"""
         log_messages = [f"⏰ {get_current_time()} 開始處理"]
         if not user_question or not user_question.strip():
             return "請輸入您的問題。", "錯誤: 問題為空"
         # 1. 檢索最相似的問題
-        if self.data_loader.questions:
-            hits = self.retrieval_system.retrieve_similar(user_question)
-            if hits:
-                best_hit = hits[0]
-                similarity_score = best_hit['score']
                 corpus_id = best_hit['corpus_id']
                 similar_question = self.data_loader.questions[corpus_id]
                 original_sql = self.data_loader.sql_answers[corpus_id]
-                log_messages.append(f"🔍 檢索到最相似問題: '{similar_question}'")
-                log_messages.append(f"📊 相似度: {similarity_score:.3f}")
-                if similarity_score > SIMILARITY_THRESHOLD:
-                    repaired_sql = self.repair_empty_sql(original_sql, user_question, similar_question)
-                    log_messages.append(f"✅ 相似度高於閾值 {SIMILARITY_THRESHOLD}，採用檢索結果。")
-                    return repaired_sql, "\n".join(log_messages)
                 else:
-                    log_messages.append(f"ℹ️ 相似度低於閾值 {SIMILARITY_THRESHOLD}，轉為智能生成。")
-        # 2. 如果檢索失敗或相似度不足，智能生成SQL
-        log_messages.append("🤖 找不到高相似度結果，啟用智能生成規則...")
         analysis = analyze_question_type(user_question)
-        intelligent_sql = self.generate_sql_from_question(user_question, analysis)
-        log_messages.append(f"📋 問題分析: {analysis['type']} 類型, 關鍵詞: {analysis['keywords']}")
         log_messages.append("✅ 智能生成完成。")
         return intelligent_sql, "\n".join(log_messages)
 # ==================== 初始化系統 ====================
-print("準備初始化 Text-to-SQL 系統...")
-# 檢查 HF_TOKEN 是否存在
 if HF_TOKEN is None:
-    print("\n" + "="*60)
-    print("⚠️ 警告: Hugging Face Token 未設置。")
-    print("請在環境變數中設定 HF_TOKEN 才能從私人數據集下載資料。")
-    print("="*60 + "\n")
-    # 這裡可以選擇退出或繼續，但下載會失敗
     text_to_sql_system = None
 else:
     text_to_sql_system = CompleteTextToSQLSystem(HF_TOKEN)
@@ -366,51 +302,35 @@ def process_query(user_question: str) -> Tuple[str, str, str]:
         return "系統未初始化", error_msg, error_msg
     sql_result, log_message = text_to_sql_system.generate_sql(user_question)
-    return sql_result, "✅ SQL生成完成", log_message
 with gr.Blocks(title="智慧Text-to-SQL系統", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🚀 智慧 Text-to-SQL 系統")
-    gr.Markdown("📊 **模式**: 讀取雲端數據集並結合「檢索」與「規則生成」兩種模式。")
     with gr.Row():
-        question_input = gr.Textbox(
-            label="📝 請在此輸入您的問題",
-            placeholder="例如：2023年每月完成多少份報告？ 或 哪個客戶的訂單總金額最高？",
-            lines=3,
-            scale=4
-        )
         submit_btn = gr.Button("🚀 生成SQL", variant="primary", scale=1)
     with gr.Accordion("🔍 結果與日誌", open=True):
-        sql_output = gr.Code(
-            label="📊 生成的SQL查詢",
-            language="sql",
-            lines=8
-        )
         status_output = gr.Textbox(label="🔍 執行狀態", interactive=False)
-        log_output = gr.Textbox(label="📋 詳細日誌", lines=5, interactive=False)
-    # 預設範例
     gr.Examples(
         examples=[
-            "昨天完成了多少個工作單?",
-            "A組每月完成數量是多少？",
-            "哪個申請方的失敗評級最多？",
-            "找出總金額最高的10筆訂單",
-            "統計所有評級的分佈"
         ],
         inputs=question_input
     )
-    submit_btn.click(
-        process_query,
-        inputs=question_input,
-        outputs=[sql_output, status_output, log_output]
-    )
 if __name__ == "__main__":
-    print("Gradio 介面啟動中...")
-    if text_to_sql_system is None:
-        print("無法啟動 Gradio，因為系統初始化失敗。")
-    else:
         demo.launch(server_name="0.0.0.0", server_port=7860, share=False)

 # ==================== Configuration ====================
 HF_TOKEN = os.environ.get("HF_TOKEN", None) # Read from the environment (recommended) instead of hard-coding it
 DATASET_REPO_ID = "Paul720810/Text-to-SQL-Softline"
+SIMILARITY_THRESHOLD = 0.65 # Moderately raised threshold so a retrieved question matches the user's intent more closely
 print("=" * 60)
 print("🤖 智能 Text-to-SQL 系統啟動中...")
     security_issues = []
     sql_upper = sql_clean.upper()
     dangerous_keywords = ['DROP', 'DELETE', 'INSERT', 'UPDATE', 'ALTER', 'TRUNCATE', 'EXEC', 'EXECUTE']
     for keyword in dangerous_keywords:
         if f" {keyword} " in f" {sql_upper} ":
             security_issues.append(f"危險操作: {keyword}")
     if "SELECT" not in sql_upper:
         security_issues.append("缺少SELECT")
     if "FROM" not in sql_upper:
         security_issues.append("缺少FROM")
     is_valid = not security_issues
     is_safe = all('危險' not in issue for issue in security_issues)
+    return {"valid": is_valid, "issues": security_issues, "is_safe": is_safe, "empty": False}
 def analyze_question_type(question: str) -> Dict:
     """Classify a question by plain substring matching on its lower-cased text.

     Args:
         question: The user's natural-language question.

     Returns:
         A dict with the keys ``type``, ``keywords`` (always an empty list),
         ``has_count``, ``has_date``, ``has_group`` and ``specific_intent``.
         ``specific_intent`` is ``"general_query"`` and ``type`` is
         ``"unknown"`` when no intent rule matches.
     """
     text = question.lower()

     def mentions(*needles: str) -> bool:
         # True when at least one needle occurs somewhere in the question.
         return any(needle in text for needle in needles)

     # Rules are tried top to bottom and the first match wins.
     if mentions("每月") and mentions("完成", "報告", "工作單"):
         intent, kind = "monthly_completion_count", "time_series"
     elif mentions("評級", "pass", "fail") and mentions("統計", "分佈", "多少"):
         intent, kind = "rating_distribution", "statistics"
     elif mentions("金額") and mentions("最高", "top", "排名"):
         intent, kind = "amount_ranking", "ranking"
     elif mentions("公司", "客戶", "申請方") and mentions("統計", "數量", "排名"):
         intent, kind = "company_statistics", "statistics"
     else:
         intent, kind = "general_query", "unknown"

     return {
         "type": kind,
         "keywords": [],
         "has_count": mentions("多少", "幾個", "數量"),
         "has_date": mentions("時間", "日期", "月份", "年"),
         "has_group": mentions("每", "各", "分組"),
         "specific_intent": intent,
     }
 # ==================== 完整數據加載模塊 ====================
         self.hf_token = hf_token
         self.questions = []
         self.sql_answers = []
+        self.sql_quality = []
         self.schema_data = {}
     def load_complete_dataset(self) -> bool:
         """Load all question/SQL pairs from the dataset's ``train`` split.

         Each item is expected to carry a ``messages`` list whose first entry
         holds the user prompt and whose second entry holds the assistant
         answer. The question is the text after ``指令:`` on the same line (or
         the whole prompt when that marker is absent). The SQL is everything
         after ``SQL查詢:`` (or the whole answer when the marker is absent); if
         that text contains a fenced code block, only the fence contents are
         kept. Results are appended in lock-step to ``self.questions`` and
         ``self.sql_answers``.

         Returns:
             True when the dataset was loaded and iterated, False when loading
             or iteration raised.
         """
         # Contents of the first ``` fence, with an optional "sql" language tag.
         fence_re = re.compile(r'```(?:sql\b)?\s*(.*?)```', re.DOTALL | re.IGNORECASE)
         skipped = 0
         try:
             print(f"[{get_current_time()}] 正在加載完整數據集 '{DATASET_REPO_ID}'...")
             raw_dataset = load_dataset(DATASET_REPO_ID, token=self.hf_token)['train']
             for item in raw_dataset:
                 try:
                     if 'messages' in item and len(item['messages']) >= 2:
                         user_content = item['messages'][0]['content']
                         assistant_content = item['messages'][1]['content']
                         question_match = re.search(r'指令:\s*(.*?)(?:\n|$)', user_content)
                         question = question_match.group(1).strip() if question_match else user_content
                         # Greedy capture with DOTALL keeps multi-line statements whole.
                         # The previous lazy "up to first newline" capture cut them to one
                         # line, and a leading code fence was reduced to an empty string.
                         sql_match = re.search(r'SQL查詢:\s*(.*)', assistant_content, re.DOTALL)
                         sql_text = sql_match.group(1) if sql_match else assistant_content
                         fenced = fence_re.search(sql_text)
                         if fenced:
                             sql_query = fenced.group(1).strip()
                         else:
                             sql_query = re.sub(r'```sql|```', '', sql_text).strip()
                         self.questions.append(question)
                         self.sql_answers.append(sql_query)
                 except Exception:
                     # Best effort: one malformed item must not abort the load,
                     # but count it so the loss is visible in the log.
                     skipped += 1
                     continue
             print(f"數據加載完成: 總數 {len(self.questions)}")
             if skipped:
                 print(f"略過格式異常的樣本: {skipped}")
             return True
         except Exception as e:
             print(f"數據集加載失敗: {e}")
             return False
     def load_schema(self) -> bool:
         """Download ``sqlite_schema_FULL.json`` from the dataset repo and parse it.

         On success the parsed JSON is stored in ``self.schema_data``.

         Returns:
             True on success, False when the download, file read or JSON
             parsing raised (the error is printed).
         """
         try:
             local_path = hf_hub_download(
                 repo_id=DATASET_REPO_ID,
                 filename="sqlite_schema_FULL.json",
                 repo_type='dataset',
                 token=self.hf_token,
             )
             with open(local_path, 'r', encoding='utf-8') as handle:
                 self.schema_data = json.load(handle)
             print("Schema加載成功")
         except Exception as e:
             print(f"Schema加載失敗: {e}")
             return False
         return True
 # ==================== 檢索系統 ====================
             print(f"SentenceTransformer 模型加載失敗: {e}")
             self.embedder = None
+    def compute_embeddings(self, questions: List[str]):
         if self.embedder and questions:
             print(f"正在為 {len(questions)} 個問題計算向量...")
             self.question_embeddings = self.embedder.encode(questions, convert_to_tensor=True, show_progress_bar=True)
             print("向量計算完成")
+    def retrieve_similar(self, user_question: str, top_k: int = 1) -> List[Dict]:
+        if self.embedder is None or self.question_embeddings is None: return []
         try:
             question_embedding = self.embedder.encode(user_question, convert_to_tensor=True)
             hits = util.semantic_search(question_embedding, self.question_embeddings, top_k=top_k)
+            return hits[0] if hits else []
         except Exception as e:
             print(f"檢索錯誤: {e}")
             return []
         self.initialize_system()
     def initialize_system(self):
         """Load the dataset and schema, then embed the loaded questions.

         Embeddings are only computed when at least one question was loaded.
         """
         print("正在初始化完整數據系統...")
         loader = self.data_loader
         loader.load_complete_dataset()
         loader.load_schema()
         loaded_questions = loader.questions
         if loaded_questions:
             self.retrieval_system.compute_embeddings(loaded_questions)
         total = len(loader.questions)
         print(f"系統初始化完成，載入問題總數: {total}")
+    def extract_year(self, text: str) -> str:
+        """從文字中提取年份，若無則返回當年"""
+        year_match = re.search(r'(\d{4})', text)
+        return year_match.group(1) if year_match else datetime.now().strftime('%Y')
     def generate_sql_from_question(self, question: str, analysis: Dict) -> str:
         """Return the generic fallback query used when no intent is recognised.

         Neither ``question`` nor ``analysis`` affects the result; both are
         accepted only to keep the generator signature.

         Returns:
             A fixed query listing 20 rows of ``TSR53SampleDescription``.
         """
         template_lines = (
             "-- 通用查詢範本",
             "SELECT",
             "    JobNo as 工作單號,",
             "    ApplicantName as 申請方,",
             "    OverallRating as 評級",
             "FROM TSR53SampleDescription",
             "LIMIT 20;",
         )
         return "\n".join(template_lines)
+    def intelligent_repair_sql(self, user_question: str, similar_question: str) -> str:
+        """智能修復SQL - 基於當前使用者問題的意圖"""
+        analysis = analyze_question_type(user_question)
+        intent = analysis["specific_intent"]
+        comment = f"-- 根據類似問題 '{similar_question}' (原SQL無效) 進行智能修復\n"
+        if intent == "monthly_completion_count":
+            year = self.extract_year(user_question)
+            return comment + f"""-- 查詢 {year} 年每月完成的工作單數量
+SELECT
+    strftime('%Y-%m', jt.end_time) as 月份,
+    COUNT(*) as 完成數量
+FROM JobTimeline jt
+WHERE strftime('%Y', jt.end_time) = '{year}' AND jt.end_time IS NOT NULL
+GROUP BY strftime('%Y-%m', jt.end_time)
+ORDER BY 月份;"""
+        elif intent == "rating_distribution":
+            return comment + """-- 查詢評級分佈統計
+SELECT
+    OverallRating as 評級,
+    COUNT(*) as 數量,
+    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM TSR53SampleDescription WHERE OverallRating IS NOT NULL), 2) as 百分比
+FROM TSR53SampleDescription
+WHERE OverallRating IS NOT NULL
+GROUP BY OverallRating
+ORDER BY 數量 DESC;"""
+        elif intent == "amount_ranking":
+            return comment + """-- 查詢工作單金額排名
+WITH JobTotalAmount AS (
+    SELECT JobNo, SUM(LocalAmount) AS TotalAmount
+    FROM (SELECT DISTINCT JobNo, InvoiceCreditNoteNo, LocalAmount FROM TSR53Invoice WHERE LocalAmount IS NOT NULL)
+    GROUP BY JobNo
+)
+SELECT
+    jta.JobNo as 工作單號,
+    sd.ApplicantName as 申請方,
+    jta.TotalAmount as 總金額
+FROM JobTotalAmount jta
+JOIN TSR53SampleDescription sd ON sd.JobNo = jta.JobNo
+ORDER BY jta.TotalAmount DESC
+LIMIT 10;"""
+        elif intent == "company_statistics":
+            return comment + """-- 查詢申請方工作單統計
+SELECT
+    ApplicantName as 申請方名稱,
+    COUNT(*) as 工作單數量
+FROM TSR53SampleDescription
+WHERE ApplicantName IS NOT NULL
+GROUP BY ApplicantName
+ORDER BY 工作單數量 DESC
+LIMIT 20;"""
+        # 如果無法判斷具體意圖，使用原始的通用生成邏輯
+        return comment + self.generate_sql_from_question(user_question, analysis)
     def generate_sql(self, user_question: str) -> Tuple[str, str]:
         """Produce SQL for a question together with a processing log.

         Retrieval comes first. When the closest stored question scores above
         ``SIMILARITY_THRESHOLD``, its stored SQL is returned if ``validate_sql``
         reports it valid and safe; otherwise an intent-based replacement is
         generated. Without such a match the SQL is generated directly from the
         question's intent.

         Args:
             user_question: The question typed by the user.

         Returns:
             ``(sql_or_message, log_text)``. For an empty or blank question a
             prompt message and an error note are returned instead.
         """
         log = [f"⏰ {get_current_time()} 開始處理"]

         def finish(sql: str) -> Tuple[str, str]:
             # Pair the outgoing SQL with everything logged so far.
             return sql, "\n".join(log)

         if not (user_question and user_question.strip()):
             return "請輸入您的問題。", "錯誤: 問題為空"

         # 1. Look up the most similar stored question.
         candidates = self.retrieval_system.retrieve_similar(user_question)
         if candidates:
             top = candidates[0]
             score = top['score']
             log.append(f"🔍 檢索到最相似問題 (相似度: {score:.3f})")
             if score > SIMILARITY_THRESHOLD:
                 idx = top['corpus_id']
                 matched_question = self.data_loader.questions[idx]
                 stored_sql = self.data_loader.sql_answers[idx]
                 check = validate_sql(stored_sql)
                 if not (check["valid"] and check["is_safe"]):
                     log.append(f"⚠️ 相似度高，但原SQL無效 ({', '.join(check['issues'])})。")
                     log.append("🛠️ 啟用智能修復...")
                     return finish(self.intelligent_repair_sql(user_question, matched_question))
                 log.append("✅ 相似度高，且原SQL有效，直接採用。")
                 return finish(stored_sql)

         # 2. No usable retrieval result: generate from the question's intent.
         log.append("🤖 未找到高相似度或有效的範本，根據問題直接生成。")
         intent_info = analyze_question_type(user_question)
         # The repair routine is itself an intent-driven generator, so reuse it.
         generated = self.intelligent_repair_sql(user_question, "無相似問題")
         log.append(f"📋 問題意圖分析: {intent_info['specific_intent']}")
         log.append("✅ 智能生成完成。")
         return finish(generated)
 # ==================== System initialisation ====================
 # Without a Hugging Face token the system is left uninitialised (None).
 if HF_TOKEN is not None:
     text_to_sql_system = CompleteTextToSQLSystem(HF_TOKEN)
 else:
     banner = "=" * 60
     print(f"\n{banner}\n⚠️ 警告: Hugging Face Token 未設置。\n{banner}\n")
     text_to_sql_system = None
         return "系統未初始化", error_msg, error_msg
     sql_result, log_message = text_to_sql_system.generate_sql(user_question)
+    return sql_result, "✅ 處理完成", log_message
 # Gradio UI: a question box with a submit button, and an accordion holding
 # the generated SQL, a status line and the detailed log.
 with gr.Blocks(title="智慧Text-to-SQL系統", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🚀 智慧 Text-to-SQL 系統 (進階修復版)")
     gr.Markdown("📊 **模式**: 結合「檢索驗證」與「意圖導向生成」，即使資料庫範本有誤也能提供準確查詢。")
     with gr.Row():
         question_input = gr.Textbox(label="📝 請在此輸入您的問題", placeholder="例如：2024年每月完成多少份報告？", lines=3, scale=4)
         submit_btn = gr.Button("🚀 生成SQL", variant="primary", scale=1)
     with gr.Accordion("🔍 結果與日誌", open=True):
         sql_output = gr.Code(label="📊 生成的SQL查詢", language="sql", lines=10)
         status_output = gr.Textbox(label="🔍 執行狀態", interactive=False)
         log_output = gr.Textbox(label="📋 詳細日誌", lines=6, interactive=False)
     gr.Examples(
         examples=[
             "2023 年每月完成多少份報告？",
             "統計一下各種評級的分佈",
             "找出總金額最高的5筆訂單來自哪個申請方",
             "哪個客戶的工作單數量最多？"
         ],
         inputs=question_input
     )
     # Wire the button to the handler; without this the button does nothing.
     # process_query returns (sql, status, log), one value per output component.
     submit_btn.click(
         process_query,
         inputs=question_input,
         outputs=[sql_output, status_output, log_output]
     )
 if __name__ == "__main__":
     # Only serve the UI when the backing system was created.
     if not text_to_sql_system:
         print("無法啟動 Gradio，因為系統初始化失敗。")
     else:
         print("Gradio 介面啟動中...")
         demo.launch(server_name="0.0.0.0", server_port=7860, share=False)