Spaces:

Paul720810
/

Softline-SQL-Assistant

Sleeping

App Files Files Community

Paul720810 committed on Sep 3, 2025

Commit

352a657

verified ·

1 Parent(s): 7431cad

Update app.py

Browse files

Files changed (1) hide show

app.py +150 -98

app.py CHANGED Viewed

@@ -30,33 +30,33 @@ def validate_sql(sql_query: str) -> Dict:
     """驗證SQL語句的語法和安全性"""
     if not sql_query or not sql_query.strip():
         return {"valid": False, "issues": ["SQL語句為空"], "is_safe": False, "empty": True}
     sql_clean = sql_query.strip()
     if len(sql_clean) < 5:
         return {"valid": False, "issues": ["SQL過短"], "is_safe": False, "empty": True}
     security_issues = []
     sql_upper = sql_clean.upper()
     dangerous_keywords = ['DROP', 'DELETE', 'INSERT', 'UPDATE', 'ALTER', 'TRUNCATE', 'EXEC', 'EXECUTE']
     for keyword in dangerous_keywords:
         if f" {keyword} " in f" {sql_upper} ":
             security_issues.append(f"危險操作: {keyword}")
     if "SELECT" not in sql_upper:
         security_issues.append("缺少SELECT")
     if "FROM" not in sql_upper:
         security_issues.append("缺少FROM")
     is_valid = not security_issues
     is_safe = all('危險' not in issue for issue in security_issues)
     return {"valid": is_valid, "issues": security_issues, "is_safe": is_safe, "empty": False}
 def analyze_question_type(question: str) -> Dict:
     """增強的問題分析 - 更精確的意圖識別"""
     question_lower = question.lower()
     analysis = {
         "type": "unknown",
         "keywords": [],
@@ -65,7 +65,7 @@ def analyze_question_type(question: str) -> Dict:
         "has_group": "每" in question_lower or "各" in question_lower or "分組" in question_lower,
         "specific_intent": "general_query"  # 新增：具體意圖，預設為通用查詢
     }
     # **更精確的意圖識別**
     if "每月" in question_lower and ("完成" in question_lower or "報告" in question_lower or "工作單" in question_lower):
         analysis["specific_intent"] = "monthly_completion_count"
@@ -79,7 +79,7 @@ def analyze_question_type(question: str) -> Dict:
     elif ("公司" in question_lower or "客戶" in question_lower or "申請方" in question_lower) and ("統計" in question_lower or "數量" in question_lower or "排名" in question_lower):
         analysis["specific_intent"] = "company_statistics"
         analysis["type"] = "statistics"
     return analysis
 # ==================== 完整數據加載模塊 ====================
@@ -90,36 +90,42 @@ class CompleteDataLoader:
         self.sql_answers = []
         self.sql_quality = []
         self.schema_data = {}
     def load_complete_dataset(self) -> bool:
         try:
             print(f"[{get_current_time()}] 正在加載完整數據集 '{DATASET_REPO_ID}'...")
             raw_dataset = load_dataset(DATASET_REPO_ID, token=self.hf_token)['train']
-            for item in raw_dataset:
                 try:
                     if 'messages' in item and len(item['messages']) >= 2:
                         user_content = item['messages'][0]['content']
                         assistant_content = item['messages'][1]['content']
                         question_match = re.search(r'指令:\s*(.*?)(?:\n|$)', user_content)
                         question = question_match.group(1).strip() if question_match else user_content
                         sql_match = re.search(r'SQL查詢:\s*(.*?)(?:\n|$)', assistant_content, re.DOTALL)
                         sql_query = sql_match.group(1).strip() if sql_match else assistant_content
                         sql_query = re.sub(r'```sql|```', '', sql_query).strip()
-                        self.questions.append(question)
-                        self.sql_answers.append(sql_query)
-                except Exception:
                     continue
-            print(f"數據加載完成: 總數 {len(self.questions)}")
-            return True
         except Exception as e:
             print(f"數據集加載失敗: {e}")
             return False
     def load_schema(self) -> bool:
         try:
             schema_file_path = hf_hub_download(repo_id=DATASET_REPO_ID, filename="sqlite_schema_FULL.json", repo_type='dataset', token=self.hf_token)
@@ -146,7 +152,7 @@ class RetrievalSystem:
             print(f"正在為 {len(questions)} 個問題計算向量...")
             self.question_embeddings = self.embedder.encode(questions, convert_to_tensor=True, show_progress_bar=True)
             print("向量計算完成")
     def retrieve_similar(self, user_question: str, top_k: int = 1) -> List[Dict]:
         if self.embedder is None or self.question_embeddings is None: return []
         try:
@@ -164,7 +170,7 @@ class CompleteTextToSQLSystem:
         self.data_loader = CompleteDataLoader(hf_token)
         self.retrieval_system = RetrievalSystem()
         self.initialize_system()
     def initialize_system(self):
         print("正在初始化完整數據系統...")
         self.data_loader.load_complete_dataset()
@@ -182,110 +188,135 @@ class CompleteTextToSQLSystem:
         """通用SQL生成器 (作為最終備用)"""
         # 此函數現在作為無法識別具體意圖時的通用後備方案
         return f"""-- 通用查詢範本
-SELECT
-    JobNo as 工作單號,
-    ApplicantName as 申請方,
-    OverallRating as 評級
-FROM TSR53SampleDescription
 LIMIT 20;"""
     def intelligent_repair_sql(self, user_question: str, similar_question: str) -> str:
         """智能修復SQL - 基於當前使用者問題的意圖"""
         analysis = analyze_question_type(user_question)
         intent = analysis["specific_intent"]
-        comment = f"-- 根據類似問題 '{similar_question}' (原SQL無效) 進行智能修復\n"
         if intent == "monthly_completion_count":
             year = self.extract_year(user_question)
             return comment + f"""-- 查詢 {year} 年每月完成的工作單數量
-SELECT
-    strftime('%Y-%m', jt.end_time) as 月份,
-    COUNT(*) as 完成數量
-FROM JobTimeline jt
-WHERE strftime('%Y', jt.end_time) = '{year}' AND jt.end_time IS NOT NULL
-GROUP BY strftime('%Y-%m', jt.end_time)
-ORDER BY 月份;"""
         elif intent == "rating_distribution":
             return comment + """-- 查詢評級分佈統計
-SELECT
-    OverallRating as 評級,
-    COUNT(*) as 數量,
-    ROUND(COUNT(*) * 100.0 / (SELECT COUNT(*) FROM TSR53SampleDescription WHERE OverallRating IS NOT NULL), 2) as 百分比
-FROM TSR53SampleDescription
-WHERE OverallRating IS NOT NULL
-GROUP BY OverallRating
-ORDER BY 數量 DESC;"""
         elif intent == "amount_ranking":
             return comment + """-- 查詢工作單金額排名
-WITH JobTotalAmount AS (
-    SELECT JobNo, SUM(LocalAmount) AS TotalAmount
-    FROM (SELECT DISTINCT JobNo, InvoiceCreditNoteNo, LocalAmount FROM TSR53Invoice WHERE LocalAmount IS NOT NULL)
-    GROUP BY JobNo
-)
-SELECT
-    jta.JobNo as 工作單號,
-    sd.ApplicantName as 申請方,
-    jta.TotalAmount as 總金額
-FROM JobTotalAmount jta
-JOIN TSR53SampleDescription sd ON sd.JobNo = jta.JobNo
-ORDER BY jta.TotalAmount DESC
-LIMIT 10;"""
         elif intent == "company_statistics":
             return comment + """-- 查詢申請方工作單統計
-SELECT
-    ApplicantName as 申請方名稱,
-    COUNT(*) as 工作單數量
-FROM TSR53SampleDescription
-WHERE ApplicantName IS NOT NULL
-GROUP BY ApplicantName
-ORDER BY 工作單數量 DESC
-LIMIT 20;"""
-        # 如果無法判斷具體意圖，使用原始的通用生成邏輯
-        return comment + self.generate_sql_from_question(user_question, analysis)
     def generate_sql(self, user_question: str) -> Tuple[str, str]:
         """主流程：生成SQL查詢 (改進版本)"""
-        log_messages = [f"⏰ {get_current_time()} 開始處理"]
         if not user_question or not user_question.strip():
-            return "請輸入您的問題。", "錯誤: 問題為空"
-        # 1. 檢索最相似的問題
         hits = self.retrieval_system.retrieve_similar(user_question)
         if hits:
             best_hit = hits[0]
             similarity_score = best_hit['score']
-            log_messages.append(f"🔍 檢索到最相似問題 (相似度: {similarity_score:.3f})")
             if similarity_score > SIMILARITY_THRESHOLD:
-                corpus_id = best_hit['corpus_id']
-                similar_question = self.data_loader.questions[corpus_id]
                 original_sql = self.data_loader.sql_answers[corpus_id]
                 validation = validate_sql(original_sql)
                 if validation["valid"] and validation["is_safe"]:
-                    log_messages.append("✅ 相似度高，且原SQL有效，直接採用。")
                     return original_sql, "\n".join(log_messages)
                 else:
-                    log_messages.append(f"⚠️ 相似度高，但原SQL無效 ({', '.join(validation['issues'])})。")
                     log_messages.append("🛠️ 啟用智能修復...")
                     repaired_sql = self.intelligent_repair_sql(user_question, similar_question)
                     return repaired_sql, "\n".join(log_messages)
-        log_messages.append("🤖 未找到高相似度或有效的範本，根據問題直接生成。")
-        analysis = analyze_question_type(user_question)
-        # 直接使用修復邏輯來生成，因為它本身就是基於意圖的生成器
         intelligent_sql = self.intelligent_repair_sql(user_question, "無相似問題")
-        log_messages.append(f"📋 問題意圖分析: {analysis['specific_intent']}")
-        log_messages.append("✅ 智能生成完成。")
         return intelligent_sql, "\n".join(log_messages)
 # ==================== 初始化系統 ====================
@@ -307,26 +338,47 @@ def process_query(user_question: str) -> Tuple[str, str, str]:
 with gr.Blocks(title="智慧Text-to-SQL系統", theme=gr.themes.Soft()) as demo:
     gr.Markdown("# 🚀 智慧 Text-to-SQL 系統 (進階修復版)")
     gr.Markdown("📊 **模式**: 結合「檢索驗證」與「意圖導向生成」，即使資料庫範本有誤也能提供準確查詢。")
     with gr.Row():
-        question_input = gr.Textbox(label="📝 請在此輸入您的問題", placeholder="例如：2024年每月完成多少份報告？", lines=3, scale=4)
         submit_btn = gr.Button("🚀 生成SQL", variant="primary", scale=1)
     with gr.Accordion("🔍 結果與日誌", open=True):
         sql_output = gr.Code(label="📊 生成的SQL查詢", language="sql", lines=10)
         status_output = gr.Textbox(label="🔍 執行狀態", interactive=False)
         log_output = gr.Textbox(label="📋 詳細日誌", lines=6, interactive=False)
     gr.Examples(
         examples=[
-            "2023 年每月完成多少份報告？",
-            "統計一下各種評級的分佈",
-            "找出總金額最高的5筆訂單來自哪個申請方",
-            "哪個客戶的工作單數量最多？"
         ],
         inputs=question_input
     )
 if __name__ == "__main__":
     if text_to_sql_system:
         print("Gradio 介面啟動中...")

     """驗證SQL語句的語法和安全性"""
     if not sql_query or not sql_query.strip():
         return {"valid": False, "issues": ["SQL語句為空"], "is_safe": False, "empty": True}
     sql_clean = sql_query.strip()
     if len(sql_clean) < 5:
         return {"valid": False, "issues": ["SQL過短"], "is_safe": False, "empty": True}
     security_issues = []
     sql_upper = sql_clean.upper()
     dangerous_keywords = ['DROP', 'DELETE', 'INSERT', 'UPDATE', 'ALTER', 'TRUNCATE', 'EXEC', 'EXECUTE']
     for keyword in dangerous_keywords:
         if f" {keyword} " in f" {sql_upper} ":
             security_issues.append(f"危險操作: {keyword}")
     if "SELECT" not in sql_upper:
         security_issues.append("缺少SELECT")
     if "FROM" not in sql_upper:
         security_issues.append("缺少FROM")
     is_valid = not security_issues
     is_safe = all('危險' not in issue for issue in security_issues)
     return {"valid": is_valid, "issues": security_issues, "is_safe": is_safe, "empty": False}
 def analyze_question_type(question: str) -> Dict:
     """增強的問題分析 - 更精確的意圖識別"""
     question_lower = question.lower()
     analysis = {
         "type": "unknown",
         "keywords": [],
         "has_group": "每" in question_lower or "各" in question_lower or "分組" in question_lower,
         "specific_intent": "general_query"  # 新增：具體意圖，預設為通用查詢
     }
     # **更精確的意圖識別**
     if "每月" in question_lower and ("完成" in question_lower or "報告" in question_lower or "工作單" in question_lower):
         analysis["specific_intent"] = "monthly_completion_count"
     elif ("公司" in question_lower or "客戶" in question_lower or "申請方" in question_lower) and ("統計" in question_lower or "數量" in question_lower or "排名" in question_lower):
         analysis["specific_intent"] = "company_statistics"
         analysis["type"] = "statistics"
     return analysis
 # ==================== 完整數據加載模塊 ====================
         self.sql_answers = []
         self.sql_quality = []
         self.schema_data = {}
     def load_complete_dataset(self) -> bool:
         """Load question/SQL pairs from the Hugging Face dataset into memory.

         Every item of the ``train`` split that has at least two entries under
         ``messages`` is parsed: the first entry's ``content`` is treated as the
         user prompt and the second as the assistant answer.  The question is
         the text after ``指令:`` on the same line (or the whole prompt when the
         label is missing); the SQL is the text after ``SQL查詢:`` (or the whole
         answer when the label is missing) with Markdown code fences removed.
         Pairs are appended to ``self.questions`` and ``self.sql_answers`` in
         lockstep, and only when both parts are non-empty.

         Returns:
             True when at least one pair was loaded; False when nothing was
             loaded or the dataset itself could not be fetched.
         """
         # Compiled once instead of being looked up again for every item.
         question_pattern = re.compile(r'指令:\s*(.*?)(?:\n|$)')
         # The previous pattern ended the capture at the FIRST newline, so a
         # multi-line statement was cut down to its first line and a fenced
         # block was reduced to the bare "```sql" marker (empty after fence
         # removal).  The capture now runs to the first blank line or to the
         # end of the answer.
         # NOTE(review): assumes any trailing explanation is separated from the
         # SQL by a blank line - confirm against the dataset.
         sql_pattern = re.compile(r'SQL查詢:\s*(.*?)(?:\n\s*\n|$)', re.DOTALL)
         fence_pattern = re.compile(r'```sql|```')
         try:
             print(f"[{get_current_time()}] 正在加載完整數據集 '{DATASET_REPO_ID}'...")
             raw_dataset = load_dataset(DATASET_REPO_ID, token=self.hf_token)['train']
             successful_loads = 0
             total_items = len(raw_dataset)
             for idx, item in enumerate(raw_dataset):
                 # Best effort per item: one malformed record must not abort
                 # the whole load, so it is reported and skipped.
                 try:
                     if 'messages' in item and len(item['messages']) >= 2:
                         user_content = item['messages'][0]['content']
                         assistant_content = item['messages'][1]['content']
                         question_match = question_pattern.search(user_content)
                         question = question_match.group(1).strip() if question_match else user_content
                         sql_match = sql_pattern.search(assistant_content)
                         sql_query = sql_match.group(1).strip() if sql_match else assistant_content
                         sql_query = fence_pattern.sub('', sql_query).strip()
                         if question and sql_query:  # keep only usable question/SQL pairs
                             self.questions.append(question)
                             self.sql_answers.append(sql_query)
                             successful_loads += 1
                 except Exception as e:
                     print(f"跳過第 {idx} 項資料，錯誤: {e}")
             print(f"數據加載完成: 成功載入 {successful_loads}/{total_items} 項")
             return successful_loads > 0
         except Exception as e:
             print(f"數據集加載失敗: {e}")
             return False
     def load_schema(self) -> bool:
         try:
             schema_file_path = hf_hub_download(repo_id=DATASET_REPO_ID, filename="sqlite_schema_FULL.json", repo_type='dataset', token=self.hf_token)
             print(f"正在為 {len(questions)} 個問題計算向量...")
             self.question_embeddings = self.embedder.encode(questions, convert_to_tensor=True, show_progress_bar=True)
             print("向量計算完成")
     def retrieve_similar(self, user_question: str, top_k: int = 1) -> List[Dict]:
         if self.embedder is None or self.question_embeddings is None: return []
         try:
         self.data_loader = CompleteDataLoader(hf_token)
         self.retrieval_system = RetrievalSystem()
         self.initialize_system()
     def initialize_system(self):
         print("正在初始化完整數據系統...")
         self.data_loader.load_complete_dataset()
         """通用SQL生成器 (作為最終備用)"""
         # 此函數現在作為無法識別具體意圖時的通用後備方案
         return f"""-- 通用查詢範本
+SELECT
+    JobNo as 工作單號,
+    ApplicantName as 申請方,
+    OverallRating as 評級
+FROM TSR53SampleDescription
 LIMIT 20;"""
     def intelligent_repair_sql(self, user_question: str, similar_question: str) -> str:
         """Build a SQL query from the intent detected in the user's question.

         The intent comes from ``analyze_question_type(user_question)``.  Each
         recognised intent maps to a fixed SQL template; any other intent gets
         the generic template at the end.  The result always starts with a SQL
         comment line describing how the query was produced.

         Args:
             user_question: The question currently being answered.
             similar_question: The retrieved question whose stored SQL was
                 rejected, or the sentinel ``"無相似問題"`` when no similar
                 question was used.

         Returns:
             The header comment followed by the selected SQL template.
         """
         analysis = analyze_question_type(user_question)
         intent = analysis["specific_intent"]
         if similar_question != "無相似問題":
             # The text lands inside a single-line "--" SQL comment.  A line
             # break in it would end the comment and leave the remainder as
             # live SQL, so all whitespace runs are collapsed to one space.
             similar_one_line = " ".join(str(similar_question).split())
             comment = f"-- 根據類似問題 '{similar_one_line}' (原SQL無效) 進行智能修復\n"
         else:
             comment = f"-- 根據問題意圖 '{intent}' 智能生成SQL\n"
         if intent == "monthly_completion_count":
             # NOTE(review): extract_year is defined outside this view and its
             # result is interpolated into the SQL unescaped - confirm it can
             # only yield digits.
             year = self.extract_year(user_question)
             return comment + f"""-- 查詢 {year} 年每月完成的工作單數量
     SELECT
         strftime('%Y-%m', jt.ReportAuthorization) as 月份,
         COUNT(*) as 完成數量
     FROM JobTimeline jt
     WHERE strftime('%Y', jt.ReportAuthorization) = '{year}'
         AND jt.ReportAuthorization IS NOT NULL
     GROUP BY strftime('%Y-%m', jt.ReportAuthorization)
     ORDER BY 月份;"""
         elif intent == "rating_distribution":
             return comment + """-- 查詢評級分佈統計
     SELECT
         OverallRating as 評級,
         COUNT(*) as 數量,
         ROUND(COUNT(*) * 100.0 / (
             SELECT COUNT(*)
             FROM TSR53SampleDescription
             WHERE OverallRating IS NOT NULL
         ), 2) as 百分比
     FROM TSR53SampleDescription
     WHERE OverallRating IS NOT NULL
     GROUP BY OverallRating
     ORDER BY 數量 DESC;"""
         elif intent == "amount_ranking":
             return comment + """-- 查詢工作單金額排名
     WITH JobTotalAmount AS (
         SELECT JobNo, SUM(LocalAmount) AS TotalAmount
         FROM (
             SELECT DISTINCT JobNo, InvoiceCreditNoteNo, LocalAmount
             FROM TSR53Invoice
             WHERE LocalAmount IS NOT NULL
         )
         GROUP BY JobNo
     )
     SELECT
         jta.JobNo as 工作單號,
         sd.ApplicantName as 申請方,
         jta.TotalAmount as 總金額
     FROM JobTotalAmount jta
     JOIN TSR53SampleDescription sd ON sd.JobNo = jta.JobNo
     WHERE sd.ApplicantName IS NOT NULL
     ORDER BY jta.TotalAmount DESC
     LIMIT 10;"""
         elif intent == "company_statistics":
             return comment + """-- 查詢申請方工作單統計
     SELECT
         ApplicantName as 申請方名稱,
         COUNT(*) as 工作單數量
     FROM TSR53SampleDescription
     WHERE ApplicantName IS NOT NULL
     GROUP BY ApplicantName
     ORDER BY 工作單數量 DESC
     LIMIT 20;"""
         # Generic template for every intent without a dedicated query.
         return comment + """-- 通用查詢範本
     SELECT
         JobNo as 工作單號,
         ApplicantName as 申請方,
         BuyerName as 買方,
         OverallRating as 評級
     FROM TSR53SampleDescription
     WHERE ApplicantName IS NOT NULL
     LIMIT 20;"""
     def generate_sql(self, user_question: str) -> Tuple[str, str]:
         """Generate a SQL query for a natural-language question.

         Steps, in order:
           1. An empty, whitespace-only or missing question returns a
              placeholder query with an error note.
           2. The question is classified with ``analyze_question_type``; the
              result is only written to the log here.
           3. The retrieval system is asked for the most similar stored
              question.  When its score exceeds ``SIMILARITY_THRESHOLD`` the
              stored SQL is returned if ``validate_sql`` reports it valid and
              safe; otherwise ``intelligent_repair_sql`` builds a replacement.
           4. Without a hit above the threshold, ``intelligent_repair_sql`` is
              called with the sentinel ``"無相似問題"``.

         Args:
             user_question: The question typed by the user.

         Returns:
             A ``(sql, log)`` tuple: the SQL text and the newline-joined
             processing log (a short error note for empty input).
         """
         # Validate first.  The log line below slices the question, which used
         # to raise TypeError for a None question before this guard could run.
         if not user_question or not user_question.strip():
             return "-- 錯誤: 請輸入有效問題\nSELECT '請輸入您的問題' as 錯誤信息;", "錯誤: 問題為空"
         log_messages = [f"⏰ {get_current_time()} 開始處理問題: '{user_question[:50]}...'"]
         # 1. Classify the question.
         analysis = analyze_question_type(user_question)
         log_messages.append(f"📋 問題分析 - 意圖: {analysis['specific_intent']}, 類型: {analysis['type']}")
         # 2. Retrieve the most similar stored question.
         hits = self.retrieval_system.retrieve_similar(user_question)
         if hits:
             best_hit = hits[0]
             similarity_score = best_hit['score']
             corpus_id = best_hit['corpus_id']
             similar_question = self.data_loader.questions[corpus_id]
             log_messages.append(f"🔍 找到相似問題 (相似度: {similarity_score:.3f}): '{similar_question[:50]}...'")
             if similarity_score > SIMILARITY_THRESHOLD:
                 original_sql = self.data_loader.sql_answers[corpus_id]
                 validation = validate_sql(original_sql)
                 if validation["valid"] and validation["is_safe"]:
                     log_messages.append("✅ 相似度高且原SQL有效，直接採用")
                     return original_sql, "\n".join(log_messages)
                 else:
                     # The stored SQL failed validation: rebuild from intent.
                     log_messages.append(f"⚠️ 原SQL有問題: {', '.join(validation['issues'])}")
                     log_messages.append("🛠️ 啟用智能修復...")
                     repaired_sql = self.intelligent_repair_sql(user_question, similar_question)
                     log_messages.append("✅ 智能修復完成")
                     return repaired_sql, "\n".join(log_messages)
             else:
                 log_messages.append(f"📉 相似度 ({similarity_score:.3f}) 低於閾值 ({SIMILARITY_THRESHOLD})")
         # 3. No usable template: generate purely from the detected intent.
         log_messages.append("🤖 未找到合適範本，使用意圖生成")
         intelligent_sql = self.intelligent_repair_sql(user_question, "無相似問題")
         log_messages.append("✅ 智能生成完成")
         return intelligent_sql, "\n".join(log_messages)
 # ==================== 初始化系統 ====================
 with gr.Blocks(title="智慧Text-to-SQL系統", theme=gr.themes.Soft()) as demo:
     # Page heading and a one-line description of the operating mode.
     gr.Markdown("# 🚀 智慧 Text-to-SQL 系統 (進階修復版)")
     gr.Markdown("📊 **模式**: 結合「檢索驗證」與「意圖導向生成」，即使資料庫範本有誤也能提供準確查詢。")

     # Input row: the question box next to the submit button.
     with gr.Row():
         question_input = gr.Textbox(label="📝 請在此輸入您的問題", placeholder="例如：2024年每月完成多少份報告？", lines=3, scale=4)
         submit_btn = gr.Button("🚀 生成SQL", variant="primary", scale=1)

     # Output panel: generated SQL, a status line and the detailed log.
     with gr.Accordion("🔍 結果與日誌", open=True):
         sql_output = gr.Code(
             label="📊 生成的SQL查詢",
             language="sql",
             lines=10,
         )
         status_output = gr.Textbox(
             label="🔍 執行狀態",
             interactive=False,
         )
         log_output = gr.Textbox(
             label="📋 詳細日誌",
             lines=6,
             interactive=False,
         )

     # Sample questions offered as examples for the question box.
     _sample_questions = [
         "2024年每月完成多少份報告？",
         "統計各種評級(Pass/Fail)的分布情況",
         "找出總金額最高的10個工作單來自哪些申請方",
         "哪些客戶的工作單數量最多？",
         "A組昨天完成了多少個測試項目？",
         "2024年Q1期間評級為Fail且總金額超過10000的工作單",
     ]
     gr.Examples(examples=_sample_questions, inputs=question_input)

     # Both the button click and submitting the textbox run the same handler
     # with the same input and output components.
     for _register in (submit_btn.click, question_input.submit):
         _register(
             fn=process_query,
             inputs=[question_input],
             outputs=[sql_output, status_output, log_output],
         )
 if __name__ == "__main__":
     if text_to_sql_system:
         print("Gradio 介面啟動中...")