Paul720810 committed on
Commit
954de7f
·
verified ·
1 Parent(s): 7b1b963

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +111 -129
app.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
  import requests
3
  import json
4
  import os
5
- from datasets import load_dataset, Dataset
6
  from sentence_transformers import SentenceTransformer, util
7
  import torch
8
  from huggingface_hub import hf_hub_download
@@ -11,24 +11,23 @@ import re
11
  # --- 配置區 ---
12
  HF_TOKEN = os.environ.get("HF_TOKEN")
13
  DATASET_REPO_ID = "Paul720810/Text-to-SQL-Softline"
14
- # 使用正確的模型名稱(7B版本更適合免費使用)
15
- LLM_API_URL = "https://api-inference.huggingface.co/models/codellama/CodeLlama-7b-hf"
16
- SIMILARITY_THRESHOLD = 0.90
17
 
18
  print("--- [1/5] 開始初始化應用 ---")
19
 
20
  # --- 1. 載入知識庫 ---
21
- qa_dataset = None
22
- schema_data = {}
23
  questions = []
24
  sql_answers = []
 
25
 
26
  try:
27
  print(f"--- [2/5] 正在從 '{DATASET_REPO_ID}' 載入知識庫... ---")
28
  raw_dataset = load_dataset(DATASET_REPO_ID, token=HF_TOKEN)['train']
29
 
30
- # 解析新的 messages 格式
31
- print("--- > 檢測到 'messages' 格式,正在解析...")
32
 
33
  for item in raw_dataset:
34
  try:
@@ -36,80 +35,48 @@ try:
36
  user_content = item['messages'][0]['content']
37
  assistant_content = item['messages'][1]['content']
38
 
39
- # 從用戶消息中提取問題
40
  question_match = re.search(r'指令:\s*(.*?)(?:\n|$)', user_content)
41
- if question_match:
42
- question = question_match.group(1).strip()
43
- else:
44
- # 如果沒有找到指令,使用整個內容
45
- question = user_content
46
 
47
- # 從助手消息中提取SQL
48
  sql_match = re.search(r'SQL查詢:\s*(.*?)(?:\n|$)', assistant_content, re.DOTALL)
49
  if sql_match:
50
  sql_query = sql_match.group(1).strip()
51
- # 清理SQL語句
52
- sql_query = re.sub(r'^sql\s*', '', sql_query) # 移除開頭的sql
53
- sql_query = re.sub(r'```sql|```', '', sql_query).strip() # 移除代碼塊標記
54
  else:
55
  sql_query = assistant_content
56
 
57
  questions.append(question)
58
  sql_answers.append(sql_query)
59
 
60
- except (KeyError, IndexError, TypeError) as e:
61
  print(f"解析錯誤,跳過該條目: {e}")
62
  continue
63
 
64
- # 創建問答數據集
65
- if questions:
66
- qa_dataset = Dataset.from_dict({
67
- 'question': questions,
68
- 'sql': sql_answers
69
- })
70
- else:
71
- raise ValueError("沒有成功解析出任何問答對")
72
 
73
- # 載入並解析 Schema JSON
74
- schema_file_path = "sqlite_schema_FULL.json"
75
  try:
76
- hf_hub_download(repo_id=DATASET_REPO_ID, filename=schema_file_path,
77
- repo_type='dataset', local_dir='.', token=HF_TOKEN)
78
-
 
 
 
79
  with open(schema_file_path, 'r', encoding='utf-8') as f:
80
  schema_data = json.load(f)
81
  except Exception as e:
82
  print(f"警告: 無法載入Schema文件: {e}")
83
- schema_data = {}
84
-
85
- print(f"--- > 成功解析 {len(questions)} 條問答範例。 ---")
86
 
87
  except Exception as e:
88
- print(f"!!! 錯誤: 處理Dataset時發生問題: {e}")
89
- # 創建備用數據集
90
  questions = ["示例問題"]
91
- sql_answers = ["SELECT '請檢查數據集格式' AS error;"]
92
- qa_dataset = Dataset.from_dict({"question": questions, "sql": sql_answers})
93
 
94
- # --- 2. 構建 DDL 和初始化檢索模型 ---
95
- def load_schema_as_ddl(schema_dict: dict) -> str:
96
- ddl_string = "/* 數據庫結構 */\n"
97
- for table_name, columns in schema_dict.items():
98
- if not isinstance(columns, list):
99
- continue
100
- ddl_string += f"CREATE TABLE `{table_name}` (\n"
101
- ddl_cols = []
102
- for col in columns:
103
- col_name = col.get('name', 'unknown')
104
- col_type = col.get('type', 'TEXT')
105
- col_desc = col.get('description', '')
106
- ddl_cols.append(f" `{col_name}` {col_type} -- {col_desc}")
107
- ddl_string += ",\n".join(ddl_cols) + "\n);\n\n"
108
- return ddl_string
109
-
110
- SCHEMA_DDL = load_schema_as_ddl(schema_data)
111
-
112
- print("--- [3/5] 正在載入句向量模型 (all-MiniLM-L6-v2)... ---")
113
  embedder = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
114
 
115
  # 計算問題向量
@@ -118,10 +85,28 @@ if questions:
118
  question_embeddings = embedder.encode(questions, convert_to_tensor=True, show_progress_bar=True)
119
  print("--- > 向量計算完成! ---")
120
  else:
121
- print("--- [4/5] 警告:沒有可用的問題來計算向量。 ---")
122
  question_embeddings = torch.Tensor([])
123
 
124
- # --- 3. 混合系統核心邏輯 ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  def get_sql_query(user_question: str):
126
  if not user_question:
127
  return "請輸入您的問題。", "日誌:用戶未輸入問題。"
@@ -129,50 +114,43 @@ def get_sql_query(user_question: str):
129
  log_messages = []
130
 
131
  # 檢索相似問題
132
- if len(questions) > 0:
133
- question_embedding = embedder.encode(user_question, convert_to_tensor=True)
134
- hits = util.semantic_search(question_embedding, question_embeddings, top_k=3)
135
-
136
- if hits and hits[0]:
137
- most_similar_hit = hits[0][0]
138
- similarity_score = most_similar_hit['score']
139
- similar_question = questions[most_similar_hit['corpus_id']]
140
-
141
- log_messages.append(f"檢索到相似問題: '{similar_question}' (相似度: {similarity_score:.4f})")
142
 
143
- if similarity_score > SIMILARITY_THRESHOLD:
144
- sql_result = sql_answers[most_similar_hit['corpus_id']]
145
- log_messages.append(f"相似度 > {SIMILARITY_THRESHOLD},直接返回預先SQL")
146
- return sql_result, "\n".join(log_messages)
147
- else:
148
- log_messages.append("檢索失敗:找不到相似問題")
 
 
 
 
 
 
 
 
 
 
 
 
149
  else:
150
  log_messages.append("知識庫為空,跳過檢索")
151
 
152
  # LLM生成模式
153
  log_messages.append("進入LLM生成模式...")
154
 
155
- # 構建示例上下文
156
- examples_context = ""
157
- if 'hits' in locals() and hits and hits[0]:
158
- for i, hit in enumerate(hits[0][:2]):
159
- examples_context += f"問題: {questions[hit['corpus_id']]}\nSQL: {sql_answers[hit['corpus_id']]}\n\n"
160
-
161
- # 構建提示詞
162
- prompt = f"""你是一個SQL專家。請根據數據庫結構生成SQL查詢。
163
-
164
- 數據庫結構:
165
- {SCHEMA_DDL}
166
-
167
- 參考示例:
168
- {examples_context}
169
 
170
- 請為以下問題生成SQL查詢:
171
- {user_question}
172
 
173
- 只輸出SQL語句,不要其他內容:
174
 
175
- """
176
 
177
  log_messages.append("正在請求雲端LLM...")
178
 
@@ -180,7 +158,7 @@ def get_sql_query(user_question: str):
180
  payload = {
181
  "inputs": prompt,
182
  "parameters": {
183
- "max_new_tokens": 300,
184
  "temperature": 0.1,
185
  "do_sample": False
186
  }
@@ -194,74 +172,78 @@ def get_sql_query(user_question: str):
194
  if isinstance(result, list) and len(result) > 0:
195
  generated_text = result[0]['generated_text'].strip()
196
 
197
- # 清理輸出,只保留SQL
198
- if "```sql" in generated_text:
199
- generated_text = generated_text.split("```sql")[1].split("```")[0].strip()
200
- elif "```" in generated_text:
201
- generated_text = generated_text.split("```")[1].strip() if len(generated_text.split("```")) > 2 else generated_text
202
 
203
  log_messages.append("LLM生成成功!")
204
  return generated_text, "\n".join(log_messages)
205
- else:
206
- raise Exception(f"API返回格式異常: {result}")
207
  else:
208
- raise Exception(f"API錯誤: {response.status_code} - {response.text}")
209
 
210
  except Exception as e:
211
  error_msg = f"LLM API調用失敗: {str(e)}"
212
  log_messages.append(error_msg)
213
 
214
- # 提供備用答案
215
- backup_sql = "SELECT 'AI服務暫時不可用,請稍後重試' AS status;"
216
  return backup_sql, "\n".join(log_messages)
217
 
218
- # --- 4. 創建 Gradio Web 界面 ---
219
- print("--- [5/5] 正在創建 Gradio Web 界面... ---")
220
- with gr.Blocks(theme=gr.themes.Soft()) as demo:
221
- gr.Markdown("# 🚀 智能 Text-to-SQL 系統 (混合模式)")
222
- gr.Markdown("輸入自然語言問題,系統會智能生成SQL查詢")
 
223
 
224
  with gr.Row():
225
  question_input = gr.Textbox(
226
- label="輸入您的問題",
227
- placeholder="例如:查詢去年的銷售總額",
228
- lines=2,
229
- scale=4
230
  )
231
- submit_button = gr.Button("生成SQL", variant="primary", scale=1)
 
 
 
232
 
233
  with gr.Row():
234
  sql_output = gr.Code(
235
- label="生成的 SQL 查詢",
236
  language="sql",
237
- lines=6
238
  )
239
 
240
  with gr.Row():
241
  log_output = gr.Textbox(
242
- label="系統日誌",
243
- lines=4,
244
  interactive=False
245
  )
246
 
247
- submit_button.click(
 
248
  fn=get_sql_query,
249
  inputs=question_input,
250
  outputs=[sql_output, log_output]
251
  )
252
 
 
 
 
 
 
 
 
253
  gr.Examples(
254
  examples=[
255
- "2024年最好的5個客戶以及業績",
256
- "比較2023年跟2024年的業績",
257
- "上週C組完成了幾份報告",
258
- "有沒有快到期的訂單?",
259
- "哪個客戶的付款最不及時?"
260
  ],
261
- inputs=question_input,
262
- label="示例問題"
263
  )
264
 
265
- print("--- 應用準備啟動 ---")
266
  if __name__ == "__main__":
267
- demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
 
2
  import requests
3
  import json
4
  import os
5
+ from datasets import load_dataset
6
  from sentence_transformers import SentenceTransformer, util
7
  import torch
8
  from huggingface_hub import hf_hub_download
 
11
  # --- 配置區 ---
12
  HF_TOKEN = os.environ.get("HF_TOKEN")
13
  DATASET_REPO_ID = "Paul720810/Text-to-SQL-Softline"
14
+ # 使用更可靠且免費的模型
15
+ LLM_API_URL = "https://api-inference.huggingface.co/models/microsoft/DialoGPT-large"
16
+ SIMILARITY_THRESHOLD = 0.85 # 降低閾值以提高檢索命中率
17
 
18
  print("--- [1/5] 開始初始化應用 ---")
19
 
20
  # --- 1. 載入知識庫 ---
 
 
21
  questions = []
22
  sql_answers = []
23
+ schema_data = {}
24
 
25
  try:
26
  print(f"--- [2/5] 正在從 '{DATASET_REPO_ID}' 載入知識庫... ---")
27
  raw_dataset = load_dataset(DATASET_REPO_ID, token=HF_TOKEN)['train']
28
 
29
+ # 解析 messages 格式
30
+ print("--- > 解析 messages 格式...")
31
 
32
  for item in raw_dataset:
33
  try:
 
35
  user_content = item['messages'][0]['content']
36
  assistant_content = item['messages'][1]['content']
37
 
38
+ # 提取問題
39
  question_match = re.search(r'指令:\s*(.*?)(?:\n|$)', user_content)
40
+ question = question_match.group(1).strip() if question_match else user_content
 
 
 
 
41
 
42
+ # 提取SQL
43
  sql_match = re.search(r'SQL查詢:\s*(.*?)(?:\n|$)', assistant_content, re.DOTALL)
44
  if sql_match:
45
  sql_query = sql_match.group(1).strip()
46
+ sql_query = re.sub(r'^sql\s*', '', sql_query)
47
+ sql_query = re.sub(r'```sql|```', '', sql_query).strip()
 
48
  else:
49
  sql_query = assistant_content
50
 
51
  questions.append(question)
52
  sql_answers.append(sql_query)
53
 
54
+ except Exception as e:
55
  print(f"解析錯誤,跳過該條目: {e}")
56
  continue
57
 
58
+ print(f"--- > 成功解析 {len(questions)} 條問答範例 ---")
 
 
 
 
 
 
 
59
 
60
+ # 載入Schema
 
61
  try:
62
+ schema_file_path = hf_hub_download(
63
+ repo_id=DATASET_REPO_ID,
64
+ filename="sqlite_schema_FULL.json",
65
+ repo_type='dataset',
66
+ token=HF_TOKEN
67
+ )
68
  with open(schema_file_path, 'r', encoding='utf-8') as f:
69
  schema_data = json.load(f)
70
  except Exception as e:
71
  print(f"警告: 無法載入Schema文件: {e}")
 
 
 
72
 
73
  except Exception as e:
74
+ print(f"錯誤: 載入數據集失敗: {e}")
 
75
  questions = ["示例問題"]
76
+ sql_answers = ["SELECT '數據庫連接成功' AS status;"]
 
77
 
78
+ # --- 2. 初始化檢索模型 ---
79
+ print("--- [3/5] 正在載入句向量模型... ---")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80
  embedder = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
81
 
82
  # 計算問題向量
 
85
  question_embeddings = embedder.encode(questions, convert_to_tensor=True, show_progress_bar=True)
86
  print("--- > 向量計算完成! ---")
87
  else:
88
+ print("--- [4/5] 警告:沒有可用的問題 ---")
89
  question_embeddings = torch.Tensor([])
90
 
91
+ # --- 3. 構建DDL ---
92
+ def build_schema_context(schema_dict):
93
+ if not schema_dict:
94
+ return "/* 無Schema信息 */"
95
+
96
+ context = "/* 數據庫表結構 */\n"
97
+ for table_name, columns in schema_dict.items():
98
+ if isinstance(columns, list):
99
+ context += f"\n-- 表: {table_name}\n"
100
+ for col in columns:
101
+ col_name = col.get('name', 'unknown')
102
+ col_type = col.get('type', 'TEXT')
103
+ col_desc = col.get('description', '')
104
+ context += f"-- {col_name} ({col_type}) - {col_desc}\n"
105
+ return context
106
+
107
+ SCHEMA_CONTEXT = build_schema_context(schema_data)
108
+
109
+ # --- 4. 核心邏輯 ---
110
  def get_sql_query(user_question: str):
111
  if not user_question:
112
  return "請輸入您的問題。", "日誌:用戶未輸入問題。"
 
114
  log_messages = []
115
 
116
  # 檢索相似問題
117
+ if len(questions) > 0 and len(question_embeddings) > 0:
118
+ try:
119
+ question_embedding = embedder.encode(user_question, convert_to_tensor=True)
120
+ hits = util.semantic_search(question_embedding, question_embeddings, top_k=3)
 
 
 
 
 
 
121
 
122
+ if hits and hits[0]:
123
+ most_similar_hit = hits[0][0]
124
+ similarity_score = most_similar_hit['score']
125
+ similar_question = questions[most_similar_hit['corpus_id']]
126
+
127
+ log_messages.append(f"檢索到相似問題: '{similar_question}' (相似度: {similarity_score:.3f})")
128
+
129
+ if similarity_score > SIMILARITY_THRESHOLD:
130
+ sql_result = sql_answers[most_similar_hit['corpus_id']]
131
+ log_messages.append(f"相似度 > {SIMILARITY_THRESHOLD},直接返回預先SQL")
132
+ return sql_result, "\n".join(log_messages)
133
+ else:
134
+ log_messages.append(f"相似度低於閾值 {SIMILARITY_THRESHOLD}")
135
+ else:
136
+ log_messages.append("檢索失敗:找不到相似問題")
137
+
138
+ except Exception as e:
139
+ log_messages.append(f"檢索過程出錯: {e}")
140
  else:
141
  log_messages.append("知識庫為空,跳過檢索")
142
 
143
  # LLM生成模式
144
  log_messages.append("進入LLM生成模式...")
145
 
146
+ # 構建提示詞 - 更簡單的版本
147
+ prompt = f"""請根據以下數據庫結構,為這個問題生成SQL查詢:
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
+ {SCHEMA_CONTEXT}
 
150
 
151
+ 問題:{user_question}
152
 
153
+ 請只輸出SQL語句:"""
154
 
155
  log_messages.append("正在請求雲端LLM...")
156
 
 
158
  payload = {
159
  "inputs": prompt,
160
  "parameters": {
161
+ "max_new_tokens": 200,
162
  "temperature": 0.1,
163
  "do_sample": False
164
  }
 
172
  if isinstance(result, list) and len(result) > 0:
173
  generated_text = result[0]['generated_text'].strip()
174
 
175
+ # 簡單清理
176
+ generated_text = re.sub(r'^```sql|```$', '', generated_text).strip()
 
 
 
177
 
178
  log_messages.append("LLM生成成功!")
179
  return generated_text, "\n".join(log_messages)
 
 
180
  else:
181
+ raise Exception(f"API錯誤: {response.status_code}")
182
 
183
  except Exception as e:
184
  error_msg = f"LLM API調用失敗: {str(e)}"
185
  log_messages.append(error_msg)
186
 
187
+ # 提供更有用的備用答案
188
+ backup_sql = "SELECT 'AI服務暫時不可用,請稍後再試或聯繫管理員' AS status;"
189
  return backup_sql, "\n".join(log_messages)
190
 
191
+ # --- 5. 創建界面 ---
192
+ print("--- [5/5] 正在創建 Web 界面... ---")
193
+
194
+ with gr.Blocks(title="智能Text-to-SQL系統") as demo:
195
+ gr.Markdown("# 🤖 智能 Text-to-SQL 系統")
196
+ gr.Markdown("輸入自然語言問題,自動生成SQL查詢")
197
 
198
  with gr.Row():
199
  question_input = gr.Textbox(
200
+ label="您的問題",
201
+ placeholder="例如:查詢去年的銷售數據",
202
+ lines=2
 
203
  )
204
+
205
+ with gr.Row():
206
+ submit_btn = gr.Button("生成SQL", variant="primary")
207
+ clear_btn = gr.Button("清除")
208
 
209
  with gr.Row():
210
  sql_output = gr.Code(
211
+ label="生成的SQL",
212
  language="sql",
213
+ lines=5
214
  )
215
 
216
  with gr.Row():
217
  log_output = gr.Textbox(
218
+ label="執行日誌",
219
+ lines=3,
220
  interactive=False
221
  )
222
 
223
+ # 綁定事件
224
+ submit_btn.click(
225
  fn=get_sql_query,
226
  inputs=question_input,
227
  outputs=[sql_output, log_output]
228
  )
229
 
230
+ clear_btn.click(
231
+ fn=lambda: ["", ""],
232
+ inputs=[],
233
+ outputs=[sql_output, log_output]
234
+ )
235
+
236
+ # 示例
237
  gr.Examples(
238
  examples=[
239
+ "查詢2024年銷售額最高的產品",
240
+ "顯示最近30天的訂單",
241
+ "統計每個客戶的訂單數量",
242
+ "找出庫存不足的商品"
 
243
  ],
244
+ inputs=question_input
 
245
  )
246
 
247
+ print("--- 應用啟動完成 ---")
248
  if __name__ == "__main__":
249
+ demo.launch(server_name="0.0.0.0", server_port=7860)