Spaces:

Paul720810
/

Softline-SQL-Assistant

Sleeping

App Files Files Community

Paul720810 committed on Sep 4, 2025

Commit

fea361d

verified ·

1 Parent(s): 6c5a7ad

Update app.py

Browse files

Files changed (1) hide show

app.py +109 -281

app.py CHANGED Viewed

@@ -1,148 +1,103 @@
 import gradio as gr
-import requests
-import json
 import os
 import re
 from datetime import datetime
 from datasets import load_dataset
 from sentence_transformers import SentenceTransformer, util
-import torch
 from huggingface_hub import hf_hub_download
 from typing import List, Dict, Tuple, Optional
-import numpy as np
 # ==================== 配置區 ====================
-HF_TOKEN = os.environ.get("HF_TOKEN", None)
 DATASET_REPO_ID = "Paul720810/Text-to-SQL-Softline"
-# === 修改開始 ===
-# 我們不再需要硬性的相似度閾值，因為現在的策略是「參考」而非「直接採用」。
-# SIMILARITY_THRESHOLD = 0.65
-# 新增一個配置，決定要檢索多少個範例來當作參考
-FEW_SHOT_EXAMPLES_COUNT = 2 # 檢索最相似的2個範例
-# === 修改結束 ===
-# 雲端環境檢測
-IS_SPACES = os.environ.get("SPACE_ID") is not None
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print("=" * 60)
-print("🤖 智能 Text-to-SQL 系統啟動中...")
-print(f"📊 模式: 讀取全部數據（來自 {DATASET_REPO_ID}）")
-print(f"🌐 環境: {'Hugging Face Spaces' if IS_SPACES else '本地環境'}")
 print(f"💻 設備: {DEVICE}")
 print("=" * 60)
-# ==================== 獨立工具函數 (不依賴類別實例) ====================
 def get_current_time():
-    """獲取當前時間字串"""
     return datetime.now().strftime('%Y-%m-%d %H:%M:%S')
 def format_log(message: str, level: str = "INFO") -> str:
-    """格式化日誌訊息"""
     return f"[{get_current_time()}] [{level.upper()}] {message}"
 def parse_sql_from_response(response_text: str) -> Optional[str]:
-    """從API回應中提取SQL代碼"""
     match = re.search(r"```sql\n(.*?)\n```", response_text, re.DOTALL)
     if match:
         return match.group(1).strip()
-    # 新增備用解析：如果找不到```sql ...```，直接嘗試解析JSON中的SQL
-    try:
-        data = json.loads(response_text)
-        if "SQL查詢" in data and "```sql" in data["SQL查詢"]:
-             match = re.search(r"```sql\n(.*?)\n```", data["SQL查詢"], re.DOTALL)
-             if match:
-                return match.group(1).strip()
-    except json.JSONDecodeError:
-        pass # 不是合法的JSON，忽略
     return None
-# ==================== 核心 Text-to-SQL 系統類別 ====================
-from transformers import AutoModelForCausalLM, AutoTokenizer
 class TextToSQLSystem:
-    def __init__(self, model_name='sentence-transformers/paraphrase-multilingual-mpnet-base-v2'):
         self.log_history = []
         self._log("初始化系統...")
-        # 載入檢索模型
         self.schema = self._load_schema()
-        self.model = SentenceTransformer(model_name, device=DEVICE)
         self.dataset, self.corpus_embeddings = self._load_and_encode_dataset()
-        # ✅ 載入你自己的 Hugging Face 模型
-        self.generation_model_id = "Paul720810/qwen2.5-coder-1.5b-sql-finetuned"
-        self.tokenizer = AutoTokenizer.from_pretrained(self.generation_model_id)
-        self.generation_model = AutoModelForCausalLM.from_pretrained(
-            self.generation_model_id,
-            device_map="auto",
-            torch_dtype="auto"
         )
-        self._log("✅ 系統初始化完成，已準備就緒。")
-    def huggingface_api_call(self, prompt: str) -> str:
-        """直接使用本地載入的模型生成結果"""
-        try:
-            self._log("🧠 開始本地生成 SQL...")
-            inputs = self.tokenizer(prompt, return_tensors="pt").to(self.generation_model.device)
-            outputs = self.generation_model.generate(
-                **inputs,
-                max_new_tokens=512,
-                do_sample=True,
-                temperature=0.7,
-                top_p=0.9
-            )
-            result = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
-            self._log("✅ 本地生成完成。")
-            return result
-        except Exception as e:
-            self._log(f"❌ 本地生成失敗: {e}", "ERROR")
-            return f"本地生成錯誤: {e}"
     def _log(self, message: str, level: str = "INFO"):
         self.log_history.append(format_log(message, level))
         print(format_log(message, level))
     def _load_schema(self) -> Dict:
-        """從JSON檔案載入資料庫結構"""
         try:
-            schema_path = hf_hub_download(repo_id=DATASET_REPO_ID, filename="sqlite_schema_FULL.json", repo_type="dataset")
-            with open(schema_path, 'r', encoding='utf-8') as f:
                 self._log("成功載入資料庫結構 (sqlite_schema_FULL.json)")
                 return json.load(f)
         except Exception as e:
-            self._log(f"❌ 載入資料庫結構失敗: {e}", "ERROR")
             return {}
-    def _format_schema_for_prompt(self) -> str:
-        """將 schema JSON 物件格式化為清晰的字串，用於提示"""
-        formatted_string = "資料庫結構 (Database Schema):\n"
-        for table_name, columns in self.schema.items():
-            formatted_string += f"Table: {table_name}\n"
-            for col in columns:
-                col_name = col.get('name', 'N/A')
-                col_type = col.get('type', 'N/A')
-                col_desc = col.get('description', '')
-                formatted_string += f"  - {col_name} ({col_type}) # {col_desc}\n"
-            formatted_string += "\n"
-        return formatted_string
-    def _load_and_encode_dataset(self) -> Tuple[Optional[List[Dict]], Optional[torch.Tensor]]:
-        """載入訓練數據集並對問題進行編碼"""
         try:
             dataset = load_dataset(DATASET_REPO_ID, data_files="training_data.jsonl", split="train")
-            # 提取所有 "user" 的 "content" 作為語料庫
             corpus = [item['messages'][0]['content'] for item in dataset]
-            self._log(f"正在對 {len(corpus)} 個範例問題進行編碼...")
             embeddings = self.model.encode(corpus, convert_to_tensor=True, device=DEVICE)
             self._log("✅ 範例問題編碼完成。")
             return dataset, embeddings
@@ -151,212 +106,85 @@ class TextToSQLSystem:
             return None, None
     def find_most_similar(self, question: str, top_k: int) -> List[Dict]:
-        """尋找最相似的K個問題及其對應的SQL"""
-        if self.corpus_embeddings is None or self.dataset is None:
-            return []
-        question_embedding = self.model.encode(question, convert_to_tensor=True, device=DEVICE)
-        cos_scores = util.cos_sim(question_embedding, self.corpus_embeddings)[0]
-        top_results = torch.topk(cos_scores, k=min(top_k, len(self.corpus_embeddings)))
-        similar_examples = []
-        for score, idx in zip(top_results[0], top_results[1]):
             item = self.dataset[idx.item()]
-            user_content = item['messages'][0]['content']
-            assistant_content = item['messages'][1]['content']
-            # 從 assistant_content 中提取純 SQL
-            sql_query = parse_sql_from_response(assistant_content)
-            if not sql_query:
-                # 如果解析失敗，可能是格式問題，這裡做個備份
-                sql_query = "無法解析範例SQL"
-            similar_examples.append({
-                "similarity": score.item(),
-                "question": user_content,
-                "sql": sql_query
-            })
-        return similar_examples
-    # === 修改開始: 重寫核心處理邏輯 ===
-    def _build_prompt_for_generation(self, user_question: str, examples: List[Dict]) -> str:
-        """
-        **新增的函數**
-        根據我們的「檢索-增強-生成」策略，建立一個豐富的提示(Prompt)。
-        """
-        # 1. 任務指令 (System Instruction)
-        #    明確告訴 AI 它的角色和目標。
         system_instruction = (
-            "你是一位頂尖的資料庫專家，精通 SQLite。你的任務是根據使用者提出的問題，"
-            "參考提供的資料庫結構和相似的 SQL 查詢範例，生成一個精確、高效的 SQLite 查詢語法。\n"
-            "請將最終的 SQL 查詢語法包裝在 ```sql ... ``` 區塊中。"
         )
-        # 2. 資料庫結構 (Database Schema)
-        #    讓 AI 了解有哪些資料表和欄位可用。
-        schema_string = self._format_schema_for_prompt()
-        # 3. 參考範例 (Few-shot Examples)
-        #    給 AI 看「過去的優良作業」，讓它學習語法風格和邏輯。
-        examples_string = "--- 參考範例 ---\n"
-        if not examples:
-            examples_string += "無\n"
-        else:
-            for i, example in enumerate(examples, 1):
-                # 為了讓提示更清晰，我們只取範例中的 `指令` 部分
-                clean_question = re.search(r"指令:\s*(.*)", example['question'])
-                if clean_question:
-                    question_to_show = clean_question.group(1).strip()
-                else:
-                    question_to_show = example['question'] # 如果格式不符，顯示原文
-                examples_string += f"範例 {i}:\n"
-                examples_string += f"  - 使用者問題: \"{question_to_show}\"\n"
-                examples_string += f"  - SQL 查詢:\n```sql\n{example['sql']}\n```\n\n"
-        # 4. 新的使用者問題 (User's New Question)
-        #    這是 AI 這次需要解決的核心問題。
-        final_question_section = (
-            "--- 任務開始 ---\n"
-            f"請根據以上的資料庫結構和參考範例，為以下使用者問題生成 SQL 查詢：\n"
-            f"使用者問題: \"{user_question}\""
-        )
-        # 組合完整的提示
-        full_prompt = (
-            f"{system_instruction}\n\n"
-            f"{schema_string}\n"
-            f"{examples_string}"
-            f"{final_question_section}"
-        )
-        self._log("已建立給 AI 的完整提示 (Prompt):\n" + "="*20 + f"\n{full_prompt}\n" + "="*20)
-        return full_prompt
     def process_question(self, question: str) -> Tuple[str, str]:
-        """
-        處理使用者問題的核心函數。
-        採用「檢索-增強-生成」(RAG) 流程。
-        """
-        self.log_history = [] # 清空上次日誌
-        self._log(f"⏰ 開始處理問題: '{question}'")
-        # 步驟 1: 檢索 (Retrieval)
-        # 無論如何，都先尋找最相似的範例作為參考資料。
-        self._log(f"🔍 正在從 {len(self.dataset)} 個範例中尋找最相似的 {FEW_SHOT_EXAMPLES_COUNT} 個參考...")
-        similar_examples = self.find_most_similar(question, top_k=FEW_SHOT_EXAMPLES_COUNT)
-        if similar_examples:
-            for ex in similar_examples:
-                 self._log(f"  - 找到相似範例 (相似度: {ex['similarity']:.3f}): '{ex['question'][:50]}...'")
-        else:
-            self._log("  - 未找到相似範例。", "WARNING")
-        # 步驟 2: 增強 (Augmentation)
-        # 建立一個包含所有必要資訊的豐富提示。
-        self._log("📝 正在建立給 AI 的完整提示 (Prompt)...")
-        prompt = self._build_prompt_for_generation(question, similar_examples)
-        # 步驟 3: 生成 (Generation)
-        # 將判斷權交給 AI，讓它根據完整的上下文生成 SQL。
-        self._log("🧠 將判斷權交給 AI，開始生成 SQL...")
-        api_response = self.huggingface_api_call(prompt)
-        # 處理並回傳結果
-        sql_query = parse_sql_from_response(api_response)
-        if sql_query:
-            self._log(f"✅ 成功從 AI 回應中解析出 SQL！")
-            status = "生成成功"
-            return sql_query, status
-        else:
-            self._log("❌ 未能從 AI 回應中解析出有效的 SQL。", "ERROR")
-            self._log(f"  - AI 原始回應: {api_response}", "DEBUG")
-            status = "生成失敗"
-            return f"無法從 AI 的回應中提取 SQL。\n\n原始回應:\n{api_response}", status
-    # === 修改結束 ===
-# ==================== Gradio 介面設定 ====================
-text_to_sql_system = None
-try:
-    text_to_sql_system = TextToSQLSystem()
-except Exception as e:
-    print(f"初始化 TextToSQLSystem 失敗: {e}")
-def process_query(question: str) -> Tuple[str, str, str]:
-    """Gradio 的處理函數"""
-    if not text_to_sql_system:
-        error_msg = "系統初始化失敗，無法處理請求。"
-        return error_msg, "失敗", error_msg
-    if not question.strip():
-        return "", "等待輸入", "請輸入您的問題。"
-    sql_result, status = text_to_sql_system.process_question(question)
-    log_output = "\n".join(text_to_sql_system.log_history)
-    return sql_result, status, log_output
-# Gradio 介面佈局
-with gr.Blocks(theme=gr.themes.Soft(), title="Text-to-SQL 智能查詢系統") as demo:
-    gr.Markdown("# 📊 Text-to-SQL 智能查詢系統")
-    gr.Markdown("輸入您的自然語言問題，系統將自動轉換為 SQL 查詢語法。")
     with gr.Row():
         with gr.Column(scale=2):
-            question_input = gr.Textbox(
-                lines=3,
-                label="💬 您的問題",
-                placeholder="例如：2024年每月完成了多少份報告？"
-            )
-            submit_btn = gr.Button("🚀 生成 SQL", variant="primary")
-            status_output = gr.Textbox(label="處理狀態", interactive=False)
         with gr.Column(scale=3):
-            sql_output = gr.Code(label="🤖 生成的 SQL 查詢", language="sql")
-    with gr.Accordion("🔍 顯示詳細處理日誌", open=False):
-        log_output = gr.Textbox(lines=15, label="日誌", interactive=False)
-    # 優化的範例
-    gr.Examples(
-        examples=[
-            "2024年每月完成多少份報告？",
-            "統計各種評級(Pass/Fail)的分布情況",
-            "找出總金額最高的10個工作單來自哪些申請方",
-            "哪些客戶的工作單數量最多？",
-            "A組昨天完成了多少個測試項目？",
-            "2024年Q1期間評級為Fail且總金額超過10000的工作單"
-        ],
-        inputs=question_input,
-        label="💡 範例問題 (點擊試用)"
-    )
-    # 綁定事件
-    submit_btn.click(
-        fn=process_query,
-        inputs=[question_input],
-        outputs=[sql_output, status_output, log_output]
-    )
-    question_input.submit(
-        fn=process_query,
-        inputs=[question_input],
-        outputs=[sql_output, status_output, log_output]
-    )
 if __name__ == "__main__":
-    if text_to_sql_system:
-        print("Gradio 介面啟動中...")
-        # 根據環境選擇啟動參數
-        if IS_SPACES:
-            # Hugging Face Spaces 環境
-            print("🌐 在 Hugging Face Spaces 環境中啟動...")
-            demo.launch(
-                server_name="0.0.0.0",
-                server_port=7860,
-            )
-        else:
-            # 本地環境
-            print("🏠 在本地環境中啟動 ([http://127.0.0.1:7860](http://127.0.0.1:7860))...")
-            demo.launch()

 import gradio as gr
 import os
 import re
+import json
+import torch
+import numpy as np
 from datetime import datetime
 from datasets import load_dataset
 from sentence_transformers import SentenceTransformer, util
 from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
 from typing import List, Dict, Tuple, Optional
 # ==================== 配置區 ====================
 DATASET_REPO_ID = "Paul720810/Text-to-SQL-Softline"
+GGUF_REPO_ID = "Paul720810/gguf-models"
+GGUF_FILENAME = "qwen2.5-coder-1.5b-sql-finetuned.q4_k_m.gguf"
+FEW_SHOT_EXAMPLES_COUNT = 2
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print("=" * 60)
+print("🤖 Text-to-SQL (GGUF) 系統啟動中...")
+print(f"📊 數據集: {DATASET_REPO_ID}")
 print(f"💻 設備: {DEVICE}")
 print("=" * 60)
+# ==================== 工具函數 ====================
 def get_current_time():
     """Return the current local time as a ``YYYY-MM-DD HH:MM:SS`` string."""
     now = datetime.now()
     return now.strftime('%Y-%m-%d %H:%M:%S')
 def format_log(message: str, level: str = "INFO") -> str:
     """Build one log line: ``[timestamp] [LEVEL] message``.

     The timestamp comes from ``get_current_time()`` and *level* is
     upper-cased before being placed in the tag.
     """
     timestamp = get_current_time()
     tag = level.upper()
     return f"[{timestamp}] [{tag}] {message}"
 def parse_sql_from_response(response_text: str) -> Optional[str]:
     """Extract the first fenced SQL block from a model response.

     The fence tag is matched case-insensitively, trailing spaces/tabs after
     the tag and CRLF line endings are tolerated, and the closing fence does
     not have to be preceded by a newline.

     Args:
         response_text: Raw text produced by the model (or a stored answer).

     Returns:
         The SQL text with surrounding whitespace stripped, or ``None`` when
         the input is empty/``None``, contains no fenced SQL block, or the
         block is empty.
     """
     if not response_text:
         return None
     match = re.search(
         r"```sql[ \t]*\r?\n(.*?)```",
         response_text,
         re.DOTALL | re.IGNORECASE,
     )
     if match is None:
         return None
     # Callers only test the result for truthiness, so an empty block is
     # reported the same way as "no block found".
     return match.group(1).strip() or None
+# ==================== Text-to-SQL 核心類 ====================
 class TextToSQLSystem:
+    def __init__(self, embed_model='sentence-transformers/paraphrase-multilingual-mpnet-base-v2'):
         self.log_history = []
         self._log("初始化系統...")
+        # 1. 載入 schema
         self.schema = self._load_schema()
+        # 2. 載入檢索模型
+        self.model = SentenceTransformer(embed_model, device=DEVICE)
         self.dataset, self.corpus_embeddings = self._load_and_encode_dataset()
+        # 3. 載入 GGUF 模型
+        model_path = hf_hub_download(
+            repo_id=GGUF_REPO_ID,
+            filename=GGUF_FILENAME,
+            repo_type="dataset"
         )
+        self.llm = Llama(
+            model_path=model_path,
+            n_ctx=4096,
+            n_threads=8,
+            verbose=False
+        )
+        self._log(f"✅ 已載入 GGUF 模型: {GGUF_FILENAME}")
     def _log(self, message: str, level: str = "INFO"):
         """Append a formatted log line to ``self.log_history`` and print it.

         The line is formatted exactly once so that the stored entry and the
         printed entry always carry the same timestamp.

         Args:
             message: Text to log.
             level: Severity tag; ``format_log`` upper-cases it.
         """
         entry = format_log(message, level)
         self.log_history.append(entry)
         print(entry)
     def _load_schema(self) -> Dict:
         """Download and parse ``sqlite_schema_FULL.json`` from the dataset repo.

         Returns:
             The parsed JSON object, or an empty dict when the download or the
             parse fails (the failure is logged at ERROR level).
         """
         try:
             schema_path = hf_hub_download(
                 repo_id=DATASET_REPO_ID,
                 filename="sqlite_schema_FULL.json",
                 repo_type="dataset"
             )
             with open(schema_path, "r", encoding="utf-8") as f:
                 schema = json.load(f)
         except Exception as e:
             self._log(f"❌ 載入 schema 失敗: {e}", "ERROR")
             return {}
         # Report success only after the file has actually been parsed, so a
         # malformed file no longer logs "success" followed by an error.
         self._log("成功載入資料庫結構 (sqlite_schema_FULL.json)")
         return schema
+    def _format_schema_for_prompt(self) -> str:
+        formatted = "資料庫結構:\n"
+        for table, cols in self.schema.items():
+            formatted += f"Table: {table}\n"
+            for col in cols:
+                formatted += f"  - {col['name']} ({col['type']}) # {col.get('description','')}\n"
+            formatted += "\n"
+        return formatted
+    def _load_and_encode_dataset(self):
         try:
             dataset = load_dataset(DATASET_REPO_ID, data_files="training_data.jsonl", split="train")
             corpus = [item['messages'][0]['content'] for item in dataset]
+            self._log(f"正在編碼 {len(corpus)} 個問題...")
             embeddings = self.model.encode(corpus, convert_to_tensor=True, device=DEVICE)
             self._log("✅ 範例問題編碼完成。")
             return dataset, embeddings
             return None, None
     def find_most_similar(self, question: str, top_k: int) -> List[Dict]:
+        if self.corpus_embeddings is None: return []
+        q_emb = self.model.encode(question, convert_to_tensor=True, device=DEVICE)
+        scores = util.cos_sim(q_emb, self.corpus_embeddings)[0]
+        top = torch.topk(scores, k=min(top_k, len(self.corpus_embeddings)))
+        results = []
+        for score, idx in zip(top[0], top[1]):
             item = self.dataset[idx.item()]
+            q = item['messages'][0]['content']
+            a = item['messages'][1]['content']
+            sql = parse_sql_from_response(a) or "無法解析範例SQL"
+            results.append({"similarity": score.item(), "question": q, "sql": sql})
+        return results
+    def _build_prompt(self, user_q: str, examples: List[Dict]) -> str:
         system_instruction = (
+            "你是一位資料庫專家，請根據使用者的問題，參考資料庫結構與範例，"
+            "生成正確的 SQLite 查詢，並用 ```sql ... ``` 包起來。"
         )
+        schema_str = self._format_schema_for_prompt()
+        ex_str = "--- 範例 ---\n"
+        for i, ex in enumerate(examples, 1):
+            ex_str += f"範例 {i} 問題: {ex['question']}\nSQL:\n```sql\n{ex['sql']}\n```\n\n"
+        return f"{system_instruction}\n\n{schema_str}\n{ex_str}\n--- 使用者問題 ---\n{user_q}"
+    def huggingface_api_call(self, prompt: str) -> str:
+        try:
+            self._log("🧠 使用 GGUF 模型生成 SQL...")
+            output = self.llm(prompt, max_tokens=512, stop=["</s>"])
+            text = output["choices"][0]["text"]
+            return text
+        except Exception as e:
+            self._log(f"❌ 生成失敗: {e}", "ERROR")
+            return f"生成失敗: {e}"
     def process_question(self, question: str) -> Tuple[str, str]:
         """Run retrieval, prompt building and generation for one question.

         The log history is reset at the start of every call.

         Returns:
             ``(sql, "生成成功")`` when a SQL block could be parsed from the
             model output, otherwise ``(raw response text, "生成失敗")``.
         """
         self.log_history = []
         self._log(f"⏰ 問題: {question}")

         self._log("🔍 尋找相似範例...")
         few_shots = self.find_most_similar(question, FEW_SHOT_EXAMPLES_COUNT)

         self._log("📝 建立 Prompt...")
         full_prompt = self._build_prompt(question, few_shots)

         self._log("🧠 開始生成...")
         raw_output = self.huggingface_api_call(full_prompt)

         extracted = parse_sql_from_response(raw_output)
         if not extracted:
             self._log("❌ 未能解析 SQL", "ERROR")
             return f"原始回應:\n{raw_output}", "生成失敗"
         self._log("✅ 成功解析 SQL")
         return extracted, "生成成功"
+# ==================== Gradio 介面 ====================
+text_to_sql_system = TextToSQLSystem()
+def process_query(q: str):
+    if not q.strip():
+        return "", "等待輸入", "請輸入問題"
+    sql, status = text_to_sql_system.process_question(q)
+    logs = "\n".join(text_to_sql_system.log_history)
+    return sql, status, logs
 # Gradio UI: a question box, a submit button and a status field in the left
 # column, the generated SQL in the right column, and a collapsed accordion
 # holding the processing log.
 with gr.Blocks(theme=gr.themes.Soft(), title="Text-to-SQL Assistant (GGUF)") as demo:
     gr.Markdown("# 📊 Text-to-SQL Assistant (GGUF)")
     with gr.Row():
         with gr.Column(scale=2):
             inp = gr.Textbox(lines=3, label="💬 問題")
             btn = gr.Button("🚀 生成 SQL")
             status = gr.Textbox(label="狀態", interactive=False)
         with gr.Column(scale=3):
             sql_out = gr.Code(label="🤖 SQL", language="sql")
     with gr.Accordion("日誌", open=False):
         logs = gr.Textbox(lines=15, label="處理日誌", interactive=False)
     # The button click and the textbox submit event share the same handler;
     # its three return values map to the SQL, status and log widgets in order.
     btn.click(process_query, inputs=[inp], outputs=[sql_out, status, logs])
     inp.submit(process_query, inputs=[inp], outputs=[sql_out, status, logs])
 if __name__ == "__main__":
     # Listen on all interfaces, port 7860.
     demo.launch(server_name="0.0.0.0", server_port=7860)