Spaces:

Paul720810
/

Softline-SQL-Assistant

Sleeping

App Files Files Community

Paul720810 committed on Sep 4, 2025

Commit

9fbce62

verified ·

1 Parent(s): b27f7e5

Update app.py

Browse files

Files changed (1) hide show

app.py +95 -73

app.py CHANGED Viewed

@@ -6,25 +6,29 @@ import torch
 import numpy as np
 from datetime import datetime
 from datasets import load_dataset
-from sentence_transformers import SentenceTransformer, util
 from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 from typing import List, Dict, Tuple, Optional
 import faiss
 from functools import lru_cache
 # ==================== 配置區 ====================
 DATASET_REPO_ID = "Paul720810/Text-to-SQL-Softline"
 GGUF_REPO_ID = "Paul720810/gguf-models"
 GGUF_FILENAME = "qwen2.5-coder-1.5b-sql-finetuned.q4_k_m.gguf"
-FEW_SHOT_EXAMPLES_COUNT = 1  # 只使用1个最相关的范例
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print("=" * 60)
-print("🤖 Text-to-SQL (GGUF) 極速版系統啟動中...")
 print(f"📊 數據集: {DATASET_REPO_ID}")
-print(f"🤖 GGUF 模型: {GGUF_REPO_ID}/{GGUF_FILENAME}")
 print(f"💻 設備: {DEVICE}")
 print("=" * 60)
@@ -59,38 +63,45 @@ def parse_sql_from_response(response_text: str) -> Optional[str]:
 # ==================== Text-to-SQL 核心類 ====================
 class TextToSQLSystem:
-    def __init__(self, embed_model='all-MiniLM-L6-v2'):
         self.log_history = []
-        self._log("初始化極速系統...")
         self.query_cache = {}
-        # 並行載入所有組件
-        import threading
-        self.schema = {}
-        self.model = None
-        self.dataset = None
-        self.corpus_embeddings = None
-        self.faiss_index = None
-        self.llm = None
-        threads = [
-            threading.Thread(target=self._load_schema),
-            threading.Thread(target=self._load_embedding_model),
-            threading.Thread(target=self._load_gguf_model)
-        ]
-        for t in threads:
-            t.start()
-        for t in threads:
-            t.join()
-        self._log("✅ 所有組件載入完成")
     def _log(self, message: str, level: str = "INFO"):
         self.log_history.append(format_log(message, level))
         print(format_log(message, level))
-    def _load_schema(self):
         """載入數據庫結構"""
         try:
             schema_path = hf_hub_download(
@@ -99,51 +110,58 @@ class TextToSQLSystem:
                 repo_type="dataset"
             )
             with open(schema_path, "r", encoding="utf-8") as f:
-                self.schema = json.load(f)
                 self._log("✅ 數據庫結構載入完成")
         except Exception as e:
             self._log(f"❌ 載入 schema 失敗: {e}", "ERROR")
-    def _load_embedding_model(self):
-        """載入檢索模型和數據"""
         try:
-            self.model = SentenceTransformer('all-MiniLM-L6-v2', device=DEVICE)
             dataset = load_dataset(DATASET_REPO_ID, data_files="training_data.jsonl", split="train")
-            self.dataset = dataset
             corpus = [item['messages'][0]['content'] for item in dataset]
             self._log(f"正在編碼 {len(corpus)} 個問題...")
-            embeddings = self.model.encode(corpus, convert_to_tensor=True, device=DEVICE)
-            self.corpus_embeddings = embeddings
             # 建立 FAISS 索引
-            embeddings_np = embeddings.cpu().numpy()
-            self.faiss_index = faiss.IndexFlatIP(embeddings_np.shape[1])
-            self.faiss_index.add(embeddings_np)
-            self._log("✅ FAISS 向量索引建立完成")
-        except Exception as e:
-            self._log(f"❌ 載入檢索模型失敗: {e}", "ERROR")
-    def _load_gguf_model(self):
-        """載入 GGUF 模型"""
-        try:
-            model_path = hf_hub_download(
-                repo_id=GGUF_REPO_ID,
-                filename=GGUF_FILENAME,
-                repo_type="dataset"
-            )
-            self.llm = Llama(
-                model_path=model_path,
-                n_ctx=1024,
-                n_threads=os.cpu_count(),
-                n_batch=512,
-                n_gpu_layers=0,
-                verbose=False
-            )
-            self._log("✅ GGUF 模型載入完成")
         except Exception as e:
-            self._log(f"❌ 載入 GGUF 模型失敗: {e}", "ERROR")
     def _identify_relevant_tables(self, question: str) -> List[str]:
         """智能識別問題相關的表"""
@@ -171,13 +189,13 @@ class TextToSQLSystem:
         if not self.schema:
             return "無數據庫結構信息"
-        formatted = "相關表結構:\n"
         for table in table_names:
             if table in self.schema:
-                formatted += f"## {table}\n"
                 for col in self.schema[table][:6]:  # 只顯示前6個列
                     col_desc = col.get('description', '')
-                    formatted += f"- {col['name']} ({col['type']})"
                     if col_desc:
                         formatted += f" # {col_desc}"
                     formatted += "\n"
@@ -185,18 +203,17 @@ class TextToSQLSystem:
         return formatted
-    @lru_cache(maxsize=100)
     def find_most_similar(self, question: str, top_k: int) -> List[Dict]:
         """使用 FAISS 快速檢索相似問題"""
         if self.faiss_index is None or self.dataset is None:
             return []
         try:
-            q_emb = self.model.encode(question, convert_to_tensor=True, device=DEVICE)
-            q_emb_np = q_emb.cpu().numpy().reshape(1, -1)
             # FAISS 搜索
-            distances, indices = self.faiss_index.search(q_emb_np, min(top_k + 2, len(self.dataset)))
             results = []
             seen_questions = set()
@@ -205,6 +222,9 @@ class TextToSQLSystem:
                 if len(results) >= top_k:
                     break
                 item = self.dataset[idx]
                 q_content = item['messages'][0]['content']
                 a_content = item['messages'][1]['content']
@@ -236,15 +256,15 @@ class TextToSQLSystem:
         schema_str = self._format_relevant_schema(relevant_tables)
         # 極簡指令
-        system_instruction = "生成SQL查詢。只輸出```sql...```內容。確保SQL語法正確。"
         # 只顯示一個最有用的範例
         ex_str = ""
         if examples:
             best_example = examples[0]
-            ex_str = f"參考範例:\n問題: {best_example['question']}\nSQL: ```sql\n{best_example['sql']}\n```\n\n"
-        prompt = f"{system_instruction}\n{schema_str}\n{ex_str}問題: {user_q}\nSQL:"
         # 檢查長度，如果太長則進一步精簡
         if len(prompt) > 1500:
@@ -295,6 +315,8 @@ class TextToSQLSystem:
         # 檢索相似範例
         self._log("🔍 尋找相似範例...")
         examples = self.find_most_similar(question, FEW_SHOT_EXAMPLES_COUNT)
         # 建立提示詞
         self._log("📝 建立 Prompt...")
@@ -339,9 +361,9 @@ examples = [
     "A組昨天完成了多少個測試項目？"
 ]
-with gr.Blocks(theme=gr.themes.Soft(), title="Text-to-SQL 極速助手") as demo:
-    gr.Markdown("# ⚡ Text-to-SQL 極速助手 (GGUF)")
-    gr.Markdown("輸入自然語言問題，自動生成SQL查詢")
     with gr.Row():
         with gr.Column(scale=2):

 import numpy as np
 from datetime import datetime
 from datasets import load_dataset
 from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
 from typing import List, Dict, Tuple, Optional
 import faiss
 from functools import lru_cache
+# 使用 transformers 替代 sentence-transformers
+from transformers import AutoModel, AutoTokenizer
+import torch.nn.functional as F
 # ==================== Configuration ====================
 # Hugging Face repo id passed to load_dataset() for the few-shot example corpus.
 DATASET_REPO_ID = "Paul720810/Text-to-SQL-Softline"
 # Repo id and file name passed to hf_hub_download() to fetch the GGUF model file.
 GGUF_REPO_ID = "Paul720810/gguf-models"
 GGUF_FILENAME = "qwen2.5-coder-1.5b-sql-finetuned.q4_k_m.gguf"
+# Passed as top_k to find_most_similar() when retrieving few-shot examples.
+FEW_SHOT_EXAMPLES_COUNT = 1
 # "cuda" when torch reports an available CUDA device, otherwise "cpu".
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Default embedding model identifier for TextToSQLSystem.__init__.
+EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
 # Startup banner: printed at import time, before any model is loaded.
 print("=" * 60)
+print("🤖 Text-to-SQL 系統啟動中...")
 print(f"📊 數據集: {DATASET_REPO_ID}")
+print(f"🤖 嵌入模型: {EMBED_MODEL_NAME}")
 print(f"💻 設備: {DEVICE}")
 print("=" * 60)
 # ==================== Text-to-SQL 核心類 ====================
 class TextToSQLSystem:
+    def __init__(self, embed_model_name=EMBED_MODEL_NAME):
+        """Build the whole pipeline: embedding model, schema, example index, LLM.
+
+        Args:
+            embed_model_name: Model identifier passed to
+                ``AutoTokenizer.from_pretrained`` and
+                ``AutoModel.from_pretrained``.
+
+        The steps run sequentially. The tokenizer/model loading, the GGUF
+        download and the ``Llama`` construction are not wrapped in
+        ``try``/``except`` here, so any exception they raise propagates to
+        the caller.
+        """
         self.log_history = []
+        self._log("初始化系統...")
         self.query_cache = {}
+        # 1. Load the embedding model (via transformers)
+        self._log(f"載入嵌入模型: {embed_model_name}")
+        self.embed_tokenizer = AutoTokenizer.from_pretrained(embed_model_name)
+        self.embed_model = AutoModel.from_pretrained(embed_model_name)
+        if DEVICE == "cuda":
+            self.embed_model = self.embed_model.cuda()
+        # 2. Load the database schema
+        self.schema = self._load_schema()
+        # 3. Load the dataset and build the index
+        #    (both values are None when _load_and_index_dataset fails)
+        self.dataset, self.faiss_index = self._load_and_index_dataset()
+        # 4. Load the GGUF model
+        self._log("載入 GGUF 模型...")
+        # NOTE(review): the GGUF file is requested with repo_type="dataset";
+        # confirm that GGUF_REPO_ID really is a dataset repo.
+        model_path = hf_hub_download(
+            repo_id=GGUF_REPO_ID,
+            filename=GGUF_FILENAME,
+            repo_type="dataset"
+        )
+        self.llm = Llama(
+            model_path=model_path,
+            n_ctx=1024,
+            n_threads=os.cpu_count(),
+            n_batch=512,
+            verbose=False
+        )
+        self._log("✅ 系統初始化完成")
     def _log(self, message: str, level: str = "INFO"):
         """Append one formatted log entry to ``self.log_history`` and print it.
         Args:
             message: Text of the log entry.
             level: Severity label handed to ``format_log`` (default "INFO").
         """
         # Format once so the stored entry and the printed entry are the same
         # string (two separate format_log calls could differ if the formatter
         # is time-dependent).
         entry = format_log(message, level)
         self.log_history.append(entry)
         print(entry)
+    def _load_schema(self) -> Dict:
         """載入數據庫結構"""
         try:
             schema_path = hf_hub_download(
                 repo_type="dataset"
             )
             with open(schema_path, "r", encoding="utf-8") as f:
                 self._log("✅ 數據庫結構載入完成")
+                return json.load(f)
         except Exception as e:
             self._log(f"❌ 載入 schema 失敗: {e}", "ERROR")
+            return {}
+    def _encode_texts(self, texts):
+        """編碼文本為嵌入向量"""
+        if isinstance(texts, str):
+            texts = [texts]
+        inputs = self.embed_tokenizer(texts, padding=True, truncation=True,
+                                    return_tensors="pt", max_length=512)
+        if DEVICE == "cuda":
+            inputs = {k: v.cuda() for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = self.embed_model(**inputs)
+        # 使用平均池化
+        embeddings = outputs.last_hidden_state.mean(dim=1)
+        return embeddings.cpu()
+    def _load_and_index_dataset(self):
+        """Load the example dataset and build a FAISS index over its questions.
+
+        The text indexed for each item is ``item['messages'][0]['content']``.
+
+        Returns:
+            ``(dataset, index)`` on success, or ``(None, None)`` if any step
+            raises; the error is logged at level "ERROR" instead of being
+            re-raised.
+        """
         try:
             dataset = load_dataset(DATASET_REPO_ID, data_files="training_data.jsonl", split="train")
             corpus = [item['messages'][0]['content'] for item in dataset]
             self._log(f"正在編碼 {len(corpus)} 個問題...")
+            # Encode in batches (one progress log entry per batch)
+            embeddings_list = []
+            batch_size = 32
+            for i in range(0, len(corpus), batch_size):
+                batch_texts = corpus[i:i+batch_size]
+                batch_embeddings = self._encode_texts(batch_texts)
+                embeddings_list.append(batch_embeddings)
+                self._log(f"已編碼 {min(i+batch_size, len(corpus))}/{len(corpus)}")
+            all_embeddings = torch.cat(embeddings_list, dim=0).numpy()
             # Build the FAISS index
             # NOTE(review): IndexFlatIP ranks by raw inner product; confirm the
             # embeddings are unit-length if cosine similarity is intended.
+            index = faiss.IndexFlatIP(all_embeddings.shape[1])
+            index.add(all_embeddings.astype('float32'))
+            self._log("✅ 向量索引建立完成")
+            return dataset, index
         except Exception as e:
+            self._log(f"❌ 載入數據失敗: {e}", "ERROR")
+            return None, None
     def _identify_relevant_tables(self, question: str) -> List[str]:
         """智能識別問題相關的表"""
         if not self.schema:
             return "無數據庫結構信息"
+        formatted = "## 相關表結構:\n\n"
         for table in table_names:
             if table in self.schema:
+                formatted += f"### {table}\n"
                 for col in self.schema[table][:6]:  # 只顯示前6個列
                     col_desc = col.get('description', '')
+                    formatted += f"- **{col['name']}** ({col['type']})"
                     if col_desc:
                         formatted += f" # {col_desc}"
                     formatted += "\n"
         return formatted
     def find_most_similar(self, question: str, top_k: int) -> List[Dict]:
         """使用 FAISS 快速檢索相似問題"""
         if self.faiss_index is None or self.dataset is None:
             return []
         try:
+            # 編碼問題
+            q_embedding = self._encode_texts([question]).numpy().astype('float32')
             # FAISS 搜索
+            distances, indices = self.faiss_index.search(q_embedding, min(top_k + 2, len(self.dataset)))
             results = []
             seen_questions = set()
                 if len(results) >= top_k:
                     break
+                if idx >= len(self.dataset):  # 確保索引有效
+                    continue
                 item = self.dataset[idx]
                 q_content = item['messages'][0]['content']
                 a_content = item['messages'][1]['content']
         schema_str = self._format_relevant_schema(relevant_tables)
         # 極簡指令
+        system_instruction = "你是一位SQL專家。請生成準確的SQLite查詢語句。只輸出```sql...```內容。"
         # 只顯示一個最有用的範例
         ex_str = ""
         if examples:
             best_example = examples[0]
+            ex_str = f"## 參考範例:\n問題: {best_example['question']}\nSQL: ```sql\n{best_example['sql']}\n```\n\n"
+        prompt = f"{system_instruction}\n\n{schema_str}\n{ex_str}## 當前問題:\n{user_q}\n\n## SQL查詢:"
         # 檢查長度，如果太長則進一步精簡
         if len(prompt) > 1500:
         # 檢索相似範例
         self._log("🔍 尋找相似範例...")
         examples = self.find_most_similar(question, FEW_SHOT_EXAMPLES_COUNT)
+        if examples:
+            self._log(f"✅ 找到 {len(examples)} 個相似範例")
         # 建立提示詞
         self._log("📝 建立 Prompt...")
     "A組昨天完成了多少個測試項目？"
 ]
+with gr.Blocks(theme=gr.themes.Soft(), title="Text-to-SQL 智能助手") as demo:
+    gr.Markdown("# ⚡ Text-to-SQL 智能助手")
+    gr.Markdown("輸入自然語言問題，自動生成SQL查詢語句")
     with gr.Row():
         with gr.Column(scale=2):