Spaces:

Paul720810
/

Softline-SQL-Assistant

Sleeping

App Files Community

Paul720810 committed on Sep 3, 2025

Commit

afb724a

verified ·

1 Parent(s): 352a657

Update app.py

Browse files

Files changed (1) hide show

app.py +113 -16

app.py CHANGED Viewed

@@ -16,9 +16,15 @@ HF_TOKEN = os.environ.get("HF_TOKEN", None) # 建議從環境變數讀取
 DATASET_REPO_ID = "Paul720810/Text-to-SQL-Softline"
 SIMILARITY_THRESHOLD = 0.65 # 適度提高閾值，確保檢索到的問題意圖更一致
 print("=" * 60)
 print("🤖 智能 Text-to-SQL 系統啟動中...")
 print(f"📊 模式: 讀取全部數據（來自 {DATASET_REPO_ID}）")
 print("=" * 60)
 # ==================== 獨立工具函數 (不依賴類別實例) ====================
@@ -98,6 +104,7 @@ class CompleteDataLoader:
             successful_loads = 0
             total_items = len(raw_dataset)
             for idx, item in enumerate(raw_dataset):
                 try:
@@ -105,22 +112,57 @@ class CompleteDataLoader:
                         user_content = item['messages'][0]['content']
                         assistant_content = item['messages'][1]['content']
                         question_match = re.search(r'指令:\s*(.*?)(?:\n|$)', user_content)
-                        question = question_match.group(1).strip() if question_match else user_content
                         sql_match = re.search(r'SQL查詢:\s*(.*?)(?:\n|$)', assistant_content, re.DOTALL)
-                        sql_query = sql_match.group(1).strip() if sql_match else assistant_content
                         sql_query = re.sub(r'```sql|```', '', sql_query).strip()
-                        if question and sql_query:  # 只加載有效的問答對
-                            self.questions.append(question)
-                            self.sql_answers.append(sql_query)
-                            successful_loads += 1
                 except Exception as e:
-                    print(f"跳過第 {idx} 項資料，錯誤: {e}")
                     continue
             print(f"數據加載完成: 成功載入 {successful_loads}/{total_items} 項")
             return successful_loads > 0
         except Exception as e:
             print(f"數據集加載失敗: {e}")
@@ -141,17 +183,44 @@ class CompleteDataLoader:
 class RetrievalSystem:
     def __init__(self):
         try:
-            self.embedder = SentenceTransformer('all-MiniLM-L6-v2', device='cpu')
             self.question_embeddings = None
         except Exception as e:
-            print(f"SentenceTransformer 模型加載失敗: {e}")
             self.embedder = None
     def compute_embeddings(self, questions: List[str]):
         if self.embedder and questions:
             print(f"正在為 {len(questions)} 個問題計算向量...")
-            self.question_embeddings = self.embedder.encode(questions, convert_to_tensor=True, show_progress_bar=True)
-            print("向量計算完成")
     def retrieve_similar(self, user_question: str, top_k: int = 1) -> List[Dict]:
         if self.embedder is None or self.question_embeddings is None: return []
@@ -336,8 +405,13 @@ def process_query(user_question: str) -> Tuple[str, str, str]:
     return sql_result, "✅ 處理完成", log_message
 with gr.Blocks(title="智慧Text-to-SQL系統", theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🚀 智慧 Text-to-SQL 系統 (進階修復版)")
     gr.Markdown("📊 **模式**: 結合「檢索驗證」與「意圖導向生成」，即使資料庫範本有誤也能提供準確查詢。")
     with gr.Row():
         question_input = gr.Textbox(
@@ -353,7 +427,7 @@ with gr.Blocks(title="智慧Text-to-SQL系統", theme=gr.themes.Soft()) as demo:
         status_output = gr.Textbox(label="🔍 執行狀態", interactive=False)
         log_output = gr.Textbox(label="📋 詳細日誌", lines=6, interactive=False)
-    # 改進的範例
     gr.Examples(
         examples=[
             "2024年每月完成多少份報告？",
@@ -363,7 +437,8 @@ with gr.Blocks(title="智慧Text-to-SQL系統", theme=gr.themes.Soft()) as demo:
             "A組昨天完成了多少個測試項目？",
             "2024年Q1期間評級為Fail且總金額超過10000的工作單"
         ],
-        inputs=question_input
     )
     # 綁定事件
@@ -382,7 +457,29 @@ with gr.Blocks(title="智慧Text-to-SQL系統", theme=gr.themes.Soft()) as demo:
 if __name__ == "__main__":
     if text_to_sql_system:
         print("Gradio 介面啟動中...")
-        demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
     else:
-        print("無法啟動 Gradio，因為系統初始化失敗。")

 DATASET_REPO_ID = "Paul720810/Text-to-SQL-Softline"
 SIMILARITY_THRESHOLD = 0.65 # 適度提高閾值，確保檢索到的問題意圖更一致
+# 雲端環境檢測
+IS_SPACES = os.environ.get("SPACE_ID") is not None
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 print("=" * 60)
 print("🤖 智能 Text-to-SQL 系統啟動中...")
 print(f"📊 模式: 讀取全部數據（來自 {DATASET_REPO_ID}）")
+print(f"🌐 環境: {'Hugging Face Spaces' if IS_SPACES else '本地環境'}")
+print(f"💻 設備: {DEVICE}")
 print("=" * 60)
 # ==================== 獨立工具函數 (不依賴類別實例) ====================
             successful_loads = 0
             total_items = len(raw_dataset)
+            skipped_reasons = {"empty_question": 0, "empty_sql": 0, "parse_error": 0, "invalid_format": 0}
             for idx, item in enumerate(raw_dataset):
                 try:
                         user_content = item['messages'][0]['content']
                         assistant_content = item['messages'][1]['content']
+                        # 改進的問題提取邏輯
                         question_match = re.search(r'指令:\s*(.*?)(?:\n|$)', user_content)
+                        if question_match:
+                            question = question_match.group(1).strip()
+                        else:
+                            # 如果沒有找到「指令:」格式，嘗試直接使用內容
+                            question = user_content.strip()
+                        # 改進的SQL提取邏輯
                         sql_match = re.search(r'SQL查詢:\s*(.*?)(?:\n|$)', assistant_content, re.DOTALL)
+                        if sql_match:
+                            sql_query = sql_match.group(1).strip()
+                        else:
+                            # 如果沒有找到「SQL查詢:」格式，嘗試提取SQL代碼塊
+                            sql_block_match = re.search(r'```sql\s*(.*?)\s*```', assistant_content, re.DOTALL)
+                            if sql_block_match:
+                                sql_query = sql_block_match.group(1).strip()
+                            else:
+                                sql_query = assistant_content.strip()
+                        # 清理SQL查詢
                         sql_query = re.sub(r'```sql|```', '', sql_query).strip()
+                        # 驗證數據質量
+                        if not question or len(question.strip()) < 3:
+                            skipped_reasons["empty_question"] += 1
+                            continue
+                        if not sql_query or len(sql_query.strip()) < 10:
+                            skipped_reasons["empty_sql"] += 1
+                            continue
+                        # 基本SQL驗證
+                        if "SELECT" not in sql_query.upper():
+                            skipped_reasons["invalid_format"] += 1
+                            continue
+                        self.questions.append(question)
+                        self.sql_answers.append(sql_query)
+                        successful_loads += 1
+                    else:
+                        skipped_reasons["invalid_format"] += 1
                 except Exception as e:
+                    skipped_reasons["parse_error"] += 1
+                    if idx < 5:  # 只顯示前5個錯誤
+                        print(f"跳過第 {idx} 項資料，錯誤: {e}")
                     continue
             print(f"數據加載完成: 成功載入 {successful_loads}/{total_items} 項")
+            print(f"跳過原因統計: 問題為空({skipped_reasons['empty_question']}) | SQL為空({skipped_reasons['empty_sql']}) | 格式錯誤({skipped_reasons['invalid_format']}) | 解析錯誤({skipped_reasons['parse_error']})")
             return successful_loads > 0
         except Exception as e:
             print(f"數據集加載失敗: {e}")
 class RetrievalSystem:
     def __init__(self):
         try:
+            # 根據環境選擇設備
+            device = DEVICE if 'DEVICE' in globals() else 'cpu'
+            print(f"🔧 初始化 SentenceTransformer (設備: {device})...")
+            self.embedder = SentenceTransformer('all-MiniLM-L6-v2', device=device)
             self.question_embeddings = None
+            print("✅ SentenceTransformer 模型加載成功")
         except Exception as e:
+            print(f"❌ SentenceTransformer 模型加載失敗: {e}")
             self.embedder = None
     def compute_embeddings(self, questions: List[str]):
         if self.embedder and questions:
             print(f"正在為 {len(questions)} 個問題計算向量...")
+            try:
+                # 雲端環境優化：分批處理以節省記憶體
+                batch_size = 32 if IS_SPACES else 64
+                self.question_embeddings = self.embedder.encode(
+                    questions,
+                    convert_to_tensor=True,
+                    show_progress_bar=True,
+                    batch_size=batch_size
+                )
+                print("向量計算完成")
+            except Exception as e:
+                print(f"向量計算失敗: {e}")
+                # 降級處理：使用更小的批次大小
+                try:
+                    print("嘗試使用較小批次大小重新計算...")
+                    self.question_embeddings = self.embedder.encode(
+                        questions,
+                        convert_to_tensor=True,
+                        show_progress_bar=True,
+                        batch_size=16
+                    )
+                    print("向量計算完成（降級模式）")
+                except Exception as e2:
+                    print(f"向量計算徹底失敗: {e2}")
+                    self.question_embeddings = None
     def retrieve_similar(self, user_question: str, top_k: int = 1) -> List[Dict]:
         if self.embedder is None or self.question_embeddings is None: return []
     return sql_result, "✅ 處理完成", log_message
 with gr.Blocks(title="智慧Text-to-SQL系統", theme=gr.themes.Soft()) as demo:
+    # 環境資訊顯示
+    env_info = f"🌐 運行環境: {'Hugging Face Spaces' if IS_SPACES else '本地環境'} | 💻 設備: {DEVICE}"
+    system_status = f"📊 已載入 {len(text_to_sql_system.data_loader.questions) if text_to_sql_system else 0} 個問答範例"
+    gr.Markdown("# 🚀 智慧 Text-to-SQL 系統 (雲端版)")
     gr.Markdown("📊 **模式**: 結合「檢索驗證」與「意圖導向生成」，即使資料庫範本有誤也能提供準確查詢。")
+    gr.Markdown(f"ℹ️ {env_info} | {system_status}")
     with gr.Row():
         question_input = gr.Textbox(
         status_output = gr.Textbox(label="🔍 執行狀態", interactive=False)
         log_output = gr.Textbox(label="📋 詳細日誌", lines=6, interactive=False)
+    # 雲端環境優化的範例
     gr.Examples(
         examples=[
             "2024年每月完成多少份報告？",
             "A組昨天完成了多少個測試項目？",
             "2024年Q1期間評級為Fail且總金額超過10000的工作單"
         ],
+        inputs=question_input,
+        label="💡 範例問題 (點擊試用)"
     )
     # 綁定事件
 if __name__ == "__main__":
     if text_to_sql_system:
         print("Gradio 介面啟動中...")
+        # 根據環境選擇啟動參數
+        if IS_SPACES:
+            # Hugging Face Spaces 環境
+            print("🌐 在 Hugging Face Spaces 環境中啟動...")
+            demo.launch(
+                server_name="0.0.0.0",
+                server_port=7860,
+                share=False,
+                show_error=True,
+                quiet=False
+            )
+        else:
+            # 本地環境
+            print("🏠 在本地環境中啟動...")
+            demo.launch(
+                server_name="127.0.0.1",
+                server_port=7860,
+                share=True,  # 本地環境可以選擇分享
+                show_error=True
+            )
     else:
+        print("❌ 無法啟動 Gradio，因為系統初始化失敗。")
+        if IS_SPACES:
+            print("💡 請檢查 Hugging Face Spaces 的環境變數設置。")