Spaces:

Paul720810
/

Softline-SQL-Assistant

Sleeping

App Files Files Community

Paul720810 committed on Sep 3, 2025

Commit

f327d97

verified ·

1 Parent(s): b69d84b

Update app.py

Browse files

Files changed (1) hide show

app.py +117 -50

app.py CHANGED Viewed

@@ -124,6 +124,33 @@ class CompleteDataLoader:
         self.sql_quality = []
         self.schema_data = {}
     def load_complete_dataset(self) -> bool:
         try:
             print(f"[{get_current_time()}] 正在加載完整數據集 '{DATASET_REPO_ID}'...")
@@ -131,7 +158,7 @@ class CompleteDataLoader:
             successful_loads = 0
             total_items = len(raw_dataset)
-            skipped_reasons = {"empty_question": 0, "empty_sql": 0, "parse_error": 0, "invalid_format": 0}
             for idx, item in enumerate(raw_dataset):
                 try:
@@ -142,59 +169,86 @@ class CompleteDataLoader:
                         # 多種問題提取策略
                         question = None
-                        # 策略1: 標準「指令:」格式
-                        question_match = re.search(r'指令:\s*(.*?)(?:\n|$)', user_content)
-                        if question_match:
-                            question = question_match.group(1).strip()
-                        # 策略2: 如果沒找到，嘗試提取最後一行非空內容
                         if not question:
-                            lines = [line.strip() for line in user_content.split('\n') if line.strip()]
                             if lines:
-                                question = lines[-1]
-                        # 策略3: 直接使用整個內容（作為最後手段）
                         if not question:
                             question = user_content.strip()
-                        # 多種SQL提取策略
-                        sql_query = None
-                        # 策略1: 標準「SQL查詢:」格式
-                        sql_match = re.search(r'SQL查詢:\s*(.*?)(?:\n|$)', assistant_content, re.DOTALL)
-                        if sql_match:
-                            sql_query = sql_match.group(1).strip()
-                        # 策略2: SQL代碼塊格式
                         if not sql_query:
-                            sql_block_match = re.search(r'```sql\s*(.*?)\s*```', assistant_content, re.DOTALL)
-                            if sql_block_match:
-                                sql_query = sql_block_match.group(1).strip()
-                        # 策略3: 查找任何包含 SELECT 的行
-                        if not sql_query:
-                            for line in assistant_content.split('\n'):
-                                if 'SELECT' in line.upper():
-                                    # 從這行開始提取到最後或到下個非SQL行
-                                    sql_lines = []
-                                    found_start = False
-                                    for l in assistant_content.split('\n'):
-                                        if 'SELECT' in l.upper():
-                                            found_start = True
-                                        if found_start:
-                                            if l.strip() and not l.strip().startswith('```'):
-                                                sql_lines.append(l)
-                                            elif l.strip() == '' and sql_lines:
-                                                continue
-                                            elif found_start and len(sql_lines) > 0:
-                                                break
-                                    if sql_lines:
-                                        sql_query = '\n'.join(sql_lines).strip()
-                                        break
-                        # 策略4: 如果還是沒找到，使用整個assistant內容
-                        if not sql_query:
-                            sql_query = assistant_content.strip()
                         # 清理SQL查詢
                         if sql_query:
@@ -202,6 +256,11 @@ class CompleteDataLoader:
                             sql_query = re.sub(r'^思考過程:.*?\n', '', sql_query, flags=re.MULTILINE).strip()
                             sql_query = re.sub(r'^SQL查詢:\s*', '', sql_query, flags=re.MULTILINE).strip()
                         # 數據質量驗證（降低標準以提高利用率）
                         if not question or len(question.strip()) < 3:
                             skipped_reasons["empty_question"] += 1
@@ -223,13 +282,16 @@ class CompleteDataLoader:
                         # 調試：顯示前幾個成功案例
                         if successful_loads <= 3:
-                            print(f"成功案例 {successful_loads}:")
-                            print(f"  問題: {question[:50]}...")
-                            print(f"  SQL: {sql_query[:50]}...")
                     else:
                         skipped_reasons["invalid_format"] += 1
                 except Exception as e:
                     skipped_reasons["parse_error"] += 1
                     if idx < 3:  # 只顯示前3個錯誤
@@ -237,7 +299,7 @@ class CompleteDataLoader:
                     continue
             print(f"數據加載完成: 成功載入 {successful_loads}/{total_items} 項")
-            print(f"跳過原因統計: 問題為空({skipped_reasons['empty_question']}) | SQL為空({skipped_reasons['empty_sql']}) | 格式錯誤({skipped_reasons['invalid_format']}) | 解析錯誤({skipped_reasons['parse_error']})")
             return successful_loads > 0
         except Exception as e:
             print(f"數據集加載失敗: {e}")
@@ -317,6 +379,11 @@ class CompleteTextToSQLSystem:
     def initialize_system(self):
         print("正在初始化完整數據系統...")
         self.data_loader.load_complete_dataset()
         self.data_loader.load_schema()
         if self.data_loader.questions:

         self.sql_quality = []
         self.schema_data = {}
+    def preview_dataset_structure(self, sample_size: int = 5) -> None:
+        """Print a short preview of the first few dataset samples for debugging.
+
+        Loads the ``'train'`` split of ``DATASET_REPO_ID`` (authenticating
+        with ``self.hf_token``) and, for up to ``sample_size`` samples, prints
+        the first 100 characters of the first two ``messages`` entries.  When
+        the second entry's content starts with ``{`` it is additionally parsed
+        as JSON and its top-level keys are printed.  Samples without a
+        ``messages`` field have their available keys printed instead.
+
+        Args:
+            sample_size: Maximum number of samples to preview.
+
+        Returns:
+            None.  Output goes to stdout only; any exception raised while
+            loading or inspecting the dataset is caught and reported as a
+            printed message rather than propagated.
+        """
+        try:
+            print(f"📋 預覽數據集結構 (前 {sample_size} 個範例)...")
+            raw_dataset = load_dataset(DATASET_REPO_ID, token=self.hf_token)['train']
+            for i in range(min(sample_size, len(raw_dataset))):
+                item = raw_dataset[i]
+                print(f"\n--- 範例 {i+1} ---")
+                if 'messages' in item:
+                    # NOTE(review): assumes messages[0] is the user turn and
+                    # messages[1] the assistant turn -- confirm against the dataset.
+                    user_content = item['messages'][0]['content']
+                    assistant_content = item['messages'][1]['content']
+                    print(f"User: {user_content[:100]}...")
+                    print(f"Assistant: {assistant_content[:100]}...")
+                    # Check whether the assistant reply looks like a JSON object.
+                    if assistant_content.strip().startswith('{'):
+                        try:
+                            json_data = json.loads(assistant_content)
+                            print(f"JSON Keys: {list(json_data.keys())}")
+                        # Only a malformed payload is expected here; a bare
+                        # ``except`` would also swallow KeyboardInterrupt/SystemExit.
+                        except json.JSONDecodeError:
+                            print("JSON解析失敗")
+                else:
+                    print(f"無messages字段: {list(item.keys())}")
+        except Exception as e:
+            # Best-effort debugging aid: report the failure and carry on.
+            print(f"預覽失敗: {e}")
     def load_complete_dataset(self) -> bool:
         try:
             print(f"[{get_current_time()}] 正在加載完整數據集 '{DATASET_REPO_ID}'...")
             successful_loads = 0
             total_items = len(raw_dataset)
+            skipped_reasons = {"empty_question": 0, "empty_sql": 0, "parse_error": 0, "invalid_format": 0, "json_parse_error": 0}
             for idx, item in enumerate(raw_dataset):
                 try:
                         # 多種問題提取策略
                         question = None
+                        # 策略1: 檢查是否為JSON格式的回應
+                        try:
+                            if assistant_content.strip().startswith('{'):
+                                json_data = json.loads(assistant_content)
+                                if 'sql' in json_data:
+                                    sql_query = json_data['sql']
+                                elif 'query' in json_data:
+                                    sql_query = json_data['query']
+                                else:
+                                    sql_query = None
+                                # 從JSON中提取問題 (如果有的話)
+                                if 'question' in json_data:
+                                    question = json_data['question']
+                                elif 'user_query' in json_data:
+                                    question = json_data['user_query']
+                            else:
+                                sql_query = None
+                        except json.JSONDecodeError:
+                            sql_query = None
+                        # 策略2: 標準「指令:」格式
+                        if not question:
+                            question_match = re.search(r'指令:\s*(.*?)(?:\n|$)', user_content)
+                            if question_match:
+                                question = question_match.group(1).strip()
+                        # 策略3: 如果沒找到，嘗試提取最後一行非空內容
                         if not question:
+                            lines = [line.strip() for line in user_content.split('\n') if line.strip() and not line.startswith('#')]
                             if lines:
+                                # 過濾掉看起來像標題的行
+                                for line in reversed(lines):
+                                    if not line.startswith('###') and '?' in line and len(line) > 5:
+                                        question = line
+                                        break
+                                if not question and lines:
+                                    question = lines[-1]
+                        # 策略4: 直接使用整個內容（作為最後手段）
                         if not question:
                             question = user_content.strip()
+                        # SQL提取邏輯（如果還沒從JSON中獲得）
                         if not sql_query:
+                            # 策略1: 標準「SQL查詢:」格式
+                            sql_match = re.search(r'SQL查詢:\s*(.*?)(?:\n|$)', assistant_content, re.DOTALL)
+                            if sql_match:
+                                sql_query = sql_match.group(1).strip()
+                            # 策略2: SQL代碼塊格式
+                            if not sql_query:
+                                sql_block_match = re.search(r'```sql\s*(.*?)\s*```', assistant_content, re.DOTALL)
+                                if sql_block_match:
+                                    sql_query = sql_block_match.group(1).strip()
+                            # 策略3: 查找任何包含 SELECT 的行
+                            if not sql_query:
+                                for line in assistant_content.split('\n'):
+                                    if 'SELECT' in line.upper():
+                                        # 從這行開始提取到最後或到下個非SQL行
+                                        sql_lines = []
+                                        found_start = False
+                                        for l in assistant_content.split('\n'):
+                                            if 'SELECT' in l.upper():
+                                                found_start = True
+                                            if found_start:
+                                                if l.strip() and not l.strip().startswith('```'):
+                                                    sql_lines.append(l)
+                                                elif l.strip() == '' and sql_lines:
+                                                    continue
+                                                elif found_start and len(sql_lines) > 0:
+                                                    break
+                                        if sql_lines:
+                                            sql_query = '\n'.join(sql_lines).strip()
+                                            break
+                            # 策略4: 如果還是沒找到，使用整個assistant內容
+                            if not sql_query:
+                                sql_query = assistant_content.strip()
                         # 清理SQL查詢
                         if sql_query:
                             sql_query = re.sub(r'^思考過程:.*?\n', '', sql_query, flags=re.MULTILINE).strip()
                             sql_query = re.sub(r'^SQL查詢:\s*', '', sql_query, flags=re.MULTILINE).strip()
+                        # 清理問題文本
+                        if question:
+                            question = re.sub(r'^###\s*', '', question).strip()
+                            question = re.sub(r'Your JSON Response.*', '', question).strip()
                         # 數據質量驗證（降低標準以提高利用率）
                         if not question or len(question.strip()) < 3:
                             skipped_reasons["empty_question"] += 1
                         # 調試：顯示前幾個成功案例
                         if successful_loads <= 3:
+                            print(f"✅ 成功案例 {successful_loads}:")
+                            print(f"  問題: {question[:80]}...")
+                            print(f"  SQL: {sql_query[:80]}...")
                     else:
                         skipped_reasons["invalid_format"] += 1
+                except json.JSONDecodeError as e:
+                    skipped_reasons["json_parse_error"] += 1
+                    continue
                 except Exception as e:
                     skipped_reasons["parse_error"] += 1
                     if idx < 3:  # 只顯示前3個錯誤
                     continue
             print(f"數據加載完成: 成功載入 {successful_loads}/{total_items} 項")
+            print(f"跳過原因統計: 問題為空({skipped_reasons['empty_question']}) | SQL為空({skipped_reasons['empty_sql']}) | 格式錯誤({skipped_reasons['invalid_format']}) | JSON錯誤({skipped_reasons['json_parse_error']}) | 解析錯誤({skipped_reasons['parse_error']})")
             return successful_loads > 0
         except Exception as e:
             print(f"數據集加載失敗: {e}")
     def initialize_system(self):
         print("正在初始化完整數據系統...")
+        # 首先預覽數據結構
+        self.data_loader.preview_dataset_structure(3)
+        # 然後加載數據
         self.data_loader.load_complete_dataset()
         self.data_loader.load_schema()
         if self.data_loader.questions: