Spaces:

Paul720810
/

Softline-SQL-Assistant

Sleeping

App Files Community

Paul720810 committed on Sep 3, 2025

Commit

0481392

verified ·

1 Parent(s): f327d97

Update app.py

Browse files

Files changed (1) hide show

app.py +161 -34

app.py CHANGED Viewed

@@ -136,8 +136,22 @@ class CompleteDataLoader:
                 if 'messages' in item:
                     user_content = item['messages'][0]['content']
                     assistant_content = item['messages'][1]['content']
-                    print(f"User: {user_content[:100]}...")
-                    print(f"Assistant: {assistant_content[:100]}...")
                     # 檢查是否為JSON格式
                     if assistant_content.strip().startswith('{'):
@@ -148,9 +162,46 @@ class CompleteDataLoader:
                             print("JSON解析失敗")
                 else:
                     print(f"無messages字段: {list(item.keys())}")
         except Exception as e:
             print(f"預覽失敗: {e}")
     def load_complete_dataset(self) -> bool:
         try:
             print(f"[{get_current_time()}] 正在加載完整數據集 '{DATASET_REPO_ID}'...")
@@ -214,74 +265,112 @@ class CompleteDataLoader:
                         # SQL提取邏輯（如果還沒從JSON中獲得）
                         if not sql_query:
-                            # 策略1: 標準「SQL查詢:」格式
-                            sql_match = re.search(r'SQL查詢:\s*(.*?)(?:\n|$)', assistant_content, re.DOTALL)
-                            if sql_match:
-                                sql_query = sql_match.group(1).strip()
-                            # 策略2: SQL代碼塊格式
                             if not sql_query:
-                                sql_block_match = re.search(r'```sql\s*(.*?)\s*```', assistant_content, re.DOTALL)
-                                if sql_block_match:
-                                    sql_query = sql_block_match.group(1).strip()
-                            # 策略3: 查找任何包含 SELECT 的行
                             if not sql_query:
-                                for line in assistant_content.split('\n'):
-                                    if 'SELECT' in line.upper():
-                                        # 從這行開始提取到最後或到下個非SQL行
-                                        sql_lines = []
-                                        found_start = False
-                                        for l in assistant_content.split('\n'):
-                                            if 'SELECT' in l.upper():
-                                                found_start = True
-                                            if found_start:
-                                                if l.strip() and not l.strip().startswith('```'):
-                                                    sql_lines.append(l)
-                                                elif l.strip() == '' and sql_lines:
-                                                    continue
-                                                elif found_start and len(sql_lines) > 0:
-                                                    break
-                                        if sql_lines:
-                                            sql_query = '\n'.join(sql_lines).strip()
                                             break
-                            # 策略4: 如果還是沒找到，使用整個assistant內容
                             if not sql_query:
-                                sql_query = assistant_content.strip()
                         # 清理SQL查詢
                         if sql_query:
                             sql_query = re.sub(r'```sql|```', '', sql_query).strip()
                             sql_query = re.sub(r'^思考過程:.*?\n', '', sql_query, flags=re.MULTILINE).strip()
                             sql_query = re.sub(r'^SQL查詢:\s*', '', sql_query, flags=re.MULTILINE).strip()
                         # 清理問題文本
                         if question:
                             question = re.sub(r'^###\s*', '', question).strip()
                             question = re.sub(r'Your JSON Response.*', '', question).strip()
                         # 數據質量驗證（降低標準以提高利用率）
                         if not question or len(question.strip()) < 3:
                             skipped_reasons["empty_question"] += 1
                             continue
-                        if not sql_query or len(sql_query.strip()) < 5:  # 降低最小長度要求
                             skipped_reasons["empty_sql"] += 1
                             continue
                         # 更寬鬆的SQL驗證
                         sql_upper = sql_query.upper()
-                        if "SELECT" not in sql_upper and "WITH" not in sql_upper:
                             skipped_reasons["invalid_format"] += 1
                             continue
                         self.questions.append(question)
                         self.sql_answers.append(sql_query)
                         successful_loads += 1
-                        # 調試：顯示前幾個成功案例
-                        if successful_loads <= 3:
                             print(f"✅ 成功案例 {successful_loads}:")
                             print(f"  問題: {question[:80]}...")
                             print(f"  SQL: {sql_query[:80]}...")
@@ -377,12 +466,50 @@ class CompleteTextToSQLSystem:
         self.retrieval_system = RetrievalSystem()
         self.initialize_system()
     def initialize_system(self):
         print("正在初始化完整數據系統...")
         # 首先預覽數據結構
         self.data_loader.preview_dataset_structure(3)
         # 然後加載數據
         self.data_loader.load_complete_dataset()
         self.data_loader.load_schema()

                 if 'messages' in item:
                     user_content = item['messages'][0]['content']
                     assistant_content = item['messages'][1]['content']
+                    print(f"User: {user_content[:120]}...")
+                    print(f"Assistant: {assistant_content[:120]}...")
+                    # 檢查SQL代碼塊
+                    sql_block_match = re.search(r'```sql\s*(.*?)\s*```', assistant_content, re.DOTALL)
+                    if sql_block_match:
+                        sql_content = sql_block_match.group(1).strip()
+                        print(f"✅ 找到SQL代碼塊: {sql_content[:60]}...")
+                    else:
+                        print("❌ 未找到SQL代碼塊")
+                        # 檢查是否有其他SQL格式
+                        if 'SELECT' in assistant_content.upper():
+                            print("⚠️ 但包含SELECT關鍵字")
+                        if 'SQL查詢:' in assistant_content:
+                            print("⚠️ 但包含'SQL查詢:'標記")
                     # 檢查是否為JSON格式
                     if assistant_content.strip().startswith('{'):
                             print("JSON解析失敗")
                 else:
                     print(f"無messages字段: {list(item.keys())}")
+            print(f"\n總數據量: {len(raw_dataset)} 項")
         except Exception as e:
             print(f"預覽失敗: {e}")
+    def diagnose_data_issues(self, sample_size: int = 20) -> None:
+        """診斷數據問題"""
+        try:
+            print(f"🔍 診斷數據問題 (檢查前 {sample_size} 個可能有問題的項目)...")
+            raw_dataset = load_dataset(DATASET_REPO_ID, token=self.hf_token)['train']
+            issues_found = {"no_sql_block": 0, "empty_assistant": 0, "parsing_error": 0, "other": 0}
+            for i in range(min(sample_size, len(raw_dataset))):
+                item = raw_dataset[i]
+                try:
+                    if 'messages' in item and len(item['messages']) >= 2:
+                        assistant_content = item['messages'][1]['content']
+                        # 檢查SQL代碼塊
+                        sql_block_match = re.search(r'```sql\s*(.*?)\s*```', assistant_content, re.DOTALL)
+                        if not sql_block_match:
+                            issues_found["no_sql_block"] += 1
+                            if issues_found["no_sql_block"] <= 3:
+                                print(f"\n❌ 無SQL代碼塊 #{i}: {assistant_content[:200]}...")
+                        if not assistant_content.strip():
+                            issues_found["empty_assistant"] += 1
+                except Exception as e:
+                    issues_found["parsing_error"] += 1
+                    if issues_found["parsing_error"] <= 2:
+                        print(f"\n💥 解析錯誤 #{i}: {e}")
+            print(f"\n📊 診斷結果:")
+            for issue, count in issues_found.items():
+                print(f"  {issue}: {count}")
+        except Exception as e:
+            print(f"診斷失敗: {e}")
     def load_complete_dataset(self) -> bool:
         try:
             print(f"[{get_current_time()}] 正在加載完整數據集 '{DATASET_REPO_ID}'...")
                         # SQL提取邏輯（如果還沒從JSON中獲得）
                         if not sql_query:
+                            # 策略1: SQL代碼塊格式（最常見）
+                            sql_block_match = re.search(r'```sql\s*(.*?)\s*```', assistant_content, re.DOTALL)
+                            if sql_block_match:
+                                sql_query = sql_block_match.group(1).strip()
+                            # 策略2: 標準「SQL查詢:」格式
                             if not sql_query:
+                                sql_match = re.search(r'SQL查詢:\s*(.*?)(?:\n\n|$)', assistant_content, re.DOTALL)
+                                if sql_match:
+                                    sql_query = sql_match.group(1).strip()
+                                    # 清理可能的代碼塊標記
+                                    sql_query = re.sub(r'```sql|```', '', sql_query).strip()
+                            # 策略3: 查找任何包含 SELECT 或 WITH 的多行內容
                             if not sql_query:
+                                lines = assistant_content.split('\n')
+                                sql_lines = []
+                                in_sql_block = False
+                                for line in lines:
+                                    line_upper = line.upper().strip()
+                                    # 開始條件：找到SQL關鍵字
+                                    if not in_sql_block and (line_upper.startswith('SELECT') or line_upper.startswith('WITH')):
+                                        in_sql_block = True
+                                        sql_lines.append(line)
+                                    # 繼續條件：在SQL塊中
+                                    elif in_sql_block:
+                                        # 結束條件：空行或看起來不像SQL的行
+                                        if not line.strip():
+                                            break
+                                        elif line.strip().startswith('```') and len(sql_lines) > 0:
+                                            break
+                                        elif line_upper.startswith('思考過程:') or line_upper.startswith('上下文:'):
                                             break
+                                        else:
+                                            sql_lines.append(line)
+                                if sql_lines:
+                                    sql_query = '\n'.join(sql_lines).strip()
+                            # 策略4: 如果還是沒找到，嘗試更寬鬆的匹配
                             if not sql_query:
+                                # 查找所有可能的SQL片段
+                                sql_patterns = [
+                                    r'(SELECT.*?FROM.*?)(?:\n\n|$)',
+                                    r'(WITH.*?SELECT.*?)(?:\n\n|$)',
+                                    r'SQL查詢:\s*\n(.*?)(?:\n\n|$)'
+                                ]
+                                for pattern in sql_patterns:
+                                    match = re.search(pattern, assistant_content, re.DOTALL | re.IGNORECASE)
+                                    if match:
+                                        candidate = match.group(1).strip()
+                                        # 基本驗證
+                                        if len(candidate) > 10 and ('SELECT' in candidate.upper() or 'WITH' in candidate.upper()):
+                                            sql_query = candidate
+                                            break
                         # 清理SQL查詢
                         if sql_query:
+                            # 移除各種標記
                             sql_query = re.sub(r'```sql|```', '', sql_query).strip()
                             sql_query = re.sub(r'^思考過程:.*?\n', '', sql_query, flags=re.MULTILINE).strip()
                             sql_query = re.sub(r'^SQL查詢:\s*', '', sql_query, flags=re.MULTILINE).strip()
+                            # 移除多餘的空行
+                            sql_query = re.sub(r'\n\s*\n', '\n', sql_query).strip()
+                            # 確保SQL完整性 - 如果以分號結尾且內容合理，保留
+                            if not sql_query.endswith(';') and len(sql_query) > 20:
+                                # 檢查是否看起來像完整的SQL
+                                if 'FROM' in sql_query.upper() and sql_query.count('(') == sql_query.count(')'):
+                                    sql_query += ';'
                         # 清理問題文本
                         if question:
                             question = re.sub(r'^###\s*', '', question).strip()
                             question = re.sub(r'Your JSON Response.*', '', question).strip()
+                            # 移除多餘的上下文信息
+                            question = re.sub(r'\n上下文:.*', '', question, flags=re.DOTALL).strip()
                         # 數據質量驗證（降低標準以提高利用率）
                         if not question or len(question.strip()) < 3:
                             skipped_reasons["empty_question"] += 1
                             continue
+                        if not sql_query or len(sql_query.strip()) < 8:  # 進一步降低最小長度要求
                             skipped_reasons["empty_sql"] += 1
+                            if idx < 10:  # 調試：顯示前10個被跳過的SQL為空的案例
+                                print(f"SQL為空案例 {idx}: 原始助手回應前100字符: {assistant_content[:100]}...")
                             continue
                         # 更寬鬆的SQL驗證
                         sql_upper = sql_query.upper()
+                        if "SELECT" not in sql_upper and "WITH" not in sql_upper and "CREATE" not in sql_upper:
                             skipped_reasons["invalid_format"] += 1
+                            if idx < 5:  # 調試：顯示前5個格式錯誤的案例
+                                print(f"格式錯誤案例 {idx}: SQL內容: {sql_query[:100]}...")
                             continue
                         self.questions.append(question)
                         self.sql_answers.append(sql_query)
                         successful_loads += 1
+                        # 調試：顯示前5個成功案例
+                        if successful_loads <= 5:
                             print(f"✅ 成功案例 {successful_loads}:")
                             print(f"  問題: {question[:80]}...")
                             print(f"  SQL: {sql_query[:80]}...")
         self.retrieval_system = RetrievalSystem()
         self.initialize_system()
+    def diagnose_data_issues(self, sample_size: int = 20) -> None:
+        """診斷數據問題"""
+        try:
+            print(f"🔍 診斷數據問題 (檢查前 {sample_size} 個可能有問題的項目)...")
+            raw_dataset = load_dataset(DATASET_REPO_ID, token=self.hf_token)['train']
+            issues_found = {"no_sql_block": 0, "empty_assistant": 0, "parsing_error": 0, "other": 0}
+            for i in range(min(sample_size, len(raw_dataset))):
+                item = raw_dataset[i]
+                try:
+                    if 'messages' in item and len(item['messages']) >= 2:
+                        assistant_content = item['messages'][1]['content']
+                        # 檢查SQL代碼塊
+                        sql_block_match = re.search(r'```sql\s*(.*?)\s*```', assistant_content, re.DOTALL)
+                        if not sql_block_match:
+                            issues_found["no_sql_block"] += 1
+                            if issues_found["no_sql_block"] <= 3:
+                                print(f"\n❌ 無SQL代碼塊 #{i}: {assistant_content[:200]}...")
+                        if not assistant_content.strip():
+                            issues_found["empty_assistant"] += 1
+                except Exception as e:
+                    issues_found["parsing_error"] += 1
+                    if issues_found["parsing_error"] <= 2:
+                        print(f"\n💥 解析錯誤 #{i}: {e}")
+            print(f"\n📊 診斷結果:")
+            for issue, count in issues_found.items():
+                print(f"  {issue}: {count}")
+        except Exception as e:
+            print(f"診斷失敗: {e}")
     def initialize_system(self):
         print("正在初始化完整數據系統...")
         # 首先預覽數據結構
         self.data_loader.preview_dataset_structure(3)
+        # 診斷數據問題
+        self.data_loader.diagnose_data_issues(10)
         # 然後加載數據
         self.data_loader.load_complete_dataset()
         self.data_loader.load_schema()