Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -124,6 +124,33 @@ class CompleteDataLoader:
|
|
| 124 |
self.sql_quality = []
|
| 125 |
self.schema_data = {}
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
def load_complete_dataset(self) -> bool:
|
| 128 |
try:
|
| 129 |
print(f"[{get_current_time()}] 正在加載完整數據集 '{DATASET_REPO_ID}'...")
|
|
@@ -131,7 +158,7 @@ class CompleteDataLoader:
|
|
| 131 |
|
| 132 |
successful_loads = 0
|
| 133 |
total_items = len(raw_dataset)
|
| 134 |
-
skipped_reasons = {"empty_question": 0, "empty_sql": 0, "parse_error": 0, "invalid_format": 0}
|
| 135 |
|
| 136 |
for idx, item in enumerate(raw_dataset):
|
| 137 |
try:
|
|
@@ -142,59 +169,86 @@ class CompleteDataLoader:
|
|
| 142 |
# 多種問題提取策略
|
| 143 |
question = None
|
| 144 |
|
| 145 |
-
# 策略1:
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
|
| 150 |
-
# 策略
|
| 151 |
if not question:
|
| 152 |
-
lines = [line.strip() for line in user_content.split('\n') if line.strip()]
|
| 153 |
if lines:
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
|
| 156 |
-
# 策略
|
| 157 |
if not question:
|
| 158 |
question = user_content.strip()
|
| 159 |
|
| 160 |
-
#
|
| 161 |
-
sql_query = None
|
| 162 |
-
|
| 163 |
-
# 策略1: 標準「SQL查詢:」格式
|
| 164 |
-
sql_match = re.search(r'SQL查詢:\s*(.*?)(?:\n|$)', assistant_content, re.DOTALL)
|
| 165 |
-
if sql_match:
|
| 166 |
-
sql_query = sql_match.group(1).strip()
|
| 167 |
-
|
| 168 |
-
# 策略2: SQL代碼塊格式
|
| 169 |
if not sql_query:
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 198 |
|
| 199 |
# 清理SQL查詢
|
| 200 |
if sql_query:
|
|
@@ -202,6 +256,11 @@ class CompleteDataLoader:
|
|
| 202 |
sql_query = re.sub(r'^思考過程:.*?\n', '', sql_query, flags=re.MULTILINE).strip()
|
| 203 |
sql_query = re.sub(r'^SQL查詢:\s*', '', sql_query, flags=re.MULTILINE).strip()
|
| 204 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
# 數據質量驗證(降低標準以提高利用率)
|
| 206 |
if not question or len(question.strip()) < 3:
|
| 207 |
skipped_reasons["empty_question"] += 1
|
|
@@ -223,13 +282,16 @@ class CompleteDataLoader:
|
|
| 223 |
|
| 224 |
# 調試:顯示前幾個成功案例
|
| 225 |
if successful_loads <= 3:
|
| 226 |
-
print(f"成功案例 {successful_loads}:")
|
| 227 |
-
print(f" 問題: {question[:
|
| 228 |
-
print(f" SQL: {sql_query[:
|
| 229 |
|
| 230 |
else:
|
| 231 |
skipped_reasons["invalid_format"] += 1
|
| 232 |
|
|
|
|
|
|
|
|
|
|
| 233 |
except Exception as e:
|
| 234 |
skipped_reasons["parse_error"] += 1
|
| 235 |
if idx < 3: # 只顯示前3個錯誤
|
|
@@ -237,7 +299,7 @@ class CompleteDataLoader:
|
|
| 237 |
continue
|
| 238 |
|
| 239 |
print(f"數據加載完成: 成功載入 {successful_loads}/{total_items} 項")
|
| 240 |
-
print(f"跳過原因統計: 問題為空({skipped_reasons['empty_question']}) | SQL為空({skipped_reasons['empty_sql']}) | 格式錯誤({skipped_reasons['invalid_format']}) | 解析錯誤({skipped_reasons['parse_error']})")
|
| 241 |
return successful_loads > 0
|
| 242 |
except Exception as e:
|
| 243 |
print(f"數據集加載失敗: {e}")
|
|
@@ -317,6 +379,11 @@ class CompleteTextToSQLSystem:
|
|
| 317 |
|
| 318 |
def initialize_system(self):
|
| 319 |
print("正在初始化完整數據系統...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
self.data_loader.load_complete_dataset()
|
| 321 |
self.data_loader.load_schema()
|
| 322 |
if self.data_loader.questions:
|
|
|
|
| 124 |
self.sql_quality = []
|
| 125 |
self.schema_data = {}
|
| 126 |
|
| 127 |
+
def preview_dataset_structure(self, sample_size: int = 5) -> None:
    """Print the first few dataset records to help debug their structure.

    Loads the ``train`` split of ``DATASET_REPO_ID`` and, for at most
    ``sample_size`` records, prints the first 100 characters of the
    ``content`` of the first two ``messages`` entries (assumed to be the
    user and assistant turns -- confirm against the dataset). When the
    second entry starts with ``{``, it is parsed as JSON and its top-level
    keys are printed.

    Args:
        sample_size: Maximum number of records to preview.

    Returns:
        None. All output goes to stdout. Any failure is reported there
        instead of being raised.
    """
    try:
        print(f"📋 預覽數據集結構 (前 {sample_size} 個範例)...")
        raw_dataset = load_dataset(DATASET_REPO_ID, token=self.hf_token)['train']

        for i in range(min(sample_size, len(raw_dataset))):
            item = raw_dataset[i]
            print(f"\n--- 範例 {i+1} ---")

            # Records without a 'messages' field: show which keys exist instead.
            if 'messages' not in item:
                print(f"無messages字段: {list(item.keys())}")
                continue

            user_content = item['messages'][0]['content']
            assistant_content = item['messages'][1]['content']
            print(f"User: {user_content[:100]}...")
            print(f"Assistant: {assistant_content[:100]}...")

            # Check whether the assistant reply looks like a JSON object.
            if assistant_content.strip().startswith('{'):
                try:
                    json_data = json.loads(assistant_content)
                except json.JSONDecodeError:
                    # Only a genuine parse failure is reported here; a bare
                    # ``except`` would also hide KeyboardInterrupt/SystemExit.
                    print("JSON解析失敗")
                else:
                    print(f"JSON Keys: {list(json_data.keys())}")
    except Exception as e:
        # Best-effort debugging aid: report the failure rather than raise.
        print(f"預覽失敗: {e}")
|
| 153 |
+
|
| 154 |
def load_complete_dataset(self) -> bool:
|
| 155 |
try:
|
| 156 |
print(f"[{get_current_time()}] 正在加載完整數據集 '{DATASET_REPO_ID}'...")
|
|
|
|
| 158 |
|
| 159 |
successful_loads = 0
|
| 160 |
total_items = len(raw_dataset)
|
| 161 |
+
skipped_reasons = {"empty_question": 0, "empty_sql": 0, "parse_error": 0, "invalid_format": 0, "json_parse_error": 0}
|
| 162 |
|
| 163 |
for idx, item in enumerate(raw_dataset):
|
| 164 |
try:
|
|
|
|
| 169 |
# 多種問題提取策略
|
| 170 |
question = None
|
| 171 |
|
| 172 |
+
# 策略1: 檢查是否為JSON格式的回應
|
| 173 |
+
try:
|
| 174 |
+
if assistant_content.strip().startswith('{'):
|
| 175 |
+
json_data = json.loads(assistant_content)
|
| 176 |
+
if 'sql' in json_data:
|
| 177 |
+
sql_query = json_data['sql']
|
| 178 |
+
elif 'query' in json_data:
|
| 179 |
+
sql_query = json_data['query']
|
| 180 |
+
else:
|
| 181 |
+
sql_query = None
|
| 182 |
+
|
| 183 |
+
# 從JSON中提取問題 (如果有的話)
|
| 184 |
+
if 'question' in json_data:
|
| 185 |
+
question = json_data['question']
|
| 186 |
+
elif 'user_query' in json_data:
|
| 187 |
+
question = json_data['user_query']
|
| 188 |
+
else:
|
| 189 |
+
sql_query = None
|
| 190 |
+
except json.JSONDecodeError:
|
| 191 |
+
sql_query = None
|
| 192 |
+
|
| 193 |
+
# 策略2: 標準「指令:」格式
|
| 194 |
+
if not question:
|
| 195 |
+
question_match = re.search(r'指令:\s*(.*?)(?:\n|$)', user_content)
|
| 196 |
+
if question_match:
|
| 197 |
+
question = question_match.group(1).strip()
|
| 198 |
|
| 199 |
+
# 策略3: 如果沒找到,嘗試提取最後一行非空內容
|
| 200 |
if not question:
|
| 201 |
+
lines = [line.strip() for line in user_content.split('\n') if line.strip() and not line.startswith('#')]
|
| 202 |
if lines:
|
| 203 |
+
# 過濾掉看起來像標題的行
|
| 204 |
+
for line in reversed(lines):
|
| 205 |
+
if not line.startswith('###') and '?' in line and len(line) > 5:
|
| 206 |
+
question = line
|
| 207 |
+
break
|
| 208 |
+
if not question and lines:
|
| 209 |
+
question = lines[-1]
|
| 210 |
|
| 211 |
+
# 策略4: 直接使用整個內容(作為最後手段)
|
| 212 |
if not question:
|
| 213 |
question = user_content.strip()
|
| 214 |
|
| 215 |
+
# SQL提取邏輯(如果還沒從JSON中獲得)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 216 |
if not sql_query:
|
| 217 |
+
# 策略1: 標準「SQL查詢:」格式
|
| 218 |
+
sql_match = re.search(r'SQL查詢:\s*(.*?)(?:\n|$)', assistant_content, re.DOTALL)
|
| 219 |
+
if sql_match:
|
| 220 |
+
sql_query = sql_match.group(1).strip()
|
| 221 |
+
|
| 222 |
+
# 策略2: SQL代碼塊格式
|
| 223 |
+
if not sql_query:
|
| 224 |
+
sql_block_match = re.search(r'```sql\s*(.*?)\s*```', assistant_content, re.DOTALL)
|
| 225 |
+
if sql_block_match:
|
| 226 |
+
sql_query = sql_block_match.group(1).strip()
|
| 227 |
+
|
| 228 |
+
# 策略3: 查找任何包含 SELECT 的行
|
| 229 |
+
if not sql_query:
|
| 230 |
+
for line in assistant_content.split('\n'):
|
| 231 |
+
if 'SELECT' in line.upper():
|
| 232 |
+
# 從這行開始提取到最後或到下個非SQL行
|
| 233 |
+
sql_lines = []
|
| 234 |
+
found_start = False
|
| 235 |
+
for l in assistant_content.split('\n'):
|
| 236 |
+
if 'SELECT' in l.upper():
|
| 237 |
+
found_start = True
|
| 238 |
+
if found_start:
|
| 239 |
+
if l.strip() and not l.strip().startswith('```'):
|
| 240 |
+
sql_lines.append(l)
|
| 241 |
+
elif l.strip() == '' and sql_lines:
|
| 242 |
+
continue
|
| 243 |
+
elif found_start and len(sql_lines) > 0:
|
| 244 |
+
break
|
| 245 |
+
if sql_lines:
|
| 246 |
+
sql_query = '\n'.join(sql_lines).strip()
|
| 247 |
+
break
|
| 248 |
+
|
| 249 |
+
# 策略4: 如果還是沒找到,使用整個assistant內容
|
| 250 |
+
if not sql_query:
|
| 251 |
+
sql_query = assistant_content.strip()
|
| 252 |
|
| 253 |
# 清理SQL查詢
|
| 254 |
if sql_query:
|
|
|
|
| 256 |
sql_query = re.sub(r'^思考過程:.*?\n', '', sql_query, flags=re.MULTILINE).strip()
|
| 257 |
sql_query = re.sub(r'^SQL查詢:\s*', '', sql_query, flags=re.MULTILINE).strip()
|
| 258 |
|
| 259 |
+
# 清理問題文本
|
| 260 |
+
if question:
|
| 261 |
+
question = re.sub(r'^###\s*', '', question).strip()
|
| 262 |
+
question = re.sub(r'Your JSON Response.*', '', question).strip()
|
| 263 |
+
|
| 264 |
# 數據質量驗證(降低標準以提高利用率)
|
| 265 |
if not question or len(question.strip()) < 3:
|
| 266 |
skipped_reasons["empty_question"] += 1
|
|
|
|
| 282 |
|
| 283 |
# 調試:顯示前幾個成功案例
|
| 284 |
if successful_loads <= 3:
|
| 285 |
+
print(f"✅ 成功案例 {successful_loads}:")
|
| 286 |
+
print(f" 問題: {question[:80]}...")
|
| 287 |
+
print(f" SQL: {sql_query[:80]}...")
|
| 288 |
|
| 289 |
else:
|
| 290 |
skipped_reasons["invalid_format"] += 1
|
| 291 |
|
| 292 |
+
except json.JSONDecodeError as e:
|
| 293 |
+
skipped_reasons["json_parse_error"] += 1
|
| 294 |
+
continue
|
| 295 |
except Exception as e:
|
| 296 |
skipped_reasons["parse_error"] += 1
|
| 297 |
if idx < 3: # 只顯示前3個錯誤
|
|
|
|
| 299 |
continue
|
| 300 |
|
| 301 |
print(f"數據加載完成: 成功載入 {successful_loads}/{total_items} 項")
|
| 302 |
+
print(f"跳過原因統計: 問題為空({skipped_reasons['empty_question']}) | SQL為空({skipped_reasons['empty_sql']}) | 格式錯誤({skipped_reasons['invalid_format']}) | JSON錯誤({skipped_reasons['json_parse_error']}) | 解析錯誤({skipped_reasons['parse_error']})")
|
| 303 |
return successful_loads > 0
|
| 304 |
except Exception as e:
|
| 305 |
print(f"數據集加載失敗: {e}")
|
|
|
|
| 379 |
|
| 380 |
def initialize_system(self):
|
| 381 |
print("正在初始化完整數據系統...")
|
| 382 |
+
|
| 383 |
+
# 首先預覽數據結構
|
| 384 |
+
self.data_loader.preview_dataset_structure(3)
|
| 385 |
+
|
| 386 |
+
# 然後加載數據
|
| 387 |
self.data_loader.load_complete_dataset()
|
| 388 |
self.data_loader.load_schema()
|
| 389 |
if self.data_loader.questions:
|