Paul720810 committed on
Commit
f327d97
·
verified ·
1 Parent(s): b69d84b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -50
app.py CHANGED
@@ -124,6 +124,33 @@ class CompleteDataLoader:
124
  self.sql_quality = []
125
  self.schema_data = {}
126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  def load_complete_dataset(self) -> bool:
128
  try:
129
  print(f"[{get_current_time()}] 正在加載完整數據集 '{DATASET_REPO_ID}'...")
@@ -131,7 +158,7 @@ class CompleteDataLoader:
131
 
132
  successful_loads = 0
133
  total_items = len(raw_dataset)
134
- skipped_reasons = {"empty_question": 0, "empty_sql": 0, "parse_error": 0, "invalid_format": 0}
135
 
136
  for idx, item in enumerate(raw_dataset):
137
  try:
@@ -142,59 +169,86 @@ class CompleteDataLoader:
142
  # 多種問題提取策略
143
  question = None
144
 
145
- # 策略1: 標準「指令:」格式
146
- question_match = re.search(r'指令:\s*(.*?)(?:\n|$)', user_content)
147
- if question_match:
148
- question = question_match.group(1).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149
 
150
- # 策略2: 如果沒找到,嘗試提取最後一行非空內容
151
  if not question:
152
- lines = [line.strip() for line in user_content.split('\n') if line.strip()]
153
  if lines:
154
- question = lines[-1]
 
 
 
 
 
 
155
 
156
- # 策略3: 直接使用整個內容(作為最後手段)
157
  if not question:
158
  question = user_content.strip()
159
 
160
- # 多種SQL提取策略
161
- sql_query = None
162
-
163
- # 策略1: 標準「SQL查詢:」格式
164
- sql_match = re.search(r'SQL查詢:\s*(.*?)(?:\n|$)', assistant_content, re.DOTALL)
165
- if sql_match:
166
- sql_query = sql_match.group(1).strip()
167
-
168
- # 策略2: SQL代碼塊格式
169
  if not sql_query:
170
- sql_block_match = re.search(r'```sql\s*(.*?)\s*```', assistant_content, re.DOTALL)
171
- if sql_block_match:
172
- sql_query = sql_block_match.group(1).strip()
173
-
174
- # 策略3: 查找任何包含 SELECT 的行
175
- if not sql_query:
176
- for line in assistant_content.split('\n'):
177
- if 'SELECT' in line.upper():
178
- # 從這行開始提取到最後或到下個非SQL行
179
- sql_lines = []
180
- found_start = False
181
- for l in assistant_content.split('\n'):
182
- if 'SELECT' in l.upper():
183
- found_start = True
184
- if found_start:
185
- if l.strip() and not l.strip().startswith('```'):
186
- sql_lines.append(l)
187
- elif l.strip() == '' and sql_lines:
188
- continue
189
- elif found_start and len(sql_lines) > 0:
190
- break
191
- if sql_lines:
192
- sql_query = '\n'.join(sql_lines).strip()
193
- break
194
-
195
- # 策略4: 如果還是沒找到,使用整個assistant內容
196
- if not sql_query:
197
- sql_query = assistant_content.strip()
 
 
 
 
 
 
 
198
 
199
  # 清理SQL查詢
200
  if sql_query:
@@ -202,6 +256,11 @@ class CompleteDataLoader:
202
  sql_query = re.sub(r'^思考過程:.*?\n', '', sql_query, flags=re.MULTILINE).strip()
203
  sql_query = re.sub(r'^SQL查詢:\s*', '', sql_query, flags=re.MULTILINE).strip()
204
 
 
 
 
 
 
205
  # 數據質量驗證(降低標準以提高利用率)
206
  if not question or len(question.strip()) < 3:
207
  skipped_reasons["empty_question"] += 1
@@ -223,13 +282,16 @@ class CompleteDataLoader:
223
 
224
  # 調試:顯示前幾個成功案例
225
  if successful_loads <= 3:
226
- print(f"成功案例 {successful_loads}:")
227
- print(f" 問題: {question[:50]}...")
228
- print(f" SQL: {sql_query[:50]}...")
229
 
230
  else:
231
  skipped_reasons["invalid_format"] += 1
232
 
 
 
 
233
  except Exception as e:
234
  skipped_reasons["parse_error"] += 1
235
  if idx < 3: # 只顯示前3個錯誤
@@ -237,7 +299,7 @@ class CompleteDataLoader:
237
  continue
238
 
239
  print(f"數據加載完成: 成功載入 {successful_loads}/{total_items} 項")
240
- print(f"跳過原因統計: 問題為空({skipped_reasons['empty_question']}) | SQL為空({skipped_reasons['empty_sql']}) | 格式錯誤({skipped_reasons['invalid_format']}) | 解析錯誤({skipped_reasons['parse_error']})")
241
  return successful_loads > 0
242
  except Exception as e:
243
  print(f"數據集加載失敗: {e}")
@@ -317,6 +379,11 @@ class CompleteTextToSQLSystem:
317
 
318
  def initialize_system(self):
319
  print("正在初始化完整數據系統...")
 
 
 
 
 
320
  self.data_loader.load_complete_dataset()
321
  self.data_loader.load_schema()
322
  if self.data_loader.questions:
 
124
  self.sql_quality = []
125
  self.schema_data = {}
126
 
127
def preview_dataset_structure(self, sample_size: int = 5) -> None:
    """Print the first few dataset records to help debug the data format.

    Loads the ``'train'`` split of ``DATASET_REPO_ID`` and, for up to
    ``sample_size`` records, prints the first 100 characters of the first
    two entries of the record's ``messages`` list (presumably the user turn
    followed by the assistant turn -- confirm against the dataset). When
    the second entry's content starts with ``{`` it is parsed as JSON and
    its top-level keys are printed.

    Args:
        sample_size: Maximum number of records to preview.

    Returns:
        None. Output goes to stdout. Any failure while loading or reading
        the dataset is printed instead of raised, so this debugging helper
        never aborts the caller.
    """
    try:
        print(f"📋 預覽數據集結構 (前 {sample_size} 個範例)...")
        raw_dataset = load_dataset(DATASET_REPO_ID, token=self.hf_token)['train']

        for i in range(min(sample_size, len(raw_dataset))):
            item = raw_dataset[i]
            print(f"\n--- 範例 {i+1} ---")

            if 'messages' not in item:
                print(f"無messages字段: {list(item.keys())}")
                continue

            messages = item['messages']
            if len(messages) < 2:
                # A record with fewer than two messages must not abort the
                # whole preview; report it and move on to the next sample.
                print(f"messages 數量不足: {len(messages)}")
                continue

            user_content = messages[0]['content']
            assistant_content = messages[1]['content']
            print(f"User: {user_content[:100]}...")
            print(f"Assistant: {assistant_content[:100]}...")

            # Only attempt a JSON parse when the content looks like an object.
            if assistant_content.strip().startswith('{'):
                try:
                    json_data = json.loads(assistant_content)
                except json.JSONDecodeError:
                    # Catch only the parse failure; a bare except would also
                    # swallow KeyboardInterrupt and genuine programming errors.
                    print("JSON解析失敗")
                else:
                    print(f"JSON Keys: {list(json_data.keys())}")
    except Exception as e:
        # Best-effort boundary: report the problem and return normally.
        print(f"預覽失敗: {e}")
153
+
154
  def load_complete_dataset(self) -> bool:
155
  try:
156
  print(f"[{get_current_time()}] 正在加載完整數據集 '{DATASET_REPO_ID}'...")
 
158
 
159
  successful_loads = 0
160
  total_items = len(raw_dataset)
161
+ skipped_reasons = {"empty_question": 0, "empty_sql": 0, "parse_error": 0, "invalid_format": 0, "json_parse_error": 0}
162
 
163
  for idx, item in enumerate(raw_dataset):
164
  try:
 
169
  # 多種問題提取策略
170
  question = None
171
 
172
+ # 策略1: 檢查是否為JSON格式的回應
173
+ try:
174
+ if assistant_content.strip().startswith('{'):
175
+ json_data = json.loads(assistant_content)
176
+ if 'sql' in json_data:
177
+ sql_query = json_data['sql']
178
+ elif 'query' in json_data:
179
+ sql_query = json_data['query']
180
+ else:
181
+ sql_query = None
182
+
183
+ # 從JSON中提取問題 (如果有的話)
184
+ if 'question' in json_data:
185
+ question = json_data['question']
186
+ elif 'user_query' in json_data:
187
+ question = json_data['user_query']
188
+ else:
189
+ sql_query = None
190
+ except json.JSONDecodeError:
191
+ sql_query = None
192
+
193
+ # 策略2: 標準「指令:」格式
194
+ if not question:
195
+ question_match = re.search(r'指令:\s*(.*?)(?:\n|$)', user_content)
196
+ if question_match:
197
+ question = question_match.group(1).strip()
198
 
199
+ # 策略3: 如果沒找到,嘗試提取最後一行非空內容
200
  if not question:
201
+ lines = [line.strip() for line in user_content.split('\n') if line.strip() and not line.startswith('#')]
202
  if lines:
203
+ # 過濾掉看起來像標題的行
204
+ for line in reversed(lines):
205
+ if not line.startswith('###') and '?' in line and len(line) > 5:
206
+ question = line
207
+ break
208
+ if not question and lines:
209
+ question = lines[-1]
210
 
211
+ # 策略4: 直接使用整個內容(作為最後手段)
212
  if not question:
213
  question = user_content.strip()
214
 
215
+ # SQL提取邏輯(如果還沒從JSON中獲得)
 
 
 
 
 
 
 
 
216
  if not sql_query:
217
+ # 策略1: 標準「SQL查詢:」格式
218
+ sql_match = re.search(r'SQL查詢:\s*(.*?)(?:\n|$)', assistant_content, re.DOTALL)
219
+ if sql_match:
220
+ sql_query = sql_match.group(1).strip()
221
+
222
+ # 策略2: SQL代碼塊格式
223
+ if not sql_query:
224
+ sql_block_match = re.search(r'```sql\s*(.*?)\s*```', assistant_content, re.DOTALL)
225
+ if sql_block_match:
226
+ sql_query = sql_block_match.group(1).strip()
227
+
228
+ # 策略3: 查找任何包含 SELECT 的行
229
+ if not sql_query:
230
+ for line in assistant_content.split('\n'):
231
+ if 'SELECT' in line.upper():
232
+ # 從這行開始提取到最後或到下個非SQL行
233
+ sql_lines = []
234
+ found_start = False
235
+ for l in assistant_content.split('\n'):
236
+ if 'SELECT' in l.upper():
237
+ found_start = True
238
+ if found_start:
239
+ if l.strip() and not l.strip().startswith('```'):
240
+ sql_lines.append(l)
241
+ elif l.strip() == '' and sql_lines:
242
+ continue
243
+ elif found_start and len(sql_lines) > 0:
244
+ break
245
+ if sql_lines:
246
+ sql_query = '\n'.join(sql_lines).strip()
247
+ break
248
+
249
+ # 策略4: 如果還是沒找到,使用整個assistant內容
250
+ if not sql_query:
251
+ sql_query = assistant_content.strip()
252
 
253
  # 清理SQL查詢
254
  if sql_query:
 
256
  sql_query = re.sub(r'^思考過程:.*?\n', '', sql_query, flags=re.MULTILINE).strip()
257
  sql_query = re.sub(r'^SQL查詢:\s*', '', sql_query, flags=re.MULTILINE).strip()
258
 
259
+ # 清理問題文本
260
+ if question:
261
+ question = re.sub(r'^###\s*', '', question).strip()
262
+ question = re.sub(r'Your JSON Response.*', '', question).strip()
263
+
264
  # 數據質量驗證(降低標準以提高利用率)
265
  if not question or len(question.strip()) < 3:
266
  skipped_reasons["empty_question"] += 1
 
282
 
283
  # 調試:顯示前幾個成功案例
284
  if successful_loads <= 3:
285
+ print(f"成功案例 {successful_loads}:")
286
+ print(f" 問題: {question[:80]}...")
287
+ print(f" SQL: {sql_query[:80]}...")
288
 
289
  else:
290
  skipped_reasons["invalid_format"] += 1
291
 
292
+ except json.JSONDecodeError as e:
293
+ skipped_reasons["json_parse_error"] += 1
294
+ continue
295
  except Exception as e:
296
  skipped_reasons["parse_error"] += 1
297
  if idx < 3: # 只顯示前3個錯誤
 
299
  continue
300
 
301
  print(f"數據加載完成: 成功載入 {successful_loads}/{total_items} 項")
302
+ print(f"跳過原因統計: 問題為空({skipped_reasons['empty_question']}) | SQL為空({skipped_reasons['empty_sql']}) | 格式錯誤({skipped_reasons['invalid_format']}) | JSON錯誤({skipped_reasons['json_parse_error']}) | 解析錯誤({skipped_reasons['parse_error']})")
303
  return successful_loads > 0
304
  except Exception as e:
305
  print(f"數據集加載失敗: {e}")
 
379
 
380
  def initialize_system(self):
381
  print("正在初始化完整數據系統...")
382
+
383
+ # 首先預覽數據結構
384
+ self.data_loader.preview_dataset_structure(3)
385
+
386
+ # 然後加載數據
387
  self.data_loader.load_complete_dataset()
388
  self.data_loader.load_schema()
389
  if self.data_loader.questions: