Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -144,7 +144,7 @@ class TextToSQLSystem:
|
|
| 144 |
try:
|
| 145 |
dataset = load_dataset(DATASET_REPO_ID, data_files="training_data.jsonl", split="train")
|
| 146 |
dataset = dataset.filter(lambda ex: isinstance(ex.get("messages"), list) and len(ex["messages"]) >= 2)
|
| 147 |
-
corpus = [item['messages']['content'] for item in dataset]
|
| 148 |
self._log(f"正在編碼 {len(corpus)} 個問題...")
|
| 149 |
all_embeddings = torch.cat([self._encode_texts(corpus[i:i+32]) for i in range(0, len(corpus), 32)], dim=0).numpy()
|
| 150 |
index = faiss.IndexFlatIP(all_embeddings.shape[1])
|
|
@@ -191,8 +191,8 @@ class TextToSQLSystem:
|
|
| 191 |
if idx >= len(self.dataset): continue
|
| 192 |
item = self.dataset[idx]
|
| 193 |
if not (isinstance(item.get('messages'), list) and len(item['messages']) >= 2): continue
|
| 194 |
-
q_content = (item['messages']['content'
|
| 195 |
-
a_content = (item['messages'].get('content') or '').strip()
|
| 196 |
if not q_content or not a_content: continue
|
| 197 |
clean_q = re.sub(r"以下是一個SQL查詢任務:\s*指令:\s*", "", q_content).strip()
|
| 198 |
if clean_q in seen_questions: continue
|
|
|
|
| 144 |
try:
|
| 145 |
dataset = load_dataset(DATASET_REPO_ID, data_files="training_data.jsonl", split="train")
|
| 146 |
dataset = dataset.filter(lambda ex: isinstance(ex.get("messages"), list) and len(ex["messages"]) >= 2)
|
| 147 |
+
corpus = [item['messages'][0]['content'] for item in dataset]
|
| 148 |
self._log(f"正在編碼 {len(corpus)} 個問題...")
|
| 149 |
all_embeddings = torch.cat([self._encode_texts(corpus[i:i+32]) for i in range(0, len(corpus), 32)], dim=0).numpy()
|
| 150 |
index = faiss.IndexFlatIP(all_embeddings.shape[1])
|
|
|
|
| 191 |
if idx >= len(self.dataset): continue
|
| 192 |
item = self.dataset[idx]
|
| 193 |
if not (isinstance(item.get('messages'), list) and len(item['messages']) >= 2): continue
|
| 194 |
+
q_content = (item['messages'][0].get('content') or '').strip()
|
| 195 |
+
a_content = (item['messages'][1].get('content') or '').strip()
|
| 196 |
if not q_content or not a_content: continue
|
| 197 |
clean_q = re.sub(r"以下是一個SQL查詢任務:\s*指令:\s*", "", q_content).strip()
|
| 198 |
if clean_q in seen_questions: continue
|