Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -108,7 +108,7 @@ class TextToSQLSystem:
| 108 |   | try:
| 109 |   | dataset = load_dataset(DATASET_REPO_ID, data_files="training_data.jsonl", split="train")
| 110 |   | dataset = dataset.filter(lambda ex: isinstance(ex.get("messages"), list) and len(ex["messages"]) >= 2)
| 111 | - | corpus = [item['messages']['content'] for item in dataset]
| 112 |   | self._log(f"正在編碼 {len(corpus)} 個問題...")
| 113 |   | all_embeddings = torch.cat([self._encode_texts(corpus[i:i+32]) for i in range(0, len(corpus), 32)], dim=0).numpy()
| 114 |   | index = faiss.IndexFlatIP(all_embeddings.shape)
|
@@ -153,8 +153,8 @@ class TextToSQLSystem:
| 153 |   | if idx >= len(self.dataset): continue
| 154 |   | item = self.dataset[idx]
| 155 |   | if not (isinstance(item.get('messages'), list) and len(item['messages']) >= 2): continue
| 156 | - | q_content = (item['messages'].get('content') or '').strip()
| 157 | - | a_content = (item['messages'].get('content') or '').strip()
| 158 |   | if not q_content or not a_content: continue
| 159 |   | clean_q = re.sub(r"以下是一個SQL查詢任務:\s*指令:\s*", "", q_content).strip()
| 160 |   | if clean_q in seen_questions: continue
|
|
|
| 108 |   | try:
| 109 |   | dataset = load_dataset(DATASET_REPO_ID, data_files="training_data.jsonl", split="train")
| 110 |   | dataset = dataset.filter(lambda ex: isinstance(ex.get("messages"), list) and len(ex["messages"]) >= 2)
| 111 | + | corpus = [item['messages'][0]['content'] for item in dataset]
| 112 |   | self._log(f"正在編碼 {len(corpus)} 個問題...")
| 113 |   | all_embeddings = torch.cat([self._encode_texts(corpus[i:i+32]) for i in range(0, len(corpus), 32)], dim=0).numpy()
| 114 |   | index = faiss.IndexFlatIP(all_embeddings.shape)
|
|
|
| 153 |   | if idx >= len(self.dataset): continue
| 154 |   | item = self.dataset[idx]
| 155 |   | if not (isinstance(item.get('messages'), list) and len(item['messages']) >= 2): continue
| 156 | + | q_content = (item['messages'][0].get('content') or '').strip()
| 157 | + | a_content = (item['messages'][1].get('content') or '').strip()
| 158 |   | if not q_content or not a_content: continue
| 159 |   | clean_q = re.sub(r"以下是一個SQL查詢任務:\s*指令:\s*", "", q_content).strip()
| 160 |   | if clean_q in seen_questions: continue