Spaces:
Sleeping
Sleeping
Commit
·
e7a2175
1
Parent(s):
7743187
Try to fix NDCG bug
Browse files
app.py
CHANGED
|
@@ -103,13 +103,13 @@ class RepLlamaModel:
|
|
| 103 |
batch_dict = create_batch_dict(self.tokenizer, batch_texts, always_add_eos="last")
|
| 104 |
batch_dict = {key: value.cuda() for key, value in batch_dict.items()}
|
| 105 |
|
| 106 |
-
with torch.cuda.amp.autocast():
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
|
| 114 |
self.model = self.model.cpu()
|
| 115 |
return np.concatenate(all_embeddings, axis=0)
|
|
@@ -150,6 +150,7 @@ def load_corpus_lookups(dataset_name):
|
|
| 150 |
_, p_lookup = pickle.load(f)
|
| 151 |
corpus_lookups[dataset_name] += p_lookup
|
| 152 |
logger.info(f"Loaded corpus lookups for {dataset_name}. Total entries: {len(corpus_lookups[dataset_name])}")
|
|
|
|
| 153 |
|
| 154 |
def load_queries(dataset_name):
|
| 155 |
global queries, q_lookups, qrels
|
|
@@ -181,6 +182,12 @@ def evaluate(qrels, results, k_values):
|
|
| 181 |
for k in k_values:
|
| 182 |
metrics[f"NDCG@{k}"] = round(np.mean([query_scores[f"ndcg_cut_{k}"] for query_scores in scores.values()]), 3)
|
| 183 |
metrics[f"Recall@{k}"] = round(np.mean([query_scores[f"recall_{k}"] for query_scores in scores.values()]), 3)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
return metrics
|
| 186 |
|
|
@@ -190,19 +197,35 @@ def run_evaluation(dataset, postfix):
|
|
| 190 |
current_dataset = dataset
|
| 191 |
|
| 192 |
input_texts = [f"query: {query.strip()} {postfix}".strip() for query in queries[current_dataset]]
|
|
|
|
|
|
|
|
|
|
| 193 |
q_reps = model.encode(input_texts)
|
|
|
|
|
|
|
| 194 |
all_scores, psg_indices = search_queries(dataset, q_reps)
|
| 195 |
|
| 196 |
results = {qid: dict(zip(doc_ids, map(float, scores)))
|
| 197 |
for qid, scores, doc_ids in zip(q_lookups[dataset].keys(), all_scores, psg_indices)}
|
| 198 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
metrics = evaluate(qrels[dataset], results, k_values=[10, 100])
|
| 200 |
|
| 201 |
-
return
|
| 202 |
-
"NDCG@10": metrics["NDCG@10"],
|
| 203 |
-
"Recall@100": metrics["Recall@100"]
|
| 204 |
-
}
|
| 205 |
-
|
| 206 |
|
| 207 |
@spaces.GPU
|
| 208 |
def gradio_interface(dataset, postfix):
|
|
|
|
| 103 |
batch_dict = create_batch_dict(self.tokenizer, batch_texts, always_add_eos="last")
|
| 104 |
batch_dict = {key: value.cuda() for key, value in batch_dict.items()}
|
| 105 |
|
| 106 |
+
# with torch.cuda.amp.autocast():
|
| 107 |
+
with torch.no_grad():
|
| 108 |
+
outputs = self.model(**batch_dict)
|
| 109 |
+
embeddings = pool(outputs.last_hidden_state, batch_dict['attention_mask'], 'last')
|
| 110 |
+
embeddings = F.normalize(embeddings, p=2, dim=-1)
|
| 111 |
+
logger.info(f"Encoded shape: {embeddings.shape}, Norm of first embedding: {torch.norm(embeddings[0]).item()}")
|
| 112 |
+
all_embeddings.append(embeddings.cpu().numpy())
|
| 113 |
|
| 114 |
self.model = self.model.cpu()
|
| 115 |
return np.concatenate(all_embeddings, axis=0)
|
|
|
|
| 150 |
_, p_lookup = pickle.load(f)
|
| 151 |
corpus_lookups[dataset_name] += p_lookup
|
| 152 |
logger.info(f"Loaded corpus lookups for {dataset_name}. Total entries: {len(corpus_lookups[dataset_name])}")
|
| 153 |
+
logger.info(f"Sample corpus lookup entry: {corpus_lookups[dataset_name][0]}")
|
| 154 |
|
| 155 |
def load_queries(dataset_name):
|
| 156 |
global queries, q_lookups, qrels
|
|
|
|
| 182 |
for k in k_values:
|
| 183 |
metrics[f"NDCG@{k}"] = round(np.mean([query_scores[f"ndcg_cut_{k}"] for query_scores in scores.values()]), 3)
|
| 184 |
metrics[f"Recall@{k}"] = round(np.mean([query_scores[f"recall_{k}"] for query_scores in scores.values()]), 3)
|
| 185 |
+
logger.info(f"NDCG@{k}: mean={metrics[f'NDCG@{k}']}, min={min(ndcg_scores)}, max={max(ndcg_scores)}")
|
| 186 |
+
logger.info(f"Recall@{k}: mean={metrics[f'Recall@{k}']}, min={min(recall_scores)}, max={max(recall_scores)}")
|
| 187 |
+
|
| 188 |
+
# Log how many queries were scored and one sample per-query score entry.
|
| 189 |
+
logger.info(f"Number of queries evaluated: {len(scores)}")
|
| 190 |
+
logger.info(f"Sample evaluation score: {list(scores.items())[0]}")
|
| 191 |
|
| 192 |
return metrics
|
| 193 |
|
|
|
|
| 197 |
current_dataset = dataset
|
| 198 |
|
| 199 |
input_texts = [f"query: {query.strip()} {postfix}".strip() for query in queries[current_dataset]]
|
| 200 |
+
logger.info(f"Number of input texts: {len(input_texts)}")
|
| 201 |
+
logger.info(f"Sample input text: {input_texts[0]}")
|
| 202 |
+
|
| 203 |
q_reps = model.encode(input_texts)
|
| 204 |
+
logger.info(f"Encoded query representations shape: {q_reps.shape}")
|
| 205 |
+
|
| 206 |
all_scores, psg_indices = search_queries(dataset, q_reps)
|
| 207 |
|
| 208 |
results = {qid: dict(zip(doc_ids, map(float, scores)))
|
| 209 |
for qid, scores, doc_ids in zip(q_lookups[dataset].keys(), all_scores, psg_indices)}
|
| 210 |
|
| 211 |
+
logger.info(f"Number of results: {len(results)}")
|
| 212 |
+
logger.info(f"Sample result: {list(results.items())[0]}")
|
| 213 |
+
|
| 214 |
+
# Log the size of qrels and results plus one sample entry from each.
|
| 215 |
+
logger.info(f"Number of queries in qrels: {len(qrels[dataset])}")
|
| 216 |
+
logger.info(f"Sample qrel: {list(qrels[dataset].items())[0]}")
|
| 217 |
+
logger.info(f"Number of queries in results: {len(results)}")
|
| 218 |
+
logger.info(f"Sample result: {list(results.items())[0]}")
|
| 219 |
+
|
| 220 |
+
# Check for mismatches
|
| 221 |
+
qrels_keys = set(qrels[dataset].keys())
|
| 222 |
+
results_keys = set(results.keys())
|
| 223 |
+
logger.info(f"Queries in qrels but not in results: {qrels_keys - results_keys}")
|
| 224 |
+
logger.info(f"Queries in results but not in qrels: {results_keys - qrels_keys}")
|
| 225 |
+
|
| 226 |
metrics = evaluate(qrels[dataset], results, k_values=[10, 100])
|
| 227 |
|
| 228 |
+
return metrics
|
|
|
|
|
|
|
|
|
|
|
|
|
| 229 |
|
| 230 |
@spaces.GPU
|
| 231 |
def gradio_interface(dataset, postfix):
|