Spaces:

JackSparrow89
/

Semantic_File

Sleeping

App Files Files Community

Semantic_File / evaluation /query_runner.py

JackSparrow89

Upload 65 files

bb04c5f verified about 1 month ago

raw

history blame contribute delete

4.71 kB

	# evaluation/query_runner.py

	from searcher.search_engine import SearchEngine


	class QueryRunner:
	    """
	    Run evaluation queries through a SearchEngine and collect ranked results.

	    The output of :meth:`run` has the shape
	        {query_id: [(doc_id, score), ...]}
	    where every list holds one entry per doc_id and is sorted by score,
	    highest first. Per the original author's note this is the format the
	    Evaluator consumes.
	    """

	    # Modes that are documented for run(). Any other value still executes the
	    # full pipeline (kept for backward compatibility) but triggers a warning.
	    _KNOWN_MODES = ("dense", "sparse", "hybrid", "full")

	    def __init__(self, config_path: str = "config.yaml"):
	        """Build the SearchEngine from the configuration file at *config_path*."""
	        self.engine = SearchEngine(config_path)

	    def _extract_doc_id(self, filepath: str) -> str:
	        """
	        Strip a "<dataset>://" prefix so the id can match qrels doc_ids.

	        Examples:
	            "scifact://12345"   -> "12345"
	            "nfcorpus://MED-10" -> "MED-10"
	            "/real/file.pdf"    -> "/real/file.pdf"  (no "://", unchanged)

	        Only the first "://" is treated as the separator; everything after
	        it is returned verbatim.
	        """
	        _prefix, separator, remainder = filepath.partition("://")
	        return remainder if separator else filepath

	    def _rank_query(self, query_text: str, top_k: int, mode: str) -> list:
	        """
	        Run one query through the pipeline variant selected by *mode*.

	        Returns a list of (doc_id, score) tuples in the order the engine
	        produced them; duplicates (several hits for one doc_id) are still
	        present at this stage.
	        """
	        engine = self.engine
	        to_doc_id = self._extract_doc_id

	        if mode == "dense":
	            hits = engine.dense_retriever.retrieve(query_text, top_k=top_k)
	            # NOTE(review): the sign flip implies dense_score is
	            # lower-is-better (e.g. a distance) - confirm against the dense
	            # retriever, since run() sorts scores in descending order.
	            return [(to_doc_id(h["filepath"]), -h["dense_score"]) for h in hits]

	        if mode == "sparse":
	            hits = engine.sparse_retriever.retrieve(query_text, top_k=top_k)
	            return [(to_doc_id(h["filepath"]), h["sparse_score"]) for h in hits]

	        if mode == "hybrid":
	            dense_hits = engine.dense_retriever.retrieve(query_text, top_k=top_k)
	            sparse_hits = engine.sparse_retriever.retrieve(query_text, top_k=top_k)
	            fused = engine.fusion_ranker.fuse(dense_hits, sparse_hits, top_k=top_k)
	            return [(to_doc_id(h["filepath"]), h["rrf_score"]) for h in fused]

	        # "full" and, for backward compatibility, any unrecognized mode.
	        output = engine.search(query_text, top_k=top_k)
	        return [
	            (
	                to_doc_id(h["filepath"]),
	                # Prefer the reranker score; fall back to the fusion score,
	                # then to 0 when neither key is present.
	                h.get("rerank_score", h.get("rrf_score", 0)),
	            )
	            for h in output["results"]
	        ]

	    @staticmethod
	    def _best_score_per_doc(ranked: list) -> list:
	        """
	        Collapse repeated doc_ids to their highest score and sort best-first.

	        Several hits can map to the same doc_id (the original comment
	        attributes this to multiple chunks of one document); only the best
	        score per doc_id is kept.
	        """
	        best = {}
	        for doc_id, score in ranked:
	            if doc_id not in best or score > best[doc_id]:
	                best[doc_id] = score
	        return sorted(best.items(), key=lambda item: item[1], reverse=True)

	    def run(
	        self,
	        queries: dict,
	        top_k: int = 100,
	        mode: str = "full",
	    ) -> dict:
	        """
	        Run all queries and return ranked results.

	        Args:
	            queries: {query_id: query_text}
	            top_k:   number of results requested from the engine per query
	                     (after de-duplication a list may be shorter)
	            mode:    pipeline variant to test:
	                     "dense"  -> dense retriever only
	                     "sparse" -> sparse retriever only
	                     "hybrid" -> dense + sparse combined by the fusion ranker
	                     "full"   -> SearchEngine.search()
	                     Any other value prints a warning and runs the full
	                     pipeline.

	        Returns:
	            dict: {query_id: [(doc_id, score), ...]} sorted by score,
	            highest first. A query whose processing raised an exception is
	            reported on stdout and mapped to an empty list.
	        """
	        if mode not in self._KNOWN_MODES:
	            # A silent fallback would let a typo such as "dens" report
	            # full-pipeline numbers under the wrong label.
	            print(f" Warning: unknown mode {mode!r}; falling back to the full pipeline")

	        results = {}
	        total = len(queries)

	        for i, (query_id, query_text) in enumerate(queries.items(), 1):
	            # Progress line every 50 queries.
	            if i % 50 == 0:
	                print(f" Running query {i}/{total}...")

	            try:
	                ranked = self._rank_query(query_text, top_k, mode)
	                results[query_id] = self._best_score_per_doc(ranked)
	            except Exception as e:
	                # Deliberate best-effort: one failing query must not abort
	                # the whole evaluation run; it is recorded as "no results".
	                print(f" Error on query {query_id}: {e}")
	                results[query_id] = []

	        return results


	if __name__ == "__main__":
	    # Manual smoke test: load queries from data/scifact, run them through the
	    # full pipeline with top_k=10, and print the top five hits of the first
	    # query.
	    from evaluation.dataset_loader import DatasetLoader

	    loader = DatasetLoader("data/scifact")
	    queries = loader.load_queries()

	    runner = QueryRunner()
	    results = runner.run(queries, top_k=10, mode="full")

	    if not results:
	        # run() yields one entry per query, so an empty dict means no queries
	        # were loaded; indexing the first key would raise IndexError.
	        print("No queries were run; nothing to display.")
	    else:
	        sample_qid = next(iter(results))
	        print(f"\nQuery {sample_qid} top results:")
	        for doc_id, score in results[sample_qid][:5]:
	            print(f" doc {doc_id} score={score:.4f}")