# NLP-RAG / test.py
# Provenance: synced backend Docker context from GitHub main (commit c27a4e3, verified).
import os
# Cap BLAS/OpenMP thread pools to a single thread. These must be set BEFORE
# any numeric library that reads them is imported (they are read once at
# library load time) — presumably to avoid thread oversubscription when the
# embedding/rerank models run; confirm against deployment constraints.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
import sys
import traceback
from datetime import datetime
from dotenv import load_dotenv
# Project-local modules: config, vector store access, retrieval pipeline,
# and the registry of chunking techniques produced at ingest time.
from config_loader import cfg
from data.vector_db import get_index_by_name
from retriever.retriever import HybridRetriever
from retriever.processor import ChunkProcessor
from data.ingest import CHUNKING_TECHNIQUES
def generate_retrieval_report(all_results, queries, output_file="retrieval_report.md"):
    """
    Generate a Markdown document summarizing the retrieved chunks for each
    query, chunking technique, and retrieval strategy.

    Args:
        all_results: mapping of query index -> {"<technique> + <strategy>":
            {"chunks": list[str], "score": float}}, as built by main().
        queries: list of query strings; the keys of all_results index into it.
        output_file: path of the Markdown report to write (UTF-8).

    Side effects:
        Writes the report to ``output_file`` and prints its location.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # Accumulate fragments and join once at the end — avoids the quadratic
    # cost of repeated `content +=` on a growing string.
    parts = [f"# Retrieval Testing Report\n\n*Generated:* {timestamp}\n\n"]
    parts.append("## Test Queries\n\n")
    for i, q in enumerate(queries, 1):
        parts.append(f"{i}. {q}\n")
    parts.append("\n## Retrieval Results by Query\n\n")
    for q_idx, q_results in all_results.items():
        parts.append(f"### Query {q_idx + 1}: {queries[q_idx]}\n\n")
        for tech_strat_key, chunks_data in q_results.items():
            parts.append(f"#### Strategy & Technique: {tech_strat_key}\n\n")
            chunks = chunks_data.get('chunks', [])
            # A missing score is reported as 0 rather than failing the report.
            score = chunks_data.get('score', 0)
            parts.append(f"**ChunkScore:** {score:.4f} | **Chunks retrieved:** {len(chunks)}\n\n")
            if not chunks:
                parts.append("*No chunks retrieved.*\n\n")
            else:
                for i, chunk in enumerate(chunks, 1):
                    parts.append(f"**[Chunk {i}]** ({len(chunk)} chars):\n")
                    parts.append(f"```text\n{chunk}\n```\n\n")
        # Horizontal rule separating queries.
        parts.append("---\n\n")
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(parts))
    print(f"\nRetrieval report saved to: {output_file}")
def main():
    """
    Evaluate retrieval quality over every (chunking technique, retrieval
    strategy) combination for a fixed set of CBT-related test queries,
    then write a Markdown report via generate_retrieval_report().

    Raises:
        RuntimeError: if PINECONE_API_KEY is not present in the environment.
    """
    # Load environment variables
    load_dotenv()
    pinecone_key = os.getenv("PINECONE_API_KEY")
    if not pinecone_key:
        raise RuntimeError("PINECONE_API_KEY not found in environment variables")

    test_queries = [
        "What is cognitive behavior therapy and how does it work?",
        "I feel like a complete failure because I made a mistake at work today. Everyone must think I am incompetent, and I will probably get fired. I just want to hide.",
        "No matter what I do, my anxiety will not go away. I am constantly worried about the future and avoid social situations because of it.",
        "I have been feeling really down lately and have no energy. It feels like nothing will ever get better and there is no point in trying."
    ]

    # TECHNIQUES_TO_EVALUATE = ["fixed", "semantic", "markdown", "page"]
    # Use all 7 chunking techniques from ingest.py
    CHUNKING_TECHNIQUES_FILTERED = CHUNKING_TECHNIQUES
    print(f"Testing all {len(CHUNKING_TECHNIQUES_FILTERED)} chunking techniques:")
    for tech in CHUNKING_TECHNIQUES_FILTERED:
        print(f" - {tech['name']}: {tech['description']}")

    # Cartesian product of these strategies with the chunking techniques
    # defines the evaluation grid.
    RETRIEVAL_STRATEGIES = [
        {"mode": "semantic", "use_mmr": False, "label": "semantic-no-mmr"},
        {"mode": "semantic", "use_mmr": True, "label": "semantic-with-mmr"},
        {"mode": "hybrid", "use_mmr": False, "label": "hybrid-no-mmr"},
        {"mode": "hybrid", "use_mmr": True, "label": "hybrid-with-mmr"},
        {"mode": "bm25", "use_mmr": False, "label": "bm25-no-mmr"},
    ]

    # Heavy model loads happen once, up front, outside the query loop.
    print("Initializing ChunkProcessor to load Embedding Model...")
    proc = ChunkProcessor(model_name=cfg.processing['embedding_model'], verbose=False)
    print("Initializing HybridRetriever...")
    retriever = HybridRetriever(
        embed_model=proc.encoder,
        rerank_model_name='jinaai/jina-reranker-v1-tiny-en',
        verbose=False
    )

    all_query_results = {}
    for query_idx, query in enumerate(test_queries):
        print(f"\n{'='*80}")
        print(f"PROCESSING QUERY {query_idx + 1}/{len(test_queries)}: {query}")
        print(f"{'='*80}")
        query_results = {}

        # Connect to the single index where all techniques are stored with metadata differentiation
        index_name = "cbt-book-recursive"
        try:
            index = get_index_by_name(pinecone_key, index_name)
            stats = index.describe_index_stats()
            # An empty index is tolerated so sparse (bm25) retrieval can
            # still be exercised.
            if stats.get('total_vector_count', 0) == 0:
                print(f" [!] Warning: Index {index_name} is empty. Proceeding for sparse test.")
        except Exception as e:
            # Connection failure skips this query but lets the run continue.
            print(f" [X] Failed to connect to index {index_name}: {e}")
            continue

        for technique in CHUNKING_TECHNIQUES_FILTERED:
            technique_name = technique['name']
            for strategy in RETRIEVAL_STRATEGIES:
                result_key = f"{technique_name} + {strategy['label']}"
                print(f"\nEvaluating: {result_key}")
                try:
                    context_chunks, chunk_score = retriever.search(
                        query=query,
                        index=index,
                        mode=strategy['mode'],
                        rerank_strategy="cross-encoder",
                        use_mmr=strategy['use_mmr'],
                        top_k=25,
                        final_k=4,
                        technique_name=technique_name,
                        verbose=False,
                        test=True
                    )
                    query_results[result_key] = {
                        'chunks': context_chunks,
                        'score': chunk_score
                    }
                    print(f" -> Retrieved {len(context_chunks)} chunks (Score: {chunk_score:.4f})")
                except Exception as e:
                    # A failing combination is logged and skipped; the grid
                    # evaluation continues with the remaining combinations.
                    print(f" -> Error retrieving for {result_key}: {e}")
        all_query_results[query_idx] = query_results

    # Generate isolated retrieval test report
    generate_retrieval_report(all_query_results, test_queries)


if __name__ == '__main__':
    main()