Spaces:

ThatITGuy
/

IJNet-assistant

Sleeping

Mohammad Haris

Deploy IJNet assistant

b87aca1 about 1 month ago

13.4 kB

	"""
	RAG Pipeline Evaluation (v2)
	------------------------------
	Comprehensive tests covering:
	- Query classification accuracy
	- Retrieval recall against expected document IDs
	- Guardrail validation (on-topic / off-topic detection)
	- End-to-end response quality (requires GROQ_API_KEY)

	Usage:
	python evaluate.py # classification + retrieval + guardrail tests
	GROQ_API_KEY=xxx python evaluate.py # all tests including end-to-end
	"""

	import os
	import sys
	import json
	from pathlib import Path
	from datetime import datetime

	sys.path.insert(0, str(Path(__file__).parent))

	from src.ingest import load_knowledge_base, build_documents, get_embeddings, build_vector_store, load_vector_store
	from src.retriever import HybridRetriever, classify_query
	from src.chain import check_guardrails


	# ---------------------------------------------------------------------------
	# TEST CASES
	# ---------------------------------------------------------------------------

	# Retrieval cases consumed by run_retrieval_tests. Every case carries a
	# "query" and a human-readable "description", plus either "expected_ids"
	# (document IDs that should show up in the results) or "expected_type".
	RETRIEVAL_TESTS = [
	    {
	        "query": "What opportunities are available for investigative journalists in Africa?",
	        "expected_ids": ["opp-002", "opp-017"],
	        "description": "Region + topic filter — Africa investigative",
	    },
	    {
	        "query": "Find fellowships with deadlines in the next 30 days",
	        "expected_type": "fellowship",
	        "description": "Deadline + type filter — fellowships",
	    },
	    {
	        "query": "What resources does IJNet have on AI tools for journalists?",
	        "expected_ids": ["art-001", "opp-007", "opp-020"],
	        "description": "Topic search — AI tools",
	    },
	    {
	        "query": "Can you summarize the latest opportunities for product/design people in newsrooms?",
	        "expected_ids": ["art-003", "opp-016", "opp-015"],
	        "description": "Product/design role search",
	    },
	    {
	        "query": "Which IJNet newsletter should I subscribe to?",
	        "expected_ids": ["art-002"],
	        "description": "Newsletter-specific query",
	    },
	    {
	        "query": "What grants are available for data journalism?",
	        "expected_ids": ["opp-005", "opp-013"],
	        "description": "Grant type + data journalism topic",
	    },
	    {
	        "query": "Tell me about digital security for journalists",
	        "expected_ids": ["art-004"],
	        "description": "Article retrieval — digital security",
	    },
	    {
	        "query": "What training programs exist for journalists in the Middle East?",
	        "expected_ids": ["opp-007", "opp-012"],
	        "description": "Region filter — MENA",
	    },
	    {
	        "query": "Climate change reporting opportunities",
	        "expected_ids": ["opp-004", "opp-008"],
	        "description": "Topic — environment/climate",
	    },
	    {
	        "query": "What is IJNet?",
	        "expected_ids": ["ijnet-about"],
	        "description": "About IJNet query",
	    },
	    {
	        "query": "Opportunities for women journalists in Africa",
	        "expected_ids": ["opp-011"],
	        "description": "Women + Africa filter",
	    },
	    {
	        "query": "How can freelance journalists find funding?",
	        "expected_ids": ["art-006"],
	        "description": "Freelance funding article",
	    },
	    {
	        "query": "fact-checking training workshops",
	        "expected_ids": ["opp-017"],
	        "description": "Fact-checking topic",
	    },
	    {
	        "query": "press freedom fellowships",
	        "expected_ids": ["opp-019"],
	        "description": "Press freedom topic",
	    },
	    {
	        "query": "mobile journalism webinar",
	        "expected_ids": ["opp-012"],
	        "description": "MoJo / mobile journalism",
	    },
	]

	# Classification cases consumed by run_classification_tests, laid out as
	# (query, expected intent, expected filters). An empty filter dict means
	# only the intent is checked for that case.
	CLASSIFICATION_TESTS = [
	    (
	        "Find fellowships with deadlines in the next 30 days",
	        "deadline_search",
	        {"deadline_days": 30},
	    ),
	    ("Opportunities for journalists in Africa", "region_search", {}),
	    ("Which newsletter should I subscribe to?", "newsletter", {}),
	    ("What is IJNet?", "about", {}),
	    ("AI tools for newsrooms", "general", {}),
	    (
	        "Grants expiring within 60 days",
	        "deadline_search",
	        {"deadline_days": 60},
	    ),
	    ("Training programs in the Middle East", "region_search", {}),
	    (
	        "Fellowships closing in the next 2 weeks",
	        "deadline_search",
	        {"deadline_days": 14},
	    ),
	    ("Data journalism awards", "general", {}),
	    ("What opportunities are there in South Asia?", "region_search", {}),
	]

	# Guardrail cases consumed by run_guardrail_tests, laid out as
	# (query, should_be_allowed).
	GUARDRAIL_TESTS = [
	    # On-topic queries, greetings and short prompts: must be allowed.
	    ("What fellowships are available for African journalists?", True),
	    ("Tell me about AI tools for newsrooms", True),
	    ("Which IJNet newsletter should I subscribe to?", True),
	    ("Hello", True),
	    ("Thanks for the help!", True),
	    ("What grants exist?", True),
	    ("help", True),
	    # Off-topic queries: must be blocked.
	    ("Write me a poem about the moon", False),
	    ("What's the weather in New York?", False),
	    ("How do I cook pasta carbonara?", False),
	    ("Solve this math equation: 2x + 5 = 15", False),
	    ("Translate this to French: hello world", False),
	    ("Tell me a joke", False),
	    # Edge cases: journalism-adjacent, so they should still be allowed.
	    ("How can journalists use AI?", True),
	    ("media training opportunities", True),
	    ("press freedom in Asia", True),
	]

	# End-to-end cases consumed by run_e2e_tests. "must_contain" and
	# "must_not_contain" list terms checked against the generated answer;
	# "description" is the label printed for the case.
	E2E_TESTS = [
	    {
	        "query": "What opportunities are available for investigative journalists in Africa?",
	        "must_contain": ["Africa", "investigat"],
	        "must_not_contain": ["I don't have information"],
	        "description": "Should find African investigative opportunities",
	    },
	    {
	        "query": "Which IJNet newsletter should I subscribe to?",
	        "must_contain": ["newsletter", "subscribe"],
	        "must_not_contain": ["I don't have information"],
	        "description": "Should describe newsletter options",
	    },
	    {
	        "query": "Write me a poem about the ocean",
	        "must_contain": ["journalism", "IJNet"],
	        "must_not_contain": ["ocean", "poem", "sea"],
	        "description": "Should reject off-topic and redirect",
	    },
	    {
	        "query": "What AI tools can journalists use?",
	        "must_contain": ["AI", "tool"],
	        "must_not_contain": ["I don't have information"],
	        "description": "Should discuss AI tools from the article",
	    },
	    {
	        "query": "Are there any grants for data journalism?",
	        "must_contain": ["data journalism", "grant"],
	        "must_not_contain": [],
	        "description": "Should find data journalism grants",
	    },
	]


	# ---------------------------------------------------------------------------
	# TEST RUNNERS
	# ---------------------------------------------------------------------------

	def run_classification_tests():
	    """Run every case in CLASSIFICATION_TESTS through classify_query and report accuracy.

	    A case passes when the returned ``"intent"`` equals the expected intent
	    and every expected filter key holds the expected value in the returned
	    ``"filters"`` mapping. Filters returned beyond the expected ones are
	    ignored.

	    Returns:
	        tuple[int, int]: (number of passing cases, total number of cases).
	    """
	    print("\n" + "=" * 60)
	    print("1. QUERY CLASSIFICATION TESTS")
	    print("=" * 60)

	    passed = 0
	    for query, expected_intent, expected_filters in CLASSIFICATION_TESTS:
	        result = classify_query(query)
	        intent_match = result["intent"] == expected_intent

	        # Collect the actual value of every filter that differs from the
	        # expectation so a failure can say *which* filter was wrong.
	        # result["filters"] is only touched when filters are expected.
	        mismatched_filters = {
	            key: result["filters"].get(key)
	            for key, expected_value in expected_filters.items()
	            if result["filters"].get(key) != expected_value
	        }
	        test_passed = intent_match and not mismatched_filters

	        status = "✅" if test_passed else "❌"
	        label = f"{query[:55]}..." if len(query) > 55 else query
	        print(f" {status} \"{label}\"")

	        if test_passed:
	            passed += 1
	            continue

	        print(f" Expected: {expected_intent}, Got: {result['intent']}")
	        if mismatched_filters:
	            # Without this line a filter-only failure prints identical
	            # expected/actual intents and gives no clue about the cause.
	            print(f" Expected filters: {expected_filters}, Got: {mismatched_filters}")

	    total = len(CLASSIFICATION_TESTS)
	    print(f"\n Result: {passed}/{total} passed ({passed/total:.0%})")
	    return passed, total


	def run_guardrail_tests():
	    """Check check_guardrails against every (query, should_allow) pair in GUARDRAIL_TESTS.

	    A case passes when the allow/deny flag returned by ``check_guardrails``
	    equals the expected flag; the accompanying message is not inspected.

	    Returns:
	        tuple[int, int]: (number of passing cases, total number of cases).
	    """
	    banner = "=" * 60
	    print("\n" + banner)
	    print("2. GUARDRAIL TESTS")
	    print(banner)

	    outcomes = []
	    for text, want_allowed in GUARDRAIL_TESTS:
	        # The second element (the guardrail message) is not part of the check.
	        got_allowed, _message = check_guardrails(text)
	        verdict_ok = got_allowed == want_allowed
	        outcomes.append(verdict_ok)

	        if verdict_ok:
	            mark = "✅"
	        else:
	            mark = "❌"
	        wanted_label = "allow" if want_allowed else "block"
	        got_label = "allowed" if got_allowed else "blocked"
	        print(f" {mark} [{wanted_label}] \"{text[:50]}\" → {got_label}")

	    passed = sum(1 for ok in outcomes if ok)
	    total = len(GUARDRAIL_TESTS)
	    print(f"\n Result: {passed}/{total} passed ({passed/total:.0%})")
	    return passed, total


	def run_retrieval_tests(retriever: HybridRetriever, type_key: str = "type"):
	    """Run every case in RETRIEVAL_TESTS against the retriever and report results.

	    For each case ``retriever.retrieve(query)`` is called and up to two
	    expectations are checked:

	    * ``expected_ids``: at least half of the listed IDs must appear among
	      the ``doc_id`` metadata values of the retrieved documents.
	    * ``expected_type``: at least one retrieved document must carry that
	      value (compared case-insensitively) under ``metadata[type_key]``.

	    A case that declares neither expectation passes trivially.

	    Args:
	        retriever: Object exposing ``retrieve(query)`` that returns documents
	            with a ``metadata`` mapping.
	        type_key: Metadata key that holds a document's type.
	            NOTE(review): the default "type" is an assumption; confirm it
	            against the metadata written by src.ingest.build_documents.

	    Returns:
	        tuple[int, int]: (number of passing cases, total number of cases).
	    """
	    print("\n" + "=" * 60)
	    print("3. RETRIEVAL TESTS")
	    print("=" * 60)

	    passed = 0
	    total = len(RETRIEVAL_TESTS)

	    for test in RETRIEVAL_TESTS:
	        expected_ids = test.get("expected_ids", [])
	        expected_type = test.get("expected_type")

	        results = retriever.retrieve(test["query"])
	        retrieved_ids = [doc.metadata.get("doc_id", "") for doc in results]

	        # One diagnostic line per failed expectation; empty means "passed".
	        failures = []

	        if expected_ids:
	            found = [eid for eid in expected_ids if eid in retrieved_ids]
	            recall = len(found) / len(expected_ids)
	            if recall < 0.5:
	                failures.append(f" Expected: {expected_ids}, Got: {retrieved_ids}")

	        if expected_type is not None:
	            # Previously this expectation was read but never verified, so
	            # type-only cases could not fail.
	            retrieved_types = [doc.metadata.get(type_key) for doc in results]
	            type_found = any(
	                isinstance(doc_type, str) and doc_type.lower() == expected_type.lower()
	                for doc_type in retrieved_types
	            )
	            if not type_found:
	                failures.append(
	                    f" Expected type: {expected_type}, Got types: {retrieved_types}"
	                )

	        status = "❌" if failures else "✅"
	        print(f" {status} {test['description']}")
	        for line in failures:
	            print(line)

	        if not failures:
	            passed += 1

	    print(f"\n Result: {passed}/{total} passed ({passed/total:.0%})")
	    return passed, total


	def run_e2e_tests(retriever: HybridRetriever):
	    """Run the E2E_TESTS cases through the full RAG chain and report results.

	    The suite is skipped (returning ``(0, 0)``) when the GROQ_API_KEY
	    environment variable is unset or empty. Otherwise an ``IJNetRAGChain``
	    is built around ``retriever`` and each query's ``"answer"`` is
	    lower-cased and checked:

	    * ``must_contain`` terms are matched as case-insensitive substrings, so
	      stems such as "investigat" work.
	    * ``must_not_contain`` terms are matched case-insensitively as whole
	      words or phrases, so a forbidden "sea" does not fire on "search" or
	      "research".

	    A case whose chain call raises is reported as failed and the run
	    continues with the next case.

	    Args:
	        retriever: Retriever handed to ``IJNetRAGChain``.

	    Returns:
	        tuple[int, int]: (number of passing cases, total number of cases),
	        or ``(0, 0)`` when the suite is skipped.
	    """
	    # Local import: this block is the only user of `re` in the file.
	    import re

	    api_key = os.environ.get("GROQ_API_KEY")
	    if not api_key:
	        print("\n" + "=" * 60)
	        print("4. END-TO-END TESTS (SKIPPED — set GROQ_API_KEY to run)")
	        print("=" * 60)
	        return 0, 0

	    print("\n" + "=" * 60)
	    print("4. END-TO-END TESTS")
	    print("=" * 60)

	    # Imported lazily so the chain module is only loaded when the suite runs.
	    from src.chain import IJNetRAGChain
	    chain = IJNetRAGChain(retriever=retriever, groq_api_key=api_key)

	    def is_present_as_word(term, text):
	        """Return True when `term` occurs in `text` bounded by non-word characters."""
	        pattern = r"(?<!\w)" + re.escape(term.lower()) + r"(?!\w)"
	        return re.search(pattern, text) is not None

	    passed = 0
	    for test in E2E_TESTS:
	        # Only the remote call and answer extraction are guarded; a failure
	        # there counts as a failed case rather than aborting the whole run.
	        try:
	            result = chain.query(test["query"])
	            answer = result["answer"].lower()
	        except Exception as e:
	            print(f" ❌ {test['description']} — Error: {e}")
	            continue

	        missing = [t for t in test["must_contain"] if t.lower() not in answer]
	        found_bad = [
	            t for t in test["must_not_contain"] if is_present_as_word(t, answer)
	        ]

	        test_passed = not missing and not found_bad
	        status = "✅" if test_passed else "❌"
	        print(f" {status} {test['description']}")

	        if test_passed:
	            passed += 1
	            continue

	        if missing:
	            print(f" Missing terms: {missing}")
	        if found_bad:
	            print(f" Unwanted terms found: {found_bad}")
	        print(f" Response preview: {answer[:150]}...")

	    total = len(E2E_TESTS)
	    print(f"\n Result: {passed}/{total} passed ({passed/total:.0%})")
	    return passed, total


	# ---------------------------------------------------------------------------
	# MAIN
	# ---------------------------------------------------------------------------

	def main():
	    """Build the retrieval pipeline, run all four test suites, and print a summary.

	    The knowledge base is loaded from ``data/knowledge_base.json`` and the
	    vector store from ``data/faiss_index``; when that index path does not
	    exist, ``build_vector_store`` is called with it instead. Both paths are
	    resolved relative to this script's directory, so the script behaves the
	    same regardless of the current working directory.

	    Suites that report zero total cases (for example the skipped end-to-end
	    suite) are left out of the summary and the overall figure.
	    """
	    print("=" * 60)
	    print("IJNet RAG Pipeline — Evaluation Suite (v2)")
	    print("=" * 60)

	    # Anchor data paths to the script location, mirroring the sys.path
	    # setup at module import. When run from the script directory this
	    # yields the same "data/..." strings as before.
	    data_dir = Path(__file__).parent / "data"

	    # Initialize
	    print("\nInitializing pipeline...")
	    kb = load_knowledge_base(str(data_dir / "knowledge_base.json"))
	    documents = build_documents(kb)
	    embeddings = get_embeddings()

	    index_path = str(data_dir / "faiss_index")
	    if Path(index_path).exists():
	        vector_store = load_vector_store(index_path, embeddings)
	    else:
	        vector_store = build_vector_store(documents, embeddings, index_path)

	    retriever = HybridRetriever(
	        vector_store=vector_store,
	        documents=documents,
	        semantic_k=8,
	        bm25_k=8,
	        final_k=5,
	    )
	    print(f"Pipeline ready. {len(documents)} documents indexed.")

	    # Run all test suites; each runner returns (passed, total).
	    results = [
	        ("Classification", *run_classification_tests()),
	        ("Guardrails", *run_guardrail_tests()),
	        ("Retrieval", *run_retrieval_tests(retriever)),
	        ("End-to-End", *run_e2e_tests(retriever)),
	    ]

	    # Summary
	    print("\n" + "=" * 60)
	    print("SUMMARY")
	    print("=" * 60)

	    total_passed = 0
	    total_tests = 0
	    for name, passed, total in results:
	        if total > 0:  # skipped suites report 0/0 and are omitted
	            print(f" {name:20s}: {passed}/{total} ({passed/total:.0%})")
	            total_passed += passed
	            total_tests += total

	    if total_tests > 0:
	        print(f" {'OVERALL':20s}: {total_passed}/{total_tests} ({total_passed/total_tests:.0%})")
	    print("=" * 60)


	# Run the evaluation suite only when this file is executed as a script,
	# not when it is imported.
	if __name__ == "__main__":
	    main()