Spaces:

Sanjam19
/

GraphRAG

Sleeping

App Files Files Community

GraphRAG / backend /evaluate.py

Sanjam19

clean initial commit

a4ab72e about 1 month ago

Raw

History Blame Contribute Delete

6.67 kB

	import sys
	import time
	import json
	import os

	sys.path.insert(0, ".")

	from backend.rag import query_rag
	from backend.llm_only import query_llm_only
	from backend.graphrag import (
	query_graphrag,
	build_graph
	)

	from bert_score import (
	score as bert_score
	)

	from groq import Groq
	from dotenv import load_dotenv

	# Read variables from a local .env file (if any) into the process environment
	# before the API key is looked up below.
	load_dotenv()

	# Shared Groq client used by llm_judge(); the key comes from GROQ_API_KEY.
	judge = Groq(api_key=os.getenv("GROQ_API_KEY"))


	def llm_judge(
	    question,
	    ground_truth,
	    answer
	) -> bool:
	    """Ask the judge model whether ``answer`` agrees with ``ground_truth``.

	    A single-turn prompt is sent through the module-level ``judge`` client
	    and the model is told to reply only YES or NO.

	    Args:
	        question: The question text that was asked.
	        ground_truth: The reference answer.
	        answer: The candidate answer being graded.

	    Returns:
	        True if the reply contains "YES" (case-insensitive), otherwise
	        False. A missing/empty reply is treated as False.
	    """
	    prompt = f"""
	You are a lenient evaluation judge for financial questions.

	Question:
	{question}

	Ground Truth:
	{ground_truth}

	Answer:
	{answer}

	Does the Answer convey the same meaning or key facts as the Ground Truth, even if worded differently?

	Reply only YES or NO.
	"""

	    response = judge.chat.completions.create(
	        model="llama-3.1-8b-instant",
	        messages=[{"role": "user", "content": prompt}],
	        # Only a one-word verdict is needed.
	        max_tokens=5,
	        # Minimise sampling randomness so the same inputs give the same
	        # verdict from run to run.
	        temperature=0,
	    )

	    # The reply content may be None; fall back to "" so that case is
	    # graded as a failure instead of raising AttributeError.
	    verdict = response.choices[0].message.content or ""
	    return "YES" in verdict.upper()


	def _timed_call(fn, *args):
	    """Call ``fn(*args)``; return ``(result, elapsed seconds rounded to 2 dp)``."""
	    # perf_counter is monotonic, so the interval cannot be skewed by
	    # wall-clock adjustments.
	    start = time.perf_counter()
	    result = fn(*args)
	    return result, round(time.perf_counter() - start, 2)


	def _token_reduction_pct(rag_tokens, grag_tokens):
	    """Return the % of RAG tokens saved by GraphRAG (1 dp), or 0.0 if RAG used none."""
	    if not rag_tokens:
	        # Avoid ZeroDivisionError when the RAG pipeline reports zero tokens.
	        return 0.0
	    return round((rag_tokens - grag_tokens) / rag_tokens * 100, 1)


	def _mean_bert_f1(candidates, references):
	    """Return the mean BERTScore F1 of ``candidates`` vs ``references`` (4 dp)."""
	    _, _, f1 = bert_score(candidates, references, lang="en", verbose=False)
	    return round(f1.mean().item(), 4)


	def evaluate(questions):
	    """Run LLM-only, RAG and GraphRAG on each question and report metrics.

	    For every question the three pipelines are queried and timed, the
	    GraphRAG answer is graded by ``llm_judge``, and the token reduction of
	    GraphRAG relative to RAG is computed. After the loop, mean BERTScore F1
	    is computed for the RAG and GraphRAG answers. Progress and final
	    metrics are printed, and everything is written to ``data/results.json``.

	    Args:
	        questions: Iterable of dicts with ``"question"`` and ``"answer"``
	            keys (``"answer"`` is the ground truth).

	    Returns:
	        A dict with ``"results"`` (one entry per question) and ``"summary"``
	        (``graphrag_bertscore``, ``rag_bertscore``, ``judge_pass_rate``,
	        ``avg_token_reduction``).

	    Raises:
	        ValueError: If ``questions`` is empty.

	    Note:
	        Each pipeline result is read as a dict with ``"answer"`` and
	        ``"total_tokens"`` keys and is assumed to be JSON-serialisable.
	    """
	    questions = list(questions)
	    if not questions:
	        # Fail before the graph is built; the averages below would
	        # otherwise divide by zero.
	        raise ValueError("evaluate() requires at least one question")

	    G = build_graph()

	    results = []
	    rag_answers = []
	    grag_answers = []
	    refs = []

	    for q in questions:
	        question = q["question"]
	        ground_truth = q["answer"]

	        llm, llm_lat = _timed_call(query_llm_only, question)
	        rag, rag_lat = _timed_call(query_rag, question)
	        grag, grag_lat = _timed_call(query_graphrag, question, G)

	        # Only the GraphRAG answer is graded by the LLM judge.
	        llm_judge_pass = llm_judge(question, ground_truth, grag["answer"])

	        token_reduction = _token_reduction_pct(
	            rag["total_tokens"],
	            grag["total_tokens"],
	        )

	        results.append({
	            "question": question,
	            "ground_truth": ground_truth,
	            "llm_only": {**llm, "latency": llm_lat},
	            "rag": {**rag, "latency": rag_lat},
	            "graphrag": {**grag, "latency": grag_lat},
	            "token_reduction_vs_rag": token_reduction,
	            "llm_judge": "PASS" if llm_judge_pass else "FAIL",
	        })

	        rag_answers.append(rag["answer"])
	        grag_answers.append(grag["answer"])
	        refs.append(ground_truth)

	        print(f"Q: {question[:50]}")
	        # Plain "|" separators: "\|" is an invalid escape sequence and
	        # would also print a stray backslash.
	        print(
	            f" "
	            f"LLM:{llm['total_tokens']}t {llm_lat}s | "
	            f"RAG:{rag['total_tokens']}t {rag_lat}s | "
	            f"GraphRAG:{grag['total_tokens']}t {grag_lat}s"
	        )
	        print(
	            f" "
	            f"Token reduction: {token_reduction}% "
	            f"| Judge: {results[-1]['llm_judge']}"
	        )

	    print("\nComputing BERTScore...")

	    avg_grag_bert = _mean_bert_f1(grag_answers, refs)
	    avg_rag_bert = _mean_bert_f1(rag_answers, refs)

	    passed = sum(1 for r in results if r["llm_judge"] == "PASS")
	    judge_pass_rate = round(passed / len(results) * 100, 1)

	    avg_token_reduction = round(
	        sum(r["token_reduction_vs_rag"] for r in results) / len(results),
	        1,
	    )

	    print("\n=== FINAL METRICS ===")
	    print(f"GraphRAG BERTScore F1: {avg_grag_bert}")
	    print(f"RAG BERTScore F1: {avg_rag_bert}")
	    print(f"LLM-as-Judge pass rate: {judge_pass_rate}%")
	    print(f"Avg token reduction: {avg_token_reduction}%")

	    summary = {
	        "results": results,
	        "summary": {
	            "graphrag_bertscore": avg_grag_bert,
	            "rag_bertscore": avg_rag_bert,
	            "judge_pass_rate": judge_pass_rate,
	            "avg_token_reduction": avg_token_reduction,
	        },
	    }

	    # Make sure the output directory exists so a fresh checkout does not
	    # lose a full evaluation run at the final write.
	    os.makedirs("data", exist_ok=True)
	    with open("data/results.json", "w", encoding="utf-8") as f:
	        json.dump(summary, f, indent=2, ensure_ascii=False)

	    print("Saved to data/results.json")

	    return summary


	if __name__ == "__main__":
	    # Script entry point: evaluate the first 10 distinct FinanceBench
	    # questions found in data/metadata.json.

	    # Use a context manager so the metadata file handle is closed promptly.
	    with open("data/metadata.json", encoding="utf-8") as f:
	        metadata = json.load(f)

	    # Keep the first entry seen for each distinct question text; dicts
	    # preserve insertion order, so the original ordering is retained.
	    unique = {}
	    for m in metadata:
	        if m.get("source") == "financebench":
	            unique.setdefault(m["question"], m)

	    questions = [
	        {"question": m["question"], "answer": m["answer"]}
	        for m in list(unique.values())[:10]
	    ]

	    if not questions:
	        # Nothing to evaluate; stop with a clear message instead of failing
	        # deep inside the metric computation.
	        raise SystemExit(
	            "No FinanceBench questions found in data/metadata.json"
	        )

	    evaluate(questions)