# GraphRAG/backend/evaluate.py
# Hosting-page metadata: author Sanjam19, commit a4ab72e ("clean initial commit"), 6.67 kB.
import sys
import time
import json
import os
sys.path.insert(0, ".")
from backend.rag import query_rag
from backend.llm_only import query_llm_only
from backend.graphrag import (
query_graphrag,
build_graph
)
from bert_score import (
score as bert_score
)
from groq import Groq
from dotenv import load_dotenv
# load_dotenv() is called before the client is built, so the key lookup below
# happens after any dotenv configuration has been applied.
load_dotenv()

# Module-level Groq client used by llm_judge(); the API key is read from the
# GROQ_API_KEY environment variable (None when it is unset).
judge = Groq(api_key=os.environ.get("GROQ_API_KEY"))
def llm_judge(
    question,
    ground_truth,
    answer
) -> bool:
    """Ask the judge model whether ``answer`` matches ``ground_truth``.

    The three values are interpolated into a fixed prompt that is sent as a
    single user message through the module-level ``judge`` client. The model
    is instructed to reply with only YES or NO.

    Args:
        question: The question that was asked.
        ground_truth: The reference answer.
        answer: The candidate answer to grade.

    Returns:
        True when the first YES/NO word in the reply is YES. False when it is
        NO, or when the reply is empty, missing, or contains neither word.
    """
    # Local import: ``re`` is not among this module's top-level imports.
    import re

    prompt = f"""
You are a lenient evaluation judge for financial questions.
Question:
{question}
Ground Truth:
{ground_truth}
Answer:
{answer}
Does the Answer convey the same meaning or key facts as the Ground Truth, even if worded differently?
Reply only YES or NO.
"""
    response = judge.chat.completions.create(
        model="llama-3.1-8b-instant",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=5,
        # Greedy decoding keeps the verdict for a given prompt reproducible
        # across evaluation runs.
        temperature=0,
    )

    # The message content may be None; treat that as an empty (failing) reply
    # instead of crashing on .upper().
    reply = response.choices[0].message.content or ""

    # Take the first whole-word YES/NO so that replies such as "NO ... YES"
    # or words merely containing "YES" are not counted as a pass.
    verdict = re.search(r"\b(YES|NO)\b", reply.upper())
    return verdict is not None and verdict.group(1) == "YES"
def _timed(fn, *args):
    """Call ``fn(*args)`` and return ``(result, elapsed_seconds)``, elapsed rounded to 2 decimals."""
    start = time.time()
    result = fn(*args)
    return result, round(time.time() - start, 2)


def _token_reduction(rag_tokens, grag_tokens):
    """Return the percentage of tokens GraphRAG used less than RAG, rounded to 1 decimal.

    A RAG count of zero leaves no baseline to compare against, so 0.0 is
    returned instead of dividing by zero.
    """
    if not rag_tokens:
        return 0.0
    return round((rag_tokens - grag_tokens) / rag_tokens * 100, 1)


def _mean_bert_f1(candidates, references):
    """Return the mean BERTScore F1 of ``candidates`` against ``references``, rounded to 4 decimals."""
    _, _, f1 = bert_score(candidates, references, lang="en", verbose=False)
    return round(f1.mean().item(), 4)


def evaluate(questions):
    """Run every question through LLM-only, RAG and GraphRAG and report metrics.

    For each question the three pipelines are called and timed, the GraphRAG
    answer is graded by ``llm_judge``, and the token reduction of GraphRAG
    relative to RAG is computed. After the loop, the mean BERTScore F1 of the
    GraphRAG and RAG answers against the ground truths is computed. Progress
    and final metrics are printed, and everything is written to
    ``data/results.json``.

    Args:
        questions: Iterable of dicts, each with a ``"question"`` and an
            ``"answer"`` (ground truth) key.

    Returns:
        A dict with ``"results"`` (one entry per question) and ``"summary"``
        (``graphrag_bertscore``, ``rag_bertscore``, ``judge_pass_rate``,
        ``avg_token_reduction``).

    Raises:
        ValueError: If ``questions`` is empty; the averages would otherwise
            divide by zero after the graph had already been built.
    """
    # Materialise first so one-shot iterables can be checked for emptiness.
    questions = list(questions)
    if not questions:
        raise ValueError("evaluate() requires at least one question")

    graph = build_graph()
    results = []
    rag_answers = []
    grag_answers = []
    refs = []

    for q in questions:
        question = q["question"]
        ground_truth = q["answer"]

        llm, llm_lat = _timed(query_llm_only, question)
        rag, rag_lat = _timed(query_rag, question)
        grag, grag_lat = _timed(query_graphrag, question, graph)

        # Only the GraphRAG answer is graded by the judge model.
        judged = llm_judge(question, ground_truth, grag["answer"])
        verdict = "PASS" if judged else "FAIL"
        token_reduction = _token_reduction(
            rag["total_tokens"], grag["total_tokens"]
        )

        results.append({
            "question": question,
            "ground_truth": ground_truth,
            "llm_only": {**llm, "latency": llm_lat},
            "rag": {**rag, "latency": rag_lat},
            "graphrag": {**grag, "latency": grag_lat},
            "token_reduction_vs_rag": token_reduction,
            "llm_judge": verdict,
        })
        rag_answers.append(rag["answer"])
        grag_answers.append(grag["answer"])
        refs.append(ground_truth)

        print(f"Q: {question[:50]}")
        print(
            f" LLM:{llm['total_tokens']}t {llm_lat}s | "
            f"RAG:{rag['total_tokens']}t {rag_lat}s | "
            f"GraphRAG:{grag['total_tokens']}t {grag_lat}s"
        )
        print(f" Token reduction: {token_reduction}% | Judge: {verdict}")

    print("\nComputing BERTScore...")
    avg_grag_bert = _mean_bert_f1(grag_answers, refs)
    avg_rag_bert = _mean_bert_f1(rag_answers, refs)

    passed = sum(1 for r in results if r["llm_judge"] == "PASS")
    judge_pass_rate = round(passed / len(results) * 100, 1)
    avg_token_reduction = round(
        sum(r["token_reduction_vs_rag"] for r in results) / len(results),
        1,
    )

    print("\n=== FINAL METRICS ===")
    print(f"GraphRAG BERTScore F1: {avg_grag_bert}")
    print(f"RAG BERTScore F1: {avg_rag_bert}")
    print(f"LLM-as-Judge pass rate: {judge_pass_rate}%")
    print(f"Avg token reduction: {avg_token_reduction}%")

    summary = {
        "results": results,
        "summary": {
            "graphrag_bertscore": avg_grag_bert,
            "rag_bertscore": avg_rag_bert,
            "judge_pass_rate": judge_pass_rate,
            "avg_token_reduction": avg_token_reduction,
        },
    }

    out_path = "data/results.json"
    # Create the output directory if needed so the run is not lost at the
    # final write.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)
    print(f"Saved to {out_path}")

    return summary
if __name__ == "__main__":
    # Script entry point: evaluate the first 10 distinct FinanceBench
    # questions found in data/metadata.json.

    # Use a context manager so the metadata file handle is closed as soon as
    # it has been parsed.
    with open("data/metadata.json", encoding="utf-8") as f:
        metadata = json.load(f)

    # Keep the first entry seen for each distinct question among the entries
    # whose "source" is "financebench"; dicts preserve insertion order, so
    # the original ordering is retained.
    unique = {}
    for m in metadata:
        if m.get("source") == "financebench":
            unique.setdefault(m["question"], m)

    questions = [
        {"question": m["question"], "answer": m["answer"]}
        for m in list(unique.values())[:10]
    ]
    evaluate(questions)