"""Benchmark LLM-only, RAG and GraphRAG answers on a small question set.

For every question the three pipelines are queried and timed, the GraphRAG
answer is graded by an LLM judge, and the token saving of GraphRAG relative
to RAG is computed.  Aggregate BERTScore F1 (GraphRAG and RAG), judge pass
rate and average token reduction are printed and written to
``data/results.json``.
"""
import sys
import time
import json
import os

# Must run before the ``backend`` imports below so that they resolve when
# the script is launched from the project root.
sys.path.insert(0, ".")

from backend.rag import query_rag
from backend.llm_only import query_llm_only
from backend.graphrag import query_graphrag, build_graph
from bert_score import score as bert_score
from groq import Groq
from dotenv import load_dotenv

load_dotenv()

judge = Groq(api_key=os.getenv("GROQ_API_KEY"))

_JUDGE_MODEL = "llama-3.1-8b-instant"
_METADATA_PATH = "data/metadata.json"
_RESULTS_PATH = "data/results.json"


def llm_judge(question: str, ground_truth: str, answer: str) -> bool:
    """Ask the judge model whether *answer* matches *ground_truth*.

    Args:
        question: The question that was asked.
        ground_truth: The reference answer.
        answer: The candidate answer to grade.

    Returns:
        True when the judge's reply contains "YES" (case-insensitive),
        False otherwise, including when the reply carries no text.
    """
    # The prompt wording drives the judge's verdicts; keep it verbatim.
    prompt = (
        " You are a lenient evaluation judge for financial questions."
        f" Question: {question}"
        f" Ground Truth: {ground_truth}"
        f" Answer: {answer}"
        " Does the Answer convey the same meaning or key facts as the"
        " Ground Truth, even if worded differently?"
        " Reply only YES or NO. "
    )
    response = judge.chat.completions.create(
        model=_JUDGE_MODEL,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=5,
    )
    # Guard against a reply without text content instead of crashing on
    # ``None.upper()``; such a reply simply counts as "not YES".
    verdict = response.choices[0].message.content or ""
    return "YES" in verdict.upper()


def _timed(fn, *args):
    """Call ``fn(*args)``; return ``(result, elapsed seconds to 2 dp)``."""
    # perf_counter is monotonic, so the interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    result = fn(*args)
    return result, round(time.perf_counter() - start, 2)


def _token_reduction_pct(rag_tokens, grag_tokens):
    """Return GraphRAG's token saving relative to RAG, in percent (1 dp)."""
    if not rag_tokens:
        # No RAG baseline to compare against; report no reduction rather
        # than dividing by zero.
        return 0.0
    return round((rag_tokens - grag_tokens) / rag_tokens * 100, 1)


def _mean_bert_f1(answers, refs):
    """Return the mean of the third BERTScore output (F1), rounded to 4 dp."""
    _, _, f1 = bert_score(answers, refs, lang="en", verbose=False)
    return round(f1.mean().item(), 4)


def evaluate(questions) -> dict:
    """Run all three pipelines over *questions* and report the metrics.

    Args:
        questions: Iterable of dicts, each with a ``"question"`` and an
            ``"answer"`` (ground truth) entry.

    Returns:
        A dict with ``"results"`` (one entry per question) and ``"summary"``
        (aggregate metrics).  The same dict is written to
        ``data/results.json``.

    Raises:
        ValueError: If *questions* is empty; the averages are undefined.
    """
    questions = list(questions)
    if not questions:
        # Fail before the graph build and any model call.
        raise ValueError("questions must contain at least one item")

    G = build_graph()

    results = []
    rag_answers = []
    grag_answers = []
    refs = []

    for q in questions:
        question = q["question"]
        ground_truth = q["answer"]

        llm, llm_lat = _timed(query_llm_only, question)
        rag, rag_lat = _timed(query_rag, question)
        grag, grag_lat = _timed(query_graphrag, question, G)

        # Only the GraphRAG answer is graded by the judge.
        llm_judge_pass = llm_judge(question, ground_truth, grag["answer"])

        token_reduction = _token_reduction_pct(
            rag["total_tokens"], grag["total_tokens"]
        )

        results.append({
            "question": question,
            "ground_truth": ground_truth,
            "llm_only": {**llm, "latency": llm_lat},
            "rag": {**rag, "latency": rag_lat},
            "graphrag": {**grag, "latency": grag_lat},
            "token_reduction_vs_rag": token_reduction,
            "llm_judge": "PASS" if llm_judge_pass else "FAIL",
        })

        rag_answers.append(rag["answer"])
        grag_answers.append(grag["answer"])
        refs.append(ground_truth)

        print(f"Q: {question[:50]}")
        print(
            f" LLM:{llm['total_tokens']}t {llm_lat}s | "
            f"RAG:{rag['total_tokens']}t {rag_lat}s | "
            f"GraphRAG:{grag['total_tokens']}t {grag_lat}s"
        )
        print(
            f" Token reduction: {token_reduction}% "
            f"| Judge: {results[-1]['llm_judge']}"
        )

    print("\nComputing BERTScore...")
    avg_grag_bert = _mean_bert_f1(grag_answers, refs)
    avg_rag_bert = _mean_bert_f1(rag_answers, refs)

    passed = sum(1 for r in results if r["llm_judge"] == "PASS")
    judge_pass_rate = round(passed / len(results) * 100, 1)

    avg_token_reduction = round(
        sum(r["token_reduction_vs_rag"] for r in results) / len(results),
        1,
    )

    print("\n=== FINAL METRICS ===")
    print(f"GraphRAG BERTScore F1: {avg_grag_bert}")
    print(f"RAG BERTScore F1: {avg_rag_bert}")
    print(f"LLM-as-Judge pass rate: {judge_pass_rate}%")
    print(f"Avg token reduction: {avg_token_reduction}%")

    summary = {
        "results": results,
        "summary": {
            "graphrag_bertscore": avg_grag_bert,
            "rag_bertscore": avg_rag_bert,
            "judge_pass_rate": judge_pass_rate,
            "avg_token_reduction": avg_token_reduction,
        },
    }

    # Callers other than the script below may run from a tree where the
    # output directory has not been created yet.
    os.makedirs(os.path.dirname(_RESULTS_PATH), exist_ok=True)
    with open(_RESULTS_PATH, "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)

    print(f"Saved to {_RESULTS_PATH}")
    return summary


def _load_questions(path: str, limit: int = 10) -> list:
    """Return up to *limit* unique financebench question/answer pairs.

    Entries are read from the JSON list at *path*, filtered to those whose
    ``"source"`` is ``"financebench"``, and de-duplicated by question text
    while keeping first-seen order.
    """
    with open(path, encoding="utf-8") as f:
        metadata = json.load(f)

    seen = set()
    unique = []
    for m in metadata:
        if m.get("source") != "financebench":
            continue
        if m["question"] not in seen:
            seen.add(m["question"])
            unique.append(m)

    return [
        {"question": m["question"], "answer": m["answer"]}
        for m in unique[:limit]
    ]


def main() -> None:
    """Evaluate the first ten unique financebench questions."""
    questions = _load_questions(_METADATA_PATH)
    if not questions:
        sys.exit(f"No financebench questions found in {_METADATA_PATH}")
    evaluate(questions)


if __name__ == "__main__":
    main()