"""Post-hoc audit of logged RAG answers using an LLM judge.

Reads JSON-lines records from ``LOG_FILE``, asks a Gemini model to score
each record's final answer for faithfulness (against the logged context)
and relevancy (against the query), then prints a per-record table and the
averages.
"""

import json
import math
import os
import sys

import google.generativeai as genai
from dotenv import load_dotenv

# NOTE(review): not referenced anywhere in this file; kept because other
# code may rely on it being importable from here -- confirm before removing.
from llm_utils import generate_with_retry  # noqa: F401

load_dotenv()

LOG_FILE = "rag_eval_logs.jsonl"
MODEL_NAME = "gemini-2.5-flash"

# Only this many leading characters of the joined context go to the judge.
MAX_CONTEXT_CHARS = 3000
# Score returned when the judge call fails or its reply is not a number.
DEFAULT_SCORE = 0.5

API_KEY = os.getenv("GEMINI_API_KEY")

if not API_KEY:
    print("❌ GEMINI_API_KEY not found in env.")
    # sys.exit instead of the ``exit`` helper, which is injected by ``site``
    # and is not guaranteed to exist (e.g. under ``python -S``).
    sys.exit(1)

genai.configure(api_key=API_KEY)


def _parse_score(raw_text):
    """Convert the judge's reply into a score clamped to [0.0, 1.0].

    Raises:
        ValueError: if the reply is not a float literal, or is NaN.
    """
    score = float(raw_text.strip())
    # min()/max() do not order NaN, so without this check a "nan" reply
    # would come out of the clamp below as a perfect 1.0.
    if math.isnan(score):
        raise ValueError("judge returned NaN")
    return max(0.0, min(1.0, score))


def _ask_judge(prompt):
    """Send *prompt* to the judge model and return its score.

    Returns ``DEFAULT_SCORE`` when the request fails or the reply cannot be
    parsed, after reporting the reason on stderr.
    """
    model = genai.GenerativeModel(MODEL_NAME)
    try:
        resp = model.generate_content(prompt)
        return _parse_score(resp.text)
    except Exception as exc:
        # Best-effort by design: one failed judgement must not abort the
        # audit, but it should not be invisible either.
        print(
            f"warning: judge call failed ({exc!r}); "
            f"using default score {DEFAULT_SCORE}",
            file=sys.stderr,
        )
        return DEFAULT_SCORE


def calculate_faithfulness(answer, contexts):
    """Score 0.0 to 1.0: is the answer derived *only* from the context?

    Args:
        answer: The generated answer text.
        contexts: Sequence of context strings; they are joined with
            newlines and truncated to ``MAX_CONTEXT_CHARS`` characters.

    Returns:
        0.0 when ``contexts`` is empty, otherwise the judge's score clamped
        to [0.0, 1.0], or ``DEFAULT_SCORE`` if the judge call fails.
    """
    if not contexts:
        return 0.0

    context_text = "\n".join(contexts)
    prompt = (
        "You are an AI Judge. Rate the 'Faithfulness' of the Answer to the "
        "Context on a scale of 0.0 to 1.0.\n"
        "1.0 = Answer is strictly derived from Context.\n"
        "0.0 = Answer contains hallucinations or info not in Context.\n"
        "\n"
        f"Context: {context_text[:MAX_CONTEXT_CHARS]}\n"
        "\n"
        f"Answer: {answer}\n"
        "\n"
        "Return ONLY a single float number (e.g. 0.9).\n"
    )
    return _ask_judge(prompt)


def calculate_relevancy(query, answer):
    """Score 0.0 to 1.0: does the answer directly address the query?

    Args:
        query: The user's question.
        answer: The generated answer text.

    Returns:
        The judge's score clamped to [0.0, 1.0], or ``DEFAULT_SCORE`` if
        the judge call fails.
    """
    prompt = (
        "You are an AI Judge. Rate the 'Relevancy' of the Answer to the "
        "Query on a scale of 0.0 to 1.0.\n"
        "1.0 = Answer directly addresses the query.\n"
        "0.0 = Answer is unrelated or ignores the user.\n"
        "\n"
        f"Query: {query}\n"
        "\n"
        f"Answer: {answer}\n"
        "\n"
        "Return ONLY a single float number (e.g. 0.9).\n"
    )
    return _ask_judge(prompt)


def run_audit():
    """Score every complete record in ``LOG_FILE`` and print a summary.

    Each line of the file is parsed as one JSON object. Records without a
    non-empty ``final_answer`` are skipped. A line that cannot be parsed or
    scored is skipped as well, reported on stderr, and counted.
    """
    if not os.path.exists(LOG_FILE):
        print(f"No log file found at {LOG_FILE}")
        return

    print(f"📊 Running Post-Hoc Audit on {LOG_FILE}...\n")
    print(f"{'Query':<30} | {'Faithful':<10} | {'Relevancy':<10}")
    print("-" * 60)

    total_f = 0.0
    total_r = 0.0
    count = 0
    skipped = 0

    with open(LOG_FILE, "r", encoding="utf-8") as f:
        for line_no, line in enumerate(f, start=1):
            # Blank lines are not records; skip them without a warning.
            if not line.strip():
                continue
            try:
                data = json.loads(line)

                # Skip legacy logs without final answer
                if "final_answer" not in data or not data["final_answer"]:
                    continue

                q = data["query"]
                a = data["final_answer"]
                c = data.get("context_list", [])

                f_score = calculate_faithfulness(a, c)
                r_score = calculate_relevancy(q, a)

                # Same column widths as the header so the table lines up.
                print(f"{q[:30]:<30} | {f_score:<10.2f} | {r_score:<10.2f}")

                total_f += f_score
                total_r += r_score
                count += 1
            except Exception as exc:
                # Per-record boundary: one malformed line must not stop the
                # audit, but it is reported instead of silently dropped.
                skipped += 1
                print(
                    f"warning: skipping line {line_no} of {LOG_FILE}: {exc!r}",
                    file=sys.stderr,
                )

    if skipped:
        print(f"warning: {skipped} line(s) skipped due to errors", file=sys.stderr)

    if count > 0:
        print("-" * 60)
        print("\n✅ Audit Complete.")
        print(f"Average Faithfulness: {total_f/count:.2f}")
        print(f"Average Relevancy: {total_r/count:.2f}")
    else:
        print("\n⚠️ No complete records found to audit. Ask some questions first!")


if __name__ == "__main__":
    run_audit()