"""Evaluate a RAG pipeline's answers with the ragas Faithfulness metric.

The script builds a question/answer/context dataset once by querying the RAG
pipeline, then scores that same dataset several times with an evaluator LLM
and reports the per-run means, their average and their standard deviation.
"""

import os
import statistics

from datasets import Dataset
from dotenv import load_dotenv
from langchain_mistralai import ChatMistralAI
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import Faithfulness

from rag_func import prepare_RAG, retrieve_RAG, generate_RAG

load_dotenv()

# ==========================
# Utility functions
# ==========================

# Double quotes become single quotes and newlines are dropped, in one pass.
_TEXT_TRANSLATION = str.maketrans({'"': "'", "\n": None})


def process_text(str_to_process):
    """Return ``str(str_to_process)`` with '"' replaced by "'" and newlines removed."""
    return str(str_to_process).translate(_TEXT_TRANSLATION)


def process_text_list(list_to_process):
    """Return a list with one cleaned string per chunk.

    A dict chunk that has a "text" key contributes its "text" value; any
    other chunk is stringified as a whole. Each value goes through
    ``process_text``.
    """
    return [
        process_text(chunk["text"] if isinstance(chunk, dict) and "text" in chunk else chunk)
        for chunk in list_to_process
    ]


# ==========================
# Setup RAG
# ==========================
# NOTE: this section runs at import time (it prompts on stdin and calls
# prepare_RAG); the resulting `index`, `pc` and `llm` are module globals used
# by the example at the bottom of the file.

user_input = input("Enter a subfolder inside 'context' (press Enter for full 'context'): ").strip()
user_dir = "context" if not user_input else os.path.join("context", user_input)
print(f"[Info] Using context directory: {user_dir}")

pinecone_API = os.getenv("PINECONE_API")
index_name = os.getenv("INDEX_NAME")
llm_model = os.getenv("MODELNAME")

index, pc, llm = prepare_RAG(pinecone_API, index_name, llm_model=llm_model, dir_name=user_dir)


# ==========================
# Generate dataset for evaluation
# ==========================

def generate_dataset(prompt_messages, llm, pc, index, prompt_refs=None):
    """Run every question through the RAG pipeline and collect the results.

    Args:
        prompt_messages: Questions to ask.
        llm: Passed through to ``generate_RAG``.
        pc: Passed through to ``retrieve_RAG``.
        index: Passed through to ``retrieve_RAG``.
        prompt_refs: Optional ground-truth answers, positionally aligned with
            ``prompt_messages``. Questions without a matching reference get
            an empty-string placeholder.

    Returns:
        A dict of equal-length lists under the keys "question", "answer",
        "contexts" and "ground_truth".
    """
    refs = prompt_refs or []
    total = len(prompt_messages)
    if refs and len(refs) < total:
        print(
            f"[Warning] Only {len(refs)} ground truths for {total} questions; "
            "missing ones use an empty placeholder."
        )

    dataset = {"question": [], "answer": [], "contexts": [], "ground_truth": []}

    for i, msg in enumerate(prompt_messages):
        print(f"[Debug] Generating answer for question {i+1}/{total}: {msg}")

        retrieved_chunks = retrieve_RAG(msg, pc, index, top_k=15)
        print(f"[Debug] Retrieved chunks for question {i+1}: {retrieved_chunks}")

        answer = generate_RAG(msg, llm, retrieved_chunks)
        print(f"[Debug] Generated answer for question {i+1}: {answer}")

        dataset["question"].append(msg)
        # assuming answer.content holds the text
        dataset["answer"].append(process_text(answer.content))

        processed_contexts = process_text_list(retrieved_chunks)
        dataset["contexts"].append(processed_contexts)
        print(f"[Debug] Processed contexts for question {i+1}: {processed_contexts}")

        if i < len(refs):
            dataset["ground_truth"].append(refs[i])
            print(f"[Debug] Ground truth for question {i+1}: {refs[i]}")
        else:
            dataset["ground_truth"].append("")  # placeholder

    return dataset


# ==========================
# Evaluate RAG answers multiple times, with debug
# ==========================

def _empty_summary():
    """Return the summary used when no faithfulness score could be collected."""
    return {
        "faithfulness_runs": [],
        "faithfulness_avg": None,
        "faithfulness_std": None,
        "faithfulness_raw_per_run": [],
    }


def evaluate_RAG_system_multiple(prompt_messages, llm, pc, index, prompt_refs=None, num_runs=2):
    """Generate answers once, then score their faithfulness ``num_runs`` times.

    Args:
        prompt_messages: Questions to ask the RAG pipeline.
        llm: Generator LLM, forwarded to ``generate_dataset``.
        pc: Forwarded to ``generate_dataset``.
        index: Forwarded to ``generate_dataset``.
        prompt_refs: Optional ground-truth answers aligned with the questions.
        num_runs: How many times to run the evaluation on the same dataset.

    Returns:
        A dict with "faithfulness_runs" (mean score per successful run),
        "faithfulness_avg", "faithfulness_std" (population standard deviation
        over runs) and "faithfulness_raw_per_run". When no score could be
        collected the lists are empty and the average/std are ``None``.
    """
    if prompt_refs is None:
        prompt_refs = []

    # Step 1: Generate dataset once
    print("[Info] Generating dataset once (answers will NOT be regenerated across runs)...")
    dataset_dict = generate_dataset(prompt_messages, llm, pc, index, prompt_refs)
    if not dataset_dict["question"]:
        # Nothing to score; also avoids indexing example 0 of empty lists.
        print("[Error] No questions supplied; nothing to evaluate.")
        return _empty_summary()

    dataset = Dataset.from_dict(dataset_dict)
    print(f"[Debug] Number of examples: {len(dataset)}")
    print("[Debug] Sample example 0:")
    for k in dataset_dict:
        print(f" {k}: {dataset_dict[k][0]}")

    # Step 2: Evaluator LLM
    evaluator_llm = LangchainLLMWrapper(ChatMistralAI(model="mistral-large-latest"))

    # Step 3: Metric only
    metrics = [Faithfulness()]

    # Step 4: Run evaluations
    per_run_scores = []  # each item will be float mean faithfulness for that run
    all_raw = []  # raw lists per run

    for run in range(num_runs):
        print(f"[Info] ===== Evaluation run {run+1}/{num_runs} =====")
        result = evaluate(dataset=dataset, metrics=metrics, llm=evaluator_llm)
        print(f"[Debug] Type of result: {type(result)}")
        print(f"[Debug] Content of result: {result}")

        try:
            raw = result["faithfulness"]
        except KeyError:
            # The result object may not be dict-like; don't let the warning
            # itself raise while reporting the missing key.
            available = list(result.keys()) if hasattr(result, "keys") else []
            print(f"[Warning] Run {run+1} missing 'faithfulness' key. Result keys: {available}")
            continue
        except Exception as e:
            print(f"[Error] Unexpected error accessing faithfulness in run {run+1}: {e}")
            continue
        print(f"[Debug] Raw faithfulness list for run {run+1}: {raw}")

        if isinstance(raw, (list, tuple)):
            if not raw:
                print(f"[Warning] Empty faithfulness list in run {run+1}; skipping")
                continue
            # average over examples
            # NOTE(review): a NaN entry would propagate into the mean — confirm
            # whether the metric can emit NaN for failed rows.
            run_mean = sum(raw) / len(raw)
            print(f"[Debug] Mean faithfulness for run {run+1}: {run_mean:.6f}")
            per_run_scores.append(run_mean)
            all_raw.append(raw)
        elif isinstance(raw, (int, float)):
            print(f"[Debug] Faithfulness is a single float for run {run+1}: {raw:.6f}")
            per_run_scores.append(float(raw))
            all_raw.append([raw])
        else:
            print(f"[Warning] Unexpected type for raw faithfulness: {type(raw)} in run {run+1}; skipping")
            continue

    if not per_run_scores:
        print("[Error] No valid run faithfulness scores collected. Cannot compute summary.")
        return _empty_summary()

    # Step 5: Aggregate over runs
    avg_f = sum(per_run_scores) / len(per_run_scores)
    std_f = statistics.pstdev(per_run_scores) if len(per_run_scores) > 1 else 0.0

    # Step 6: Print summary
    print("[Info] ===== Summary over runs =====")
    for idx, s in enumerate(per_run_scores, start=1):
        print(f" Run {idx} mean faithfulness: {s:.6f}")
    print(f"Average faithfulness over {len(per_run_scores)} runs: {avg_f:.6f}")
    print(f"StdDev faithfulness over runs: {std_f:.6f}")

    return {
        "faithfulness_runs": per_run_scores,
        "faithfulness_avg": avg_f,
        "faithfulness_std": std_f,
        "faithfulness_raw_per_run": all_raw,
    }


# ==========================
# Example usage
# ==========================

if __name__ == "__main__":
    prompt_messages = [
        "What is the primary goal of the fact-checking examiner proposed in the paper?",
        "How are fake reports generated for training the examiner?",
        "What metric is used to evaluate report quality improvement, and how is it computed?",
        "What accuracy and AUC did the examiner achieve on the test set?",
        "Why is SentenceBERT used instead of BLEU or ROUGE for report comparison?",
    ]
    prompt_refs = []  # or real ground truths if available

    results = evaluate_RAG_system_multiple(prompt_messages, llm, pc, index, prompt_refs, num_runs=2)
    print(f"[Final Result] {results}")