# RAG_Regulatorik / rag_evaluate.py
# smash1pump's picture
# Upload folder using huggingface_hub
# eb93f4c verified
import math
import os
import statistics

from datasets import Dataset
from dotenv import load_dotenv
from langchain_mistralai import ChatMistralAI
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import Faithfulness

from rag_func import prepare_RAG, retrieve_RAG, generate_RAG
# Load variables from a local .env file (if present) so the os.getenv()
# lookups further down can see them.
load_dotenv()
# ==========================
# Utility functions
# ==========================
# Translation table used by process_text: every double quote (code point 34)
# becomes a single quote (code point 39) and every newline is dropped.
_TEXT_CLEANUP_TABLE = str.maketrans({'"': "'", "\n": None})


def process_text(str_to_process):
    """Return ``str_to_process`` as a single-line string without double quotes.

    The argument is converted with ``str()`` first, so non-string values are
    accepted. Double quotes are replaced by single quotes and newline
    characters are removed; every other character is kept unchanged.

    Args:
        str_to_process: Any object; its ``str()`` form is cleaned.

    Returns:
        The cleaned string.
    """
    # One C-level pass instead of building the result character by character.
    return str(str_to_process).translate(_TEXT_CLEANUP_TABLE)
def process_text_list(list_to_process):
    """Clean every chunk in ``list_to_process`` with ``process_text``.

    A chunk that is a dict containing a ``"text"`` key contributes the value
    stored under that key; any other chunk (a string, a dict without
    ``"text"``, or any other object) is cleaned as a whole. ``process_text``
    stringifies its argument itself, so no separate ``str()`` call is needed
    for non-string chunks.

    Args:
        list_to_process: Iterable of chunks.

    Returns:
        A new list with one cleaned string per input chunk, in input order.
    """
    return [
        process_text(
            chunk["text"] if isinstance(chunk, dict) and "text" in chunk else chunk
        )
        for chunk in list_to_process
    ]
# ==========================
# Setup RAG
# ==========================
# Ask which part of the 'context' tree to use; an empty answer selects the
# whole 'context' directory.
user_input = input("Enter a subfolder inside 'context' (press Enter for full 'context'): ").strip()
user_dir = os.path.join("context", user_input) if user_input else "context"
print(f"[Info] Using context directory: {user_dir}")

# Connection and model settings are read from environment variables.
pinecone_API, index_name, llm_model = (
    os.getenv(variable) for variable in ("PINECONE_API", "INDEX_NAME", "MODELNAME")
)

# The three handles returned here are reused by the evaluation code below.
index, pc, llm = prepare_RAG(
    pinecone_API,
    index_name,
    llm_model=llm_model,
    dir_name=user_dir,
)
# ==========================
# Generate dataset for evaluation
# ==========================
def generate_dataset(prompt_messages, llm, pc, index, prompt_refs=None):
    """Run retrieval and generation for each question and collect the results.

    For every question the chunks returned by ``retrieve_RAG`` (``top_k=15``)
    are handed to ``generate_RAG``; the cleaned answer text and the cleaned
    chunks are stored column-wise.

    Args:
        prompt_messages: Sequence of questions.
        llm: Passed through to ``generate_RAG``.
        pc: Passed through to ``retrieve_RAG``.
        index: Passed through to ``retrieve_RAG``.
        prompt_refs: Optional reference answers aligned by position with
            ``prompt_messages``. ``None`` or an empty sequence stores an
            empty-string placeholder for every question.

    Returns:
        A dict with the keys ``"question"``, ``"answer"``, ``"contexts"`` and
        ``"ground_truth"``, each mapping to a list with one entry per question.

    Raises:
        ValueError: If ``prompt_refs`` is non-empty but shorter than
            ``prompt_messages``.
    """
    if prompt_refs is None:
        prompt_refs = []
    total = len(prompt_messages)

    # Fail before any retrieval/generation call is made instead of hitting an
    # out-of-range lookup halfway through the loop.
    if prompt_refs and len(prompt_refs) < total:
        raise ValueError(
            f"prompt_refs has {len(prompt_refs)} entries but "
            f"{total} prompt messages were given"
        )

    dataset = {"question": [], "answer": [], "contexts": [], "ground_truth": []}
    for i, msg in enumerate(prompt_messages):
        number = i + 1
        print(f"[Debug] Generating answer for question {number}/{total}: {msg}")
        retrieved_chunks = retrieve_RAG(msg, pc, index, top_k=15)
        print(f"[Debug] Retrieved chunks for question {number}: {retrieved_chunks}")
        answer = generate_RAG(msg, llm, retrieved_chunks)
        print(f"[Debug] Generated answer for question {number}: {answer}")

        dataset["question"].append(msg)
        # Assumes the object returned by generate_RAG exposes its text as
        # ``.content`` -- TODO confirm against rag_func.
        dataset["answer"].append(process_text(answer.content))
        processed_contexts = process_text_list(retrieved_chunks)
        dataset["contexts"].append(processed_contexts)
        print(f"[Debug] Processed contexts for question {number}: {processed_contexts}")

        if prompt_refs:
            dataset["ground_truth"].append(prompt_refs[i])
            print(f"[Debug] Ground truth for question {number}: {prompt_refs[i]}")
        else:
            dataset["ground_truth"].append("")  # placeholder
    return dataset
# ==========================
# Evaluate RAG answers multiple times, with debug
# ==========================
def _finite_mean(scores):
    """Return the mean of the finite numeric entries of ``scores``, or None.

    Entries that cannot be converted to ``float`` or that are NaN/inf are
    ignored. ``None`` is returned when no usable entry remains.
    """
    usable = []
    for value in scores:
        try:
            score = float(value)
        except (TypeError, ValueError):
            continue
        if math.isfinite(score):
            usable.append(score)
    if not usable:
        return None
    return sum(usable) / len(usable)


def evaluate_RAG_system_multiple(
    prompt_messages,
    llm,
    pc,
    index,
    prompt_refs=None,
    num_runs=2,
    evaluator_model="mistral-large-latest",
):
    """Generate answers once, then score their faithfulness ``num_runs`` times.

    The dataset is built a single time with ``generate_dataset``; only the
    ragas ``Faithfulness`` evaluation is repeated, so the spread between runs
    reflects the evaluator, not the generator.

    Args:
        prompt_messages: Sequence of questions.
        llm: Passed through to ``generate_dataset``.
        pc: Passed through to ``generate_dataset``.
        index: Passed through to ``generate_dataset``.
        prompt_refs: Optional reference answers; ``None`` means none.
        num_runs: Number of evaluation passes over the same dataset.
        evaluator_model: Model name given to ``ChatMistralAI`` for the
            evaluator LLM.

    Returns:
        A dict with the keys ``"faithfulness_runs"`` (mean score per
        successful run), ``"faithfulness_avg"``, ``"faithfulness_std"``
        (population standard deviation over runs) and
        ``"faithfulness_raw_per_run"``. When no run yields a usable score the
        two lists are empty and the average and deviation are ``None``.
    """
    if prompt_refs is None:
        prompt_refs = []
    # Same key set as the success case so callers can index without checks.
    empty_summary = {
        "faithfulness_runs": [],
        "faithfulness_avg": None,
        "faithfulness_std": None,
        "faithfulness_raw_per_run": [],
    }

    # Step 1: Generate dataset once
    print("[Info] Generating dataset once (answers will NOT be regenerated across runs)...")
    dataset_dict = generate_dataset(prompt_messages, llm, pc, index, prompt_refs)
    dataset = Dataset.from_dict(dataset_dict)
    print(f"[Debug] Number of examples: {len(dataset)}")
    if len(dataset) == 0:
        # Nothing to score; also avoids indexing element 0 of empty columns.
        print("[Error] No valid run faithfulness scores collected. Cannot compute summary.")
        return empty_summary
    print("[Debug] Sample example 0:")
    for key, column in dataset_dict.items():
        print(f" {key}: {column[0]}")

    # Step 2: Evaluator LLM
    evaluator_llm = LangchainLLMWrapper(ChatMistralAI(model=evaluator_model))

    # Step 3: Metric only
    metrics = [Faithfulness()]

    # Step 4: Run evaluations
    per_run_scores = []  # mean faithfulness of each successful run
    all_raw = []  # raw score list of each successful run
    for run in range(num_runs):
        run_number = run + 1
        print(f"[Info] ===== Evaluation run {run_number}/{num_runs} =====")
        result = evaluate(dataset=dataset, metrics=metrics, llm=evaluator_llm)
        print(f"[Debug] Type of result: {type(result)}")
        print(f"[Debug] Content of result: {result}")

        try:
            raw = result["faithfulness"]
        except KeyError:
            # The result object is not guaranteed to offer keys(); fall back
            # to printing the object itself rather than failing in the handler.
            available = list(result.keys()) if hasattr(result, "keys") else result
            print(f"[Warning] Run {run_number} missing 'faithfulness' key. Result keys: {available}")
            continue
        except Exception as e:
            # Deliberately broad: one failing run must not abort the others.
            print(f"[Error] Unexpected error accessing faithfulness in run {run_number}: {e}")
            continue
        print(f"[Debug] Raw faithfulness list for run {run_number}: {raw}")

        if isinstance(raw, (list, tuple)):
            raw_scores = raw
        elif isinstance(raw, (int, float)):
            raw_scores = [raw]
        else:
            print(f"[Warning] Unexpected type for raw faithfulness: {type(raw)} in run {run_number}; skipping")
            continue

        # NaN/None entries would otherwise turn the whole mean into NaN (or
        # raise); an empty list would divide by zero.
        run_mean = _finite_mean(raw_scores)
        if run_mean is None:
            print(f"[Warning] Run {run_number} has no finite faithfulness scores; skipping")
            continue
        print(f"[Debug] Mean faithfulness for run {run_number}: {run_mean:.6f}")
        per_run_scores.append(run_mean)
        all_raw.append(raw_scores)

    if not per_run_scores:
        print("[Error] No valid run faithfulness scores collected. Cannot compute summary.")
        return empty_summary

    # Step 5: Aggregate over runs (pstdev of a single value is 0.0)
    avg_f = sum(per_run_scores) / len(per_run_scores)
    std_f = statistics.pstdev(per_run_scores)

    # Step 6: Print summary
    print("[Info] ===== Summary over runs =====")
    for idx, score in enumerate(per_run_scores, start=1):
        print(f" Run {idx} mean faithfulness: {score:.6f}")
    print(f"Average faithfulness over {len(per_run_scores)} runs: {avg_f:.6f}")
    print(f"StdDev faithfulness over runs: {std_f:.6f}")
    return {
        "faithfulness_runs": per_run_scores,
        "faithfulness_avg": avg_f,
        "faithfulness_std": std_f,
        "faithfulness_raw_per_run": all_raw,
    }
# ==========================
# Example usage
# ==========================
if __name__ == "__main__":
    # Sample questions used to exercise the evaluation end to end.
    prompt_messages = [
        "What is the primary goal of the fact-checking examiner proposed in the paper?",
        "How are fake reports generated for training the examiner?",
        "What metric is used to evaluate report quality improvement, and how is it computed?",
        "What accuracy and AUC did the examiner achieve on the test set?",
        "Why is SentenceBERT used instead of BLEU or ROUGE for report comparison?",
    ]
    prompt_refs = []  # or real ground truths if available

    # llm, pc and index are the module-level handles created during setup.
    results = evaluate_RAG_system_multiple(prompt_messages, llm, pc, index, prompt_refs, num_runs=2)
    print(f"[Final Result] {results}")