Spaces:

smash1pump
/

RAG_Regulatorik

Sleeping

App Files Files Community

RAG_Regulatorik / rag_evaluate.py

smash1pump

Upload folder using huggingface_hub

eb93f4c verified 8 months ago

raw

history blame contribute delete

7.09 kB

	import os
	from dotenv import load_dotenv
	from datasets import Dataset
	from ragas import evaluate
	from ragas.llms import LangchainLLMWrapper
	from ragas.metrics import Faithfulness
	from langchain_mistralai import ChatMistralAI

	from rag_func import prepare_RAG, retrieve_RAG, generate_RAG

	load_dotenv()

	# ==========================
	# Utility functions
	# ==========================
	def process_text(str_to_process):
	    """Normalize a value into a single-line string without double quotes.

	    The input is converted with ``str()``; every double quote (``"``) is
	    replaced by a single quote (``'``) and every newline character is
	    removed. Newlines are dropped, not replaced by a space, so the text on
	    both sides of a line break is joined directly.

	    Args:
	        str_to_process: Any object; non-string values are stringified first.

	    Returns:
	        str: The cleaned text.
	    """
	    # A translation table does the whole clean-up in one pass instead of
	    # growing the result with ``+=`` one character at a time.
	    return str(str_to_process).translate({ord('"'): "'", ord("\n"): None})

	def process_text_list(list_to_process):
	    """Clean every chunk of an iterable and return the cleaned strings.

	    A dict chunk that has a ``"text"`` key contributes the value stored under
	    that key. Any other chunk (a plain string, a dict without ``"text"``, or
	    any other object) contributes itself. The selected value is passed to
	    ``process_text``, which stringifies it before cleaning.

	    Args:
	        list_to_process: Iterable of chunks.

	    Returns:
	        list: One cleaned string per chunk, in the original order.
	    """
	    return [
	        process_text(
	            item["text"] if isinstance(item, dict) and "text" in item else item
	        )
	        for item in list_to_process
	    ]

	# ==========================
	# Setup RAG
	# ==========================
	# Ask which directory to use; an empty answer selects the whole 'context' folder.
	user_input = input("Enter a subfolder inside 'context' (press Enter for full 'context'): ").strip()
	if user_input:
	    user_dir = os.path.join("context", user_input)
	else:
	    user_dir = "context"
	print(f"[Info] Using context directory: {user_dir}")

	# Settings come from the environment; load_dotenv() was called above, so
	# values from a .env file are visible here. Each is None when unset.
	pinecone_API = os.getenv("PINECONE_API")
	index_name = os.getenv("INDEX_NAME")
	llm_model = os.getenv("MODELNAME")

	# Build the RAG components once; the evaluation code below reuses them.
	index, pc, llm = prepare_RAG(
	    pinecone_API,
	    index_name,
	    llm_model=llm_model,
	    dir_name=user_dir,
	)

	# ==========================
	# Generate dataset for evaluation
	# ==========================
	def generate_dataset(prompt_messages, llm, pc, index, prompt_refs=None):
	    """Run the RAG pipeline on every question and collect the results column-wise.

	    For each question the function retrieves chunks with ``retrieve_RAG``
	    (``top_k=15``), generates an answer with ``generate_RAG`` and records the
	    question, the cleaned answer, the cleaned contexts and the ground truth.

	    Args:
	        prompt_messages: Sequence of questions.
	        llm: Object forwarded to ``generate_RAG``.
	        pc: Object forwarded to ``retrieve_RAG``.
	        index: Object forwarded to ``retrieve_RAG``.
	        prompt_refs: Optional sequence of reference answers aligned with
	            ``prompt_messages``. When omitted or empty, every ground truth is
	            the empty string. Extra trailing references are ignored.

	    Returns:
	        dict: ``{"question": [...], "answer": [...], "contexts": [...],
	        "ground_truth": [...]}`` with one entry per question; each
	        ``contexts`` entry is a list of cleaned strings.

	    Raises:
	        ValueError: If ``prompt_refs`` is non-empty but shorter than
	            ``prompt_messages``.
	    """
	    # ``None`` replaces the former shared mutable ``[]`` default.
	    refs = list(prompt_refs) if prompt_refs else []

	    # Fail before any retrieval/LLM call is made instead of hitting an
	    # IndexError halfway through the run.
	    if refs and len(refs) < len(prompt_messages):
	        raise ValueError(
	            f"prompt_refs has {len(refs)} entries but prompt_messages has "
	            f"{len(prompt_messages)}; provide one reference per question or none at all"
	        )

	    dataset = {"question": [], "answer": [], "contexts": [], "ground_truth": []}
	    for i, msg in enumerate(prompt_messages):
	        print(f"[Debug] Generating answer for question {i+1}/{len(prompt_messages)}: {msg}")
	        retrieved_chunks = retrieve_RAG(msg, pc, index, top_k=15)
	        print(f"[Debug] Retrieved chunks for question {i+1}: {retrieved_chunks}")
	        answer = generate_RAG(msg, llm, retrieved_chunks)
	        print(f"[Debug] Generated answer for question {i+1}: {answer}")

	        # generate_RAG is expected to return an object exposing ``.content``
	        # (TODO confirm); fall back to the value itself if it does not.
	        answer_text = getattr(answer, "content", answer)

	        dataset["question"].append(msg)
	        dataset["answer"].append(process_text(answer_text))
	        processed_contexts = process_text_list(retrieved_chunks)
	        dataset["contexts"].append(processed_contexts)
	        print(f"[Debug] Processed contexts for question {i+1}: {processed_contexts}")

	        if refs:
	            dataset["ground_truth"].append(refs[i])
	            print(f"[Debug] Ground truth for question {i+1}: {refs[i]}")
	        else:
	            dataset["ground_truth"].append("")  # placeholder
	    return dataset

	# ==========================
	# Evaluate RAG answers multiple times, with debug
	# ==========================
	def _mean_ignoring_nan(scores):
	    """Return the mean of the non-NaN entries of ``scores``, or ``None`` if there are none."""
	    import math

	    valid = [float(s) for s in scores if s is not None and not math.isnan(s)]
	    return sum(valid) / len(valid) if valid else None


	def evaluate_RAG_system_multiple(prompt_messages, llm, pc, index, prompt_refs=None, num_runs=2):
	    """Score the RAG answers for faithfulness several times and summarize the runs.

	    The dataset (questions, answers, contexts) is generated once with
	    ``generate_dataset``; only the ragas ``Faithfulness`` scoring is repeated
	    ``num_runs`` times, using ``ChatMistralAI(model="mistral-large-latest")``
	    as the evaluator LLM. Per-example scores that are NaN are ignored when a
	    run's mean is computed. Runs that yield no usable score are skipped.

	    Args:
	        prompt_messages: Sequence of questions to evaluate.
	        llm: Object forwarded to ``generate_dataset``.
	        pc: Object forwarded to ``generate_dataset``.
	        index: Object forwarded to ``generate_dataset``.
	        prompt_refs: Optional reference answers; ``None`` is treated as ``[]``.
	        num_runs: Number of evaluation passes over the same dataset.

	    Returns:
	        dict: Always contains the keys
	            ``faithfulness_runs`` (list of per-run means),
	            ``faithfulness_avg`` (mean of those means, or ``None``),
	            ``faithfulness_std`` (population std-dev, ``0.0`` for a single
	            run, or ``None``) and
	            ``faithfulness_raw_per_run`` (raw per-example scores of each
	            counted run).
	        When there is nothing to evaluate or no run produced a score, the
	        lists are empty and the two aggregates are ``None``.
	    """
	    import statistics

	    def _empty_summary():
	        # Same keys as the success path so callers can index unconditionally.
	        return {
	            "faithfulness_runs": [],
	            "faithfulness_avg": None,
	            "faithfulness_std": None,
	            "faithfulness_raw_per_run": [],
	        }

	    # Nothing to score: return before spending any retrieval/LLM calls and
	    # before the "sample example 0" print below could index an empty column.
	    if not prompt_messages or num_runs < 1:
	        print("[Error] No valid run faithfulness scores collected. Cannot compute summary.")
	        return _empty_summary()

	    if prompt_refs is None:
	        prompt_refs = []

	    # Step 1: Generate dataset once
	    print("[Info] Generating dataset once (answers will NOT be regenerated across runs)...")
	    dataset_dict = generate_dataset(prompt_messages, llm, pc, index, prompt_refs)
	    dataset = Dataset.from_dict(dataset_dict)
	    print(f"[Debug] Number of examples: {len(dataset)}")
	    print("[Debug] Sample example 0:")
	    for k, column in dataset_dict.items():
	        print(f" {k}: {column[0]}")

	    # Step 2: Evaluator LLM
	    evaluator_llm = LangchainLLMWrapper(ChatMistralAI(model="mistral-large-latest"))

	    # Step 3: Metric only
	    metrics = [Faithfulness()]

	    # Step 4: Run evaluations
	    per_run_scores = []  # mean faithfulness of each counted run
	    all_raw = []  # raw per-example scores of each counted run
	    for run in range(num_runs):
	        print(f"[Info] ===== Evaluation run {run+1}/{num_runs} =====")
	        result = evaluate(dataset=dataset, metrics=metrics, llm=evaluator_llm)
	        print(f"[Debug] Type of result: {type(result)}")
	        print(f"[Debug] Content of result: {result}")

	        try:
	            raw = result["faithfulness"]
	        except KeyError:
	            # The result object may not expose ``keys()``; the diagnostic
	            # itself must never raise.
	            keys_fn = getattr(result, "keys", None)
	            available = list(keys_fn()) if callable(keys_fn) else "<unavailable>"
	            print(f"[Warning] Run {run+1} missing 'faithfulness' key. Result keys: {available}")
	            continue
	        except Exception as e:
	            print(f"[Error] Unexpected error accessing faithfulness in run {run+1}: {e}")
	            continue
	        print(f"[Debug] Raw faithfulness list for run {run+1}: {raw}")

	        is_sequence = isinstance(raw, (list, tuple))
	        if is_sequence:
	            raw_scores = raw
	        elif isinstance(raw, (int, float)):
	            raw_scores = [raw]
	        else:
	            print(f"[Warning] Unexpected type for raw faithfulness: {type(raw)} in run {run+1}; skipping")
	            continue

	        # An empty or all-NaN score list has no meaningful mean; skip the run
	        # rather than dividing by zero or propagating NaN into the summary.
	        run_mean = _mean_ignoring_nan(raw_scores)
	        if run_mean is None:
	            print(f"[Warning] Run {run+1} produced no valid (non-NaN) faithfulness scores; skipping")
	            continue

	        if is_sequence:
	            print(f"[Debug] Mean faithfulness for run {run+1}: {run_mean:.6f}")
	        else:
	            print(f"[Debug] Faithfulness is a single float for run {run+1}: {raw:.6f}")
	        per_run_scores.append(run_mean)
	        all_raw.append(raw_scores)

	    if not per_run_scores:
	        print("[Error] No valid run faithfulness scores collected. Cannot compute summary.")
	        return _empty_summary()

	    # Step 5: Aggregate over runs
	    avg_f = sum(per_run_scores) / len(per_run_scores)
	    std_f = statistics.pstdev(per_run_scores) if len(per_run_scores) > 1 else 0.0

	    # Step 6: Print summary
	    print("[Info] ===== Summary over runs =====")
	    for idx, s in enumerate(per_run_scores, start=1):
	        print(f" Run {idx} mean faithfulness: {s:.6f}")
	    print(f"Average faithfulness over {len(per_run_scores)} runs: {avg_f:.6f}")
	    print(f"StdDev faithfulness over runs: {std_f:.6f}")

	    return {
	        "faithfulness_runs": per_run_scores,
	        "faithfulness_avg": avg_f,
	        "faithfulness_std": std_f,
	        "faithfulness_raw_per_run": all_raw,
	    }



	# ==========================
	# Example usage
	# ==========================
	if __name__ == "__main__":
	    # Sample questions about a paper on a fact-checking examiner for reports.
	    questions = [
	        "What is the primary goal of the fact-checking examiner proposed in the paper?",
	        "How are fake reports generated for training the examiner?",
	        "What metric is used to evaluate report quality improvement, and how is it computed?",
	        "What accuracy and AUC did the examiner achieve on the test set?",
	        "Why is SentenceBERT used instead of BLEU or ROUGE for report comparison?",
	    ]

	    # Leave empty when no reference answers exist; otherwise give one per question.
	    references = []

	    # Uses the module-level llm / pc / index created during setup above.
	    summary = evaluate_RAG_system_multiple(
	        questions,
	        llm,
	        pc,
	        index,
	        references,
	        num_runs=2,
	    )
	    print(f"[Final Result] {summary}")