Spaces:
Sleeping
Sleeping
| import os | |
| from dotenv import load_dotenv | |
| from datasets import Dataset | |
| from ragas import evaluate | |
| from ragas.llms import LangchainLLMWrapper | |
| from ragas.metrics import Faithfulness | |
| from langchain_mistralai import ChatMistralAI | |
| from rag_func import prepare_RAG, retrieve_RAG, generate_RAG | |
| load_dotenv() | |
| # ========================== | |
| # Utility functions | |
| # ========================== | |
# Translation table used by process_text: a double quote becomes a single
# quote and a newline is deleted (mapped to None).
_PROCESS_TEXT_TABLE = str.maketrans({'"': "'", "\n": None})


def process_text(str_to_process):
    """Return the input as a string with quotes normalized and newlines removed.

    The argument is first converted with ``str()``, so non-string values are
    accepted. Every double quote (``"``) is replaced by a single quote
    (``'``) and every ``"\\n"`` character is dropped. All other characters,
    including ``"\\r"``, are left untouched.

    Args:
        str_to_process: Any object; its ``str()`` form is cleaned.

    Returns:
        The cleaned string.
    """
    # A single translate() pass replaces the per-character loop with
    # repeated string concatenation.
    return str(str_to_process).translate(_PROCESS_TEXT_TABLE)
def process_text_list(list_to_process):
    """Clean every retrieved chunk and return the cleaned strings as a list.

    Each element is reduced to a piece of text and passed through
    ``process_text``:

    * a dict that has a ``"text"`` key contributes ``chunk["text"]``;
    * a string is used as-is;
    * anything else is converted with ``str()`` first.

    Args:
        list_to_process: Iterable of chunks (dicts, strings or other objects).

    Returns:
        A new list with one cleaned string per input element, in input order.
    """

    def _extract_text(item):
        # Pick the raw text that should be cleaned for this element.
        if isinstance(item, dict) and "text" in item:
            return item["text"]
        return item if isinstance(item, str) else str(item)

    return [process_text(_extract_text(item)) for item in list_to_process]
| # ========================== | |
| # Setup RAG | |
| # ========================== | |
# Ask which part of the 'context' folder to use; an empty answer selects the
# whole 'context' directory.
user_input = input("Enter a subfolder inside 'context' (press Enter for full 'context'): ").strip()
user_dir = os.path.join("context", user_input) if user_input else "context"
print(f"[Info] Using context directory: {user_dir}")

# Connection settings are read from environment variables; os.getenv returns
# None for any variable that is not set.
pinecone_API, index_name, llm_model = (
    os.getenv("PINECONE_API"),
    os.getenv("INDEX_NAME"),
    os.getenv("MODELNAME"),
)

# Build the objects that the retrieval / generation helpers below expect.
index, pc, llm = prepare_RAG(
    pinecone_API,
    index_name,
    llm_model=llm_model,
    dir_name=user_dir,
)
| # ========================== | |
| # Generate dataset for evaluation | |
| # ========================== | |
def generate_dataset(prompt_messages, llm, pc, index, prompt_refs=None, top_k=15):
    """Run the RAG pipeline on every question and collect the results.

    For each question the chunks are fetched with ``retrieve_RAG``, an answer
    is produced with ``generate_RAG``, and both are cleaned with
    ``process_text`` / ``process_text_list`` before being stored.

    Args:
        prompt_messages: Sequence of questions to ask.
        llm: Passed through unchanged to ``generate_RAG``.
        pc: Passed through unchanged to ``retrieve_RAG``.
        index: Passed through unchanged to ``retrieve_RAG``.
        prompt_refs: Optional ground-truth answers, index-aligned with
            ``prompt_messages``. ``None`` or an empty sequence stores ``""``
            as a placeholder for every question.
        top_k: Forwarded to ``retrieve_RAG`` as ``top_k``.

    Returns:
        A dict with the keys ``"question"``, ``"answer"``, ``"contexts"`` and
        ``"ground_truth"``; each value is a list with one entry per question.

    Raises:
        IndexError: If ``prompt_refs`` is non-empty but shorter than
            ``prompt_messages``.
    """
    # None instead of a mutable default list shared between calls.
    if prompt_refs is None:
        prompt_refs = []

    total = len(prompt_messages)

    # Fail before any retrieval/generation call is made. The exception type
    # matches what indexing past the end of prompt_refs raised previously.
    if prompt_refs and len(prompt_refs) < total:
        raise IndexError(
            f"prompt_refs has {len(prompt_refs)} entries but there are "
            f"{total} prompt_messages"
        )

    dataset = {"question": [], "answer": [], "contexts": [], "ground_truth": []}

    for i, msg in enumerate(prompt_messages):
        print(f"[Debug] Generating answer for question {i+1}/{total}: {msg}")

        retrieved_chunks = retrieve_RAG(msg, pc, index, top_k=top_k)
        print(f"[Debug] Retrieved chunks for question {i+1}: {retrieved_chunks}")

        answer = generate_RAG(msg, llm, retrieved_chunks)
        print(f"[Debug] Generated answer for question {i+1}: {answer}")

        dataset["question"].append(msg)
        # assuming answer.content holds the text
        dataset["answer"].append(process_text(answer.content))

        processed_contexts = process_text_list(retrieved_chunks)
        dataset["contexts"].append(processed_contexts)
        print(f"[Debug] Processed contexts for question {i+1}: {processed_contexts}")

        if prompt_refs:
            dataset["ground_truth"].append(prompt_refs[i])
            print(f"[Debug] Ground truth for question {i+1}: {prompt_refs[i]}")
        else:
            dataset["ground_truth"].append("")  # placeholder

    return dataset
| # ========================== | |
| # Evaluate RAG answers multiple times, with debug | |
| # ========================== | |
def _nan_safe_mean(values):
    """Return the mean of the non-NaN numbers in ``values``, or ``None`` if none remain."""
    import math

    usable = [float(v) for v in values if v is not None and not math.isnan(v)]
    if not usable:
        return None
    return sum(usable) / len(usable)


def evaluate_RAG_system_multiple(
    prompt_messages,
    llm,
    pc,
    index,
    prompt_refs=None,
    num_runs=2,
    evaluator_model="mistral-large-latest",
):
    """Generate RAG answers once and score their faithfulness several times.

    The dataset is built a single time with ``generate_dataset``; only the
    ``evaluate`` call is repeated, so the spread between runs reflects the
    evaluator and not the generator.

    Args:
        prompt_messages: Questions to ask the RAG system.
        llm: Passed through to ``generate_dataset``.
        pc: Passed through to ``generate_dataset``.
        index: Passed through to ``generate_dataset``.
        prompt_refs: Optional ground-truth answers; ``None`` is treated as an
            empty list.
        num_runs: How many times ``evaluate`` is called on the same dataset.
        evaluator_model: Model name given to ``ChatMistralAI`` for the
            evaluator LLM.

    Returns:
        A dict with the keys ``"faithfulness_runs"`` (mean score per
        successful run), ``"faithfulness_avg"``, ``"faithfulness_std"``
        (population standard deviation over runs) and
        ``"faithfulness_raw_per_run"`` (the raw values each mean was computed
        from). When no run produced a usable score the lists are empty and
        the average and standard deviation are ``None``.
    """
    import statistics
    from numbers import Real

    if prompt_refs is None:
        prompt_refs = []

    # Same keys as the normal return value so callers can index either result.
    no_scores_summary = {
        "faithfulness_runs": [],
        "faithfulness_avg": None,
        "faithfulness_std": None,
        "faithfulness_raw_per_run": [],
    }

    # Step 1: Generate dataset once
    print("[Info] Generating dataset once (answers will NOT be regenerated across runs)...")
    dataset_dict = generate_dataset(prompt_messages, llm, pc, index, prompt_refs)
    dataset = Dataset.from_dict(dataset_dict)
    print(f"[Debug] Number of examples: {len(dataset)}")

    if len(dataset) == 0:
        # There is no example 0 to print and nothing to score.
        print("[Error] No valid run faithfulness scores collected. Cannot compute summary.")
        return no_scores_summary

    print("[Debug] Sample example 0:")
    for key, column in dataset_dict.items():
        print(f" {key}: {column[0]}")

    # Step 2: Evaluator LLM
    evaluator_llm = LangchainLLMWrapper(ChatMistralAI(model=evaluator_model))

    # Step 3: Metric only
    metrics = [Faithfulness()]

    # Step 4: Run evaluations
    per_run_scores = []  # mean faithfulness of each successful run
    all_raw = []  # raw values per successful run
    for run in range(1, num_runs + 1):
        print(f"[Info] ===== Evaluation run {run}/{num_runs} =====")
        result = evaluate(dataset=dataset, metrics=metrics, llm=evaluator_llm)
        print(f"[Debug] Type of result: {type(result)}")
        print(f"[Debug] Content of result: {result}")

        try:
            raw = result["faithfulness"]
        except KeyError:
            # The result object is not guaranteed to offer keys(); do not let
            # the warning itself raise.
            keys_fn = getattr(result, "keys", None)
            available = list(keys_fn()) if callable(keys_fn) else "<unavailable>"
            print(f"[Warning] Run {run} missing 'faithfulness' key. Result keys: {available}")
            continue
        except Exception as e:
            # Deliberate best effort: one broken run must not abort the others.
            print(f"[Error] Unexpected error accessing faithfulness in run {run}: {e}")
            continue
        print(f"[Debug] Raw faithfulness list for run {run}: {raw}")

        if isinstance(raw, (list, tuple)):
            raw_values = raw
        elif isinstance(raw, Real):
            print(f"[Debug] Faithfulness is a single float for run {run}: {raw:.6f}")
            raw_values = [raw]
        else:
            print(f"[Warning] Unexpected type for raw faithfulness: {type(raw)} in run {run}; skipping")
            continue

        # NOTE(review): NaN entries are presumably rows the evaluator could
        # not score - confirm against the ragas version in use. They are left
        # out of the mean so one failed row does not turn the run into NaN.
        run_mean = _nan_safe_mean(raw_values)
        if run_mean is None:
            print(f"[Warning] Run {run} has no valid faithfulness scores; skipping")
            continue

        print(f"[Debug] Mean faithfulness for run {run}: {run_mean:.6f}")
        per_run_scores.append(run_mean)
        all_raw.append(raw_values)

    if not per_run_scores:
        print("[Error] No valid run faithfulness scores collected. Cannot compute summary.")
        return no_scores_summary

    # Step 5: Aggregate over runs
    avg_f = sum(per_run_scores) / len(per_run_scores)
    std_f = statistics.pstdev(per_run_scores) if len(per_run_scores) > 1 else 0.0

    # Step 6: Print summary
    print("[Info] ===== Summary over runs =====")
    for idx, score in enumerate(per_run_scores, start=1):
        print(f" Run {idx} mean faithfulness: {score:.6f}")
    print(f"Average faithfulness over {len(per_run_scores)} runs: {avg_f:.6f}")
    print(f"StdDev faithfulness over runs: {std_f:.6f}")

    return {
        "faithfulness_runs": per_run_scores,
        "faithfulness_avg": avg_f,
        "faithfulness_std": std_f,
        "faithfulness_raw_per_run": all_raw,
    }
| # ========================== | |
| # Example usage | |
| # ========================== | |
if __name__ == "__main__":
    # Demo questions sent through the RAG pipeline and then scored.
    prompt_messages = [
        "What is the primary goal of the fact-checking examiner proposed in the paper?",
        "How are fake reports generated for training the examiner?",
        "What metric is used to evaluate report quality improvement, and how is it computed?",
        "What accuracy and AUC did the examiner achieve on the test set?",
        "Why is SentenceBERT used instead of BLEU or ROUGE for report comparison?",
    ]

    # No reference answers here; replace with real ground truths if available.
    prompt_refs = []

    results = evaluate_RAG_system_multiple(
        prompt_messages,
        llm,
        pc,
        index,
        prompt_refs=prompt_refs,
        num_runs=2,
    )
    print(f"[Final Result] {results}")