import os
from dotenv import load_dotenv
from datasets import Dataset
from ragas import evaluate
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import Faithfulness
from langchain_mistralai import ChatMistralAI

from rag_func import prepare_RAG, retrieve_RAG, generate_RAG

load_dotenv()

# ==========================
# Utility functions
# ==========================
def process_text(str_to_process):
    """Return the text of ``str_to_process`` with quotes normalised and line feeds removed.

    The argument is first converted with ``str()``. In the resulting text every
    double quote (``"``) becomes a single quote (``'``) and every line-feed
    character (``\\n``) is dropped. All other characters, including carriage
    returns, are kept as they are.

    Args:
        str_to_process: Any object; its ``str()`` form is what gets cleaned.

    Returns:
        The cleaned string.
    """
    # A translation table does the whole job in one pass instead of growing
    # the result one character at a time with ``+=``.
    return str(str_to_process).translate(str.maketrans({'"': "'", "\n": None}))

def process_text_list(list_to_process):
    """Clean every chunk in ``list_to_process`` and return the cleaned strings.

    For each chunk the text to clean is chosen as follows:
      * a dict containing a ``"text"`` key contributes the value under that key;
      * a string is used as is;
      * anything else is converted with ``str()`` first.
    The chosen value is then passed through ``process_text``.

    Returns:
        A new list with one cleaned string per input chunk, in input order.
    """
    def _pick_text(item):
        # Select what process_text should receive for this chunk.
        if isinstance(item, dict) and "text" in item:
            return item["text"]
        return item if isinstance(item, str) else str(item)

    return [process_text(_pick_text(item)) for item in list_to_process]

# ==========================
# Setup RAG
# ==========================
# Ask which part of the corpus to use; an empty reply selects the whole 'context' folder.
user_input = input("Enter a subfolder inside 'context' (press Enter for full 'context'): ").strip()
user_dir = os.path.join("context", user_input) if user_input else "context"
print(f"[Info] Using context directory: {user_dir}")

# Connection and model settings are read from environment variables
# (os.getenv yields None for any variable that is not set).
pinecone_API, index_name, llm_model = (
    os.getenv(var_name) for var_name in ("PINECONE_API", "INDEX_NAME", "MODELNAME")
)

# prepare_RAG comes from rag_func; its three return values are used below as
# the index handle, the Pinecone client and the answering LLM.
index, pc, llm = prepare_RAG(
    pinecone_API,
    index_name,
    llm_model=llm_model,
    dir_name=user_dir,
)

# ==========================
# Generate dataset for evaluation
# ==========================
def generate_dataset(prompt_messages, llm, pc, index, prompt_refs=None):
    """Run the RAG pipeline on each question and collect the evaluation columns.

    For every message the function calls ``retrieve_RAG`` (with ``top_k=15``)
    and ``generate_RAG``, then stores the question, the cleaned answer text
    (taken from ``answer.content``), the cleaned retrieved chunks and a ground
    truth entry.

    Args:
        prompt_messages: Sequence of question strings.
        llm: Object handed to ``generate_RAG``.
        pc: Object handed to ``retrieve_RAG``.
        index: Object handed to ``retrieve_RAG``.
        prompt_refs: Optional sequence of ground-truth answers, matched to the
            questions by position. ``None`` or an empty sequence means no
            ground truth is available. If it is shorter than
            ``prompt_messages``, the questions without a reference get an
            empty-string placeholder and a warning is printed.

    Returns:
        A dict with the keys ``"question"``, ``"answer"``, ``"contexts"`` and
        ``"ground_truth"``; each value is a list with one entry per question.
    """
    # ``None`` replaces the former shared mutable ``[]`` default.
    refs = [] if prompt_refs is None else prompt_refs
    total = len(prompt_messages)

    dataset = {"question": [], "answer": [], "contexts": [], "ground_truth": []}
    for i, msg in enumerate(prompt_messages):
        print(f"[Debug] Generating answer for question {i+1}/{total}: {msg}")
        retrieved_chunks = retrieve_RAG(msg, pc, index, top_k=15)
        print(f"[Debug] Retrieved chunks for question {i+1}: {retrieved_chunks}")
        answer = generate_RAG(msg, llm, retrieved_chunks)
        print(f"[Debug] Generated answer for question {i+1}: {answer}")

        dataset["question"].append(msg)
        # assuming answer.content holds the text
        dataset["answer"].append(process_text(answer.content))
        processed_contexts = process_text_list(retrieved_chunks)
        dataset["contexts"].append(processed_contexts)
        print(f"[Debug] Processed contexts for question {i+1}: {processed_contexts}")

        if i < len(refs):
            dataset["ground_truth"].append(refs[i])
            print(f"[Debug] Ground truth for question {i+1}: {refs[i]}")
        else:
            if refs:
                # References were supplied but ran out: keep the columns the
                # same length instead of failing midway with an IndexError.
                print(f"[Warning] No ground truth supplied for question {i+1}; using empty placeholder")
            dataset["ground_truth"].append("")  # placeholder
    return dataset

# ==========================
# Evaluate RAG answers multiple times, with debug
# ==========================
def evaluate_RAG_system_multiple(prompt_messages, llm, pc, index, prompt_refs=None, num_runs=2):
    """Generate RAG answers once and score their faithfulness over several runs.

    The dataset is built a single time with ``generate_dataset``; the same
    dataset is then passed to ragas ``evaluate`` ``num_runs`` times with the
    ``Faithfulness`` metric and a ``mistral-large-latest`` evaluator LLM.
    Progress and intermediate values are printed.

    Args:
        prompt_messages: Sequence of question strings.
        llm: Answering LLM, forwarded to ``generate_dataset``.
        pc: Forwarded to ``generate_dataset``.
        index: Forwarded to ``generate_dataset``.
        prompt_refs: Optional ground-truth answers; ``None`` means none.
        num_runs: How many times the evaluation is repeated.

    Returns:
        A dict with these keys:
          * ``faithfulness_runs``: list of per-run mean scores;
          * ``faithfulness_avg``: mean of those values, or ``None`` when no
            run produced a usable score;
          * ``faithfulness_std``: population standard deviation over the runs
            (``0.0`` for a single run), or ``None`` when there is no score;
          * ``faithfulness_raw_per_run``: the raw per-example scores of each
            run that was counted.
    """
    import math
    import statistics

    if prompt_refs is None:
        prompt_refs = []

    # With nothing to evaluate there is no sample to print and no score to
    # average, so return the empty summary before any generation happens.
    if not prompt_messages:
        print("[Error] No prompt messages supplied. Cannot compute summary.")
        return {
            "faithfulness_runs": [],
            "faithfulness_avg": None,
            "faithfulness_std": None,
            "faithfulness_raw_per_run": [],
        }

    # Step 1: Generate dataset once
    print("[Info] Generating dataset once (answers will NOT be regenerated across runs)...")
    dataset_dict = generate_dataset(prompt_messages, llm, pc, index, prompt_refs)
    dataset = Dataset.from_dict(dataset_dict)
    print(f"[Debug] Number of examples: {len(dataset)}")
    print("[Debug] Sample example 0:")
    for k in dataset_dict:
        print(f"  {k}: {dataset_dict[k][0]}")

    # Step 2: Evaluator LLM
    evaluator_llm = LangchainLLMWrapper(ChatMistralAI(model="mistral-large-latest"))

    # Step 3: Metric only
    metrics = [Faithfulness()]

    # Step 4: Run evaluations
    per_run_scores = []  # each item is the mean faithfulness of one run
    all_raw = []  # raw score lists, one per counted run
    for run in range(num_runs):
        print(f"[Info] ===== Evaluation run {run+1}/{num_runs} =====")
        result = evaluate(dataset=dataset, metrics=metrics, llm=evaluator_llm)
        print(f"[Debug] Type of result: {type(result)}")
        print(f"[Debug] Content of result: {result}")

        try:
            raw = result["faithfulness"]
        except KeyError:
            # The result object is not guaranteed to be a mapping, so only
            # list its keys when it actually offers ``keys()``.
            keys_fn = getattr(result, "keys", None)
            available = list(keys_fn()) if callable(keys_fn) else "<unavailable>"
            print(f"[Warning] Run {run+1} missing 'faithfulness' key. Result keys: {available}")
            continue
        except Exception as e:
            print(f"[Error] Unexpected error accessing faithfulness in run {run+1}: {e}")
            continue
        print(f"[Debug] Raw faithfulness list for run {run+1}: {raw}")

        if isinstance(raw, (list, tuple)):
            # average over examples, ignoring entries that carry no score
            run_mean, skipped = _mean_of_valid_scores(raw)
            if skipped:
                print(f"[Warning] Run {run+1}: ignored {skipped} invalid (NaN or non-numeric) score(s)")
            if run_mean is None:
                print(f"[Warning] Run {run+1} produced no valid faithfulness scores; skipping")
                continue
            print(f"[Debug] Mean faithfulness for run {run+1}: {run_mean:.6f}")
            per_run_scores.append(run_mean)
            all_raw.append(raw)
        elif isinstance(raw, (int, float)):
            if math.isnan(raw):
                print(f"[Warning] Faithfulness is NaN for run {run+1}; skipping")
                continue
            print(f"[Debug] Faithfulness is a single float for run {run+1}: {raw:.6f}")
            per_run_scores.append(float(raw))
            all_raw.append([raw])
        else:
            print(f"[Warning] Unexpected type for raw faithfulness: {type(raw)} in run {run+1}; skipping")
            continue

    if not per_run_scores:
        print("[Error] No valid run faithfulness scores collected. Cannot compute summary.")
        # Same keys as the success path so callers can read the result uniformly.
        return {
            "faithfulness_runs": [],
            "faithfulness_avg": None,
            "faithfulness_std": None,
            "faithfulness_raw_per_run": [],
        }

    # Step 5: Aggregate over runs
    avg_f = sum(per_run_scores) / len(per_run_scores)
    std_f = statistics.pstdev(per_run_scores) if len(per_run_scores) > 1 else 0.0

    # Step 6: Print summary
    print("[Info] ===== Summary over runs =====")
    for idx, s in enumerate(per_run_scores, start=1):
        print(f"  Run {idx} mean faithfulness: {s:.6f}")
    print(f"Average faithfulness over {len(per_run_scores)} runs: {avg_f:.6f}")
    print(f"StdDev faithfulness over runs: {std_f:.6f}")

    return {
        "faithfulness_runs": per_run_scores,
        "faithfulness_avg": avg_f,
        "faithfulness_std": std_f,
        "faithfulness_raw_per_run": all_raw,
    }


def _mean_of_valid_scores(scores):
    """Return ``(mean, skipped)`` for ``scores``, ignoring NaN and non-numeric entries.

    ``mean`` is ``None`` when no entry is usable; ``skipped`` counts the
    ignored entries.
    """
    import math

    valid = []
    for score in scores:
        try:
            value = float(score)
        except (TypeError, ValueError):
            continue
        # NOTE(review): ragas documents NaN as the score of a row that failed
        # to evaluate when raise_exceptions is False - confirm for the
        # installed version.
        if not math.isnan(value):
            valid.append(value)
    mean = sum(valid) / len(valid) if valid else None
    return mean, len(scores) - len(valid)



# ==========================
# Example usage
# ==========================
if __name__ == "__main__":
    # Sample questions sent through the RAG pipeline when the file is run as a script.
    prompt_messages = [
        "What is the primary goal of the fact-checking examiner proposed in the paper?",
        "How are fake reports generated for training the examiner?",
        "What metric is used to evaluate report quality improvement, and how is it computed?",
        "What accuracy and AUC did the examiner achieve on the test set?",
        "Why is SentenceBERT used instead of BLEU or ROUGE for report comparison?",
    ]

    # Left empty here; fill with reference answers (one per question) if available.
    prompt_refs = []

    # llm, pc and index are the module-level objects returned by prepare_RAG.
    results = evaluate_RAG_system_multiple(
        prompt_messages,
        llm,
        pc,
        index,
        prompt_refs=prompt_refs,
        num_runs=2,
    )
    print(f"[Final Result] {results}")