"""
src/pipeline/generator.py — LLM Answer Generation
===================================================
Supports multiple providers based on config.yaml → llm.provider:
  - "gemini"  : Google Gemini API (recommended)
  - "openai"  : OpenAI Chat Completions API
  - "mistral" : Mistral AI API (api.mistral.ai)
  - "groq"    : Groq Cloud API (fast inference)
  - "ollama"  : Local Ollama/Mistral (requires Ollama running locally)

API Key setup:
  Set env variables in Backend/.env:
    GEMINI_API_KEY=your_key
    OPENAI_API_KEY=your_key
    MISTRAL_API_KEY=your_key
    GROQ_API_KEY=your_key
"""
from __future__ import annotations

import json
import logging
import os
import time
from pathlib import Path
from typing import Optional

import yaml

logger = logging.getLogger(__name__)

# Load .env file at module import time
def _load_env() -> None:
    """Populate ``os.environ`` from a local ``.env`` file, if one exists.

    Looks for ``.env`` in the current working directory first and falls back
    to ``../Backend/.env``. Each ``KEY=value`` line is parsed by hand: blank
    lines, ``#`` comment lines and lines without ``=`` are skipped, and
    surrounding whitespace and quote characters are stripped from the value.
    Variables already present in the process environment are never
    overwritten, and entries with an empty key or empty value are ignored.

    The file is decoded as UTF-8 with an optional byte-order mark, so the
    result does not depend on the platform's locale encoding. A file that
    cannot be read or decoded is logged and skipped instead of breaking the
    module import.
    """
    env_path = Path(".env")
    if not env_path.exists():
        # Fall back to the Backend folder that sits beside the working directory.
        env_path = Path("../Backend/.env")
    if not env_path.exists():
        return

    try:
        # "utf-8-sig" drops a leading BOM (written by some Windows editors),
        # which would otherwise become part of the first variable name.
        content = env_path.read_text(encoding="utf-8-sig")
    except (OSError, UnicodeDecodeError) as exc:
        logger.warning("Could not read %s: %s", env_path, exc)
        return

    for line in content.splitlines():
        line = line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        # Split on the first "=" only so values may themselves contain "=".
        key, _, val = line.partition("=")
        key = key.strip()
        val = val.strip().strip('"').strip("'")
        if key and val and key not in os.environ:
            os.environ[key] = val


_load_env()

# ---------------------------------------------------------------------------
# Config loader
# ---------------------------------------------------------------------------

def _load_config() -> dict:
    """Load the YAML configuration file and return it as a dict.

    The path comes from the ``MEDIRAG_CONFIG`` environment variable; when it
    is unset, ``config_local.yaml`` is used if it exists in the working
    directory, otherwise ``config.yaml``.

    Returns:
        The parsed mapping, or ``{}`` when the file is missing, unreadable,
        not valid YAML, or does not contain a mapping at the top level (an
        empty file parses to ``None``). Failures are logged rather than
        raised so callers fall back to built-in defaults.
    """
    default_name = "config_local.yaml" if Path("config_local.yaml").exists() else "config.yaml"
    config_path = os.environ.get("MEDIRAG_CONFIG", default_name)

    try:
        loaded = yaml.safe_load(Path(config_path).read_text(encoding="utf-8"))
    except (OSError, ValueError, yaml.YAMLError) as exc:
        # ValueError also covers UnicodeDecodeError from a mis-encoded file.
        logger.warning("Could not load config '%s': %s. Using defaults.", config_path, exc)
        return {}

    if not isinstance(loaded, dict):
        # Callers use ``config.get(...)``; anything but a mapping would crash them.
        logger.warning(
            "Config '%s' does not contain a mapping (got %s). Using defaults.",
            config_path,
            type(loaded).__name__,
        )
        return {}
    return loaded


# ---------------------------------------------------------------------------
# Prompt builder (shared by all providers)
# ---------------------------------------------------------------------------

# System prompt for the "physician" persona: clinician-facing wording, answers
# restricted to the supplied CONTEXT, inline [Source: ...] citations, and a
# fixed refusal sentence to use when the context is insufficient.
_PHYSICIAN_PROMPT = (
    "You are MediRAG, a medical AI assistant tailored for clinicians and researchers. "
    "You MUST answer ONLY using information explicitly stated in the CONTEXT provided below. "
    "Use professional medical terminology, be concise, and cite specific details. "
    "After each claim, cite it inline as [Source: <document title>]. "
    "If the context does NOT contain sufficient information to answer safely, you MUST respond EXACTLY with: "
    "'⚠️ The retrieved context does not contain enough information to answer this safely. "
    "Please consult authoritative clinical guidelines or a specialist.' "
    "NEVER use general knowledge, training data, or information outside the provided context."
)

# System prompt for the "patient" persona: same grounding, citation and refusal
# rules as the physician prompt, but asks for clear, empathetic wording and
# points the reader to their doctor in the refusal sentence.
_PATIENT_PROMPT = (
    "You are MediRAG, a medical AI assistant tailored for patients and non-experts. "
    "You MUST answer ONLY using information explicitly stated in the CONTEXT provided below. "
    "Explain medical information in a clear, accessible, and empathetic way. "
    "After each claim, cite it inline as [Source: <document title>]. "
    "If the context does NOT contain sufficient information to answer safely, you MUST respond EXACTLY with: "
    "'⚠️ The retrieved context does not contain enough information to answer this safely. "
    "Please consult your doctor or a medical specialist.' "
    "NEVER use general knowledge, training data, or information outside the provided context."
)

# Alias of the physician prompt, kept as the default system prompt.
_SYSTEM_PROMPT = _PHYSICIAN_PROMPT # Default fallback


def _build_prompt(question: str, context_chunks: list[dict], system_prompt: Optional[str] = None, persona: str = "physician") -> str:
    """Assemble the full RAG prompt: instructions, numbered context, question.

    Every chunk is rendered as a bracketed header line followed by its
    stripped text. The header always carries ``Source <n>`` and, when present,
    the title (as ``Title: ...``), the publication type and the source (the
    source is omitted when it equals the title), so the model has a label to
    cite as ``[Source: <title>]``.

    Args:
        question: The user's question.
        context_chunks: Retrieved chunks; text is read from ``text`` and
            falls back to ``chunk_text``.
        system_prompt: Explicit instructions; when truthy it replaces the
            persona-based prompt.
        persona: ``"patient"`` selects the patient prompt, anything else the
            physician prompt.

    Returns:
        The prompt string sent to the provider.
    """

    def _render(position: int, chunk: dict) -> str:
        # One context entry: "[Source n | Title: ... | type | source]\n<text>".
        body = chunk.get("text") or chunk.get("chunk_text", "")
        title = chunk.get("title", "")
        origin = chunk.get("source", "")
        kind = chunk.get("pub_type", "")

        labels = [f"Source {position}"]
        labels += [f"Title: {title}"] if title else []
        labels += [kind] if kind else []
        # Skip the source when it merely repeats the title.
        labels += [origin] if origin and origin != title else []
        return "[{}]\n{}".format(" | ".join(labels), body.strip())

    context_block = "\n\n".join(
        _render(position, chunk) for position, chunk in enumerate(context_chunks, 1)
    )

    # A manual override wins; otherwise the persona picks the instructions.
    if system_prompt:
        instructions = system_prompt
    elif persona == "patient":
        instructions = _PATIENT_PROMPT
    else:
        instructions = _PHYSICIAN_PROMPT

    template = (
        "{}\n\n"
        "CONTEXT:\n{}\n\n"
        "QUESTION: {}\n\n"
        "ANSWER (cite sources inline as [Source: document title]):"
    )
    return template.format(instructions, context_block, question)


# Strict prompt — used when first answer fails evaluation (HRS ≥ 60)
# Instructions for the regeneration pass: tells the model a previous response
# was flagged, restricts it to the supplied CONTEXT, and fixes the exact
# refusal sentence to use when the context is insufficient.
_STRICT_SYSTEM_PROMPT = (
    "You are MediRAG, a clinical safety assistant under strict mode. "
    "A previous response was flagged as potentially unsafe or inaccurate. "
    "You MUST answer ONLY using the information explicitly stated in the CONTEXT below. "
    "Do NOT use any general medical knowledge, training data, or outside information. "
    "If the context is insufficient, you MUST say EXACTLY: "
    "'⚠️ Insufficient evidence in retrieved context to answer safely. Please consult a clinical specialist.' "
    "NEVER hallucinate drug names, dosages, or clinical recommendations."
)


def _build_strict_prompt(question: str, context_chunks: list[dict]) -> str:
    """Build the context-only prompt used when an answer is regenerated.

    Chunks are rendered exactly as in the regular prompt (a bracketed
    ``Source <n>`` header with optional title, publication type and source,
    followed by the stripped chunk text), but the instructions are always the
    strict system prompt and the answer cue demands a citation per claim.

    Args:
        question: The user's question.
        context_chunks: Retrieved chunks; text is read from ``text`` and
            falls back to ``chunk_text``.

    Returns:
        The strict prompt string sent to the provider.
    """
    blocks = []
    for number, item in enumerate(context_chunks, start=1):
        passage = item.get("text") or item.get("chunk_text", "")
        doc_title = item.get("title", "")
        doc_source = item.get("source", "")
        doc_type = item.get("pub_type", "")

        # (include?, label) pairs in display order; the source is dropped
        # when it only repeats the title.
        candidates = (
            (True, f"Source {number}"),
            (doc_title, f"Title: {doc_title}"),
            (doc_type, doc_type),
            (doc_source and doc_source != doc_title, doc_source),
        )
        fields = [label for keep, label in candidates if keep]
        blocks.append("[" + " | ".join(fields) + "]" + "\n" + passage.strip())

    context = "\n\n".join(blocks)
    return (
        _STRICT_SYSTEM_PROMPT
        + "\n\nCONTEXT:\n"
        + context
        + f"\n\nQUESTION: {question}"
        + "\n\nSAFE ANSWER (context-only, cite [Source: title] for every claim):"
    )


# ---------------------------------------------------------------------------
# OpenAI and Gemini providers
# ---------------------------------------------------------------------------

def _generate_openai(prompt: str, config: dict) -> str:
    """Send *prompt* to the OpenAI chat-completions API and return the answer.

    The API key is taken from ``llm.openai_api_key`` in *config*, then the
    ``OPENAI_API_KEY`` environment variable, then an ``OPENAI_API_KEY=`` line
    in ``.env`` in the working directory. The model comes from
    ``llm.openai_model``, then ``llm.model``, defaulting to ``gpt-4o``; the
    temperature from ``llm.generation_temperature`` (default 0.7).

    Args:
        prompt: Fully built prompt, sent as a single user message.
        config: Effective configuration dict; only its ``llm`` section is read.

    Returns:
        The stripped answer text.

    Raises:
        RuntimeError: If no API key is found, the ``openai`` package is not
            installed, the API call fails, or the reply carries no text.
    """
    llm_cfg = config.get("llm", {})

    # Override from frontend/config takes priority over system ENV
    api_key = llm_cfg.get("openai_api_key") or os.environ.get("OPENAI_API_KEY")
    if not api_key:
        # Last resort: read the key straight from a .env in the working directory.
        env_file = Path(".env")
        if env_file.exists():
            for line in env_file.read_text().splitlines():
                if line.startswith("OPENAI_API_KEY="):
                    api_key = line.split("=", 1)[1].strip().strip('"').strip("'")
                    break

    if not api_key:
        raise RuntimeError("OpenAI API key not found. Set OPENAI_API_KEY env var or in .env.")

    try:
        from openai import OpenAI
    except ImportError as exc:
        raise RuntimeError("openai not installed. Run: pip install openai") from exc

    model_name = llm_cfg.get("openai_model") or llm_cfg.get("model") or "gpt-4o"
    client = OpenAI(api_key=api_key)

    logger.info("Calling OpenAI API (model=%s)...", model_name)
    t0 = time.perf_counter()

    try:
        response = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": prompt}],
            temperature=float(llm_cfg.get("generation_temperature", 0.7)),
            max_tokens=1024,
        )
    except Exception as exc:
        raise RuntimeError(f"OpenAI API error: {exc}") from exc

    elapsed = int((time.perf_counter() - t0) * 1000)

    # ``content`` may be None (e.g. a refusal) and ``choices`` may be empty;
    # treat both as an empty answer so the caller gets a RuntimeError rather
    # than an AttributeError/IndexError.
    choices = getattr(response, "choices", None) or []
    content = choices[0].message.content if choices else None
    answer = (content or "").strip()

    if not answer:
        raise RuntimeError("OpenAI returned an empty response.")

    logger.info("OpenAI generated answer in %d ms (%d chars)", elapsed, len(answer))
    return answer

def _generate_gemini(prompt: str, config: dict) -> str:
    """Send *prompt* to the Google Gemini API and return the answer text.

    Key lookup order: ``llm.gemini_api_key`` in *config*, the
    ``GEMINI_API_KEY`` environment variable, then the first
    ``GEMINI_API_KEY=`` line of ``.env`` in the working directory. The model
    is ``llm.gemini_model`` (default ``gemini-2.0-flash``) and the temperature
    ``llm.generation_temperature`` (default 0.7).

    Args:
        prompt: Fully built prompt, passed as the request contents.
        config: Effective configuration dict; only its ``llm`` section is read.

    Returns:
        The stripped answer text.

    Raises:
        RuntimeError: If no API key is found, ``google-genai`` is not
            installed, the API call fails, or the reply is empty.
    """
    settings = config.get("llm", {})

    # A key from the request/config beats the process environment.
    key = settings.get("gemini_api_key") or os.environ.get("GEMINI_API_KEY")
    if not key:
        # Fall back to scanning a .env file in the working directory.
        dotenv = Path(".env")
        if dotenv.exists():
            prefix = "GEMINI_API_KEY="
            first_match = next(
                (entry for entry in dotenv.read_text().splitlines() if entry.startswith(prefix)),
                None,
            )
            if first_match is not None:
                key = first_match.split("=", 1)[1].strip().strip('"').strip("'")

    if not key:
        raise RuntimeError(
            "Gemini API key not found. "
            "Either: (1) set GEMINI_API_KEY=your_key in the same terminal as uvicorn, "
            "or (2) create a .env file with GEMINI_API_KEY=your_key in the project root."
        )

    try:
        from google import genai
        from google.genai import types
    except ImportError:
        raise RuntimeError(
            "google-genai not installed. Run: pip install google-genai"
        )

    model_name = settings.get("gemini_model", "gemini-2.0-flash")
    client = genai.Client(api_key=key)

    logger.info("Calling Gemini API (model=%s)...", model_name)
    started = time.perf_counter()

    generation_config = dict(
        temperature=float(settings.get("generation_temperature", 0.7)),
        max_output_tokens=1024,
    )
    try:
        response = client.models.generate_content(
            model=model_name,
            contents=prompt,
            config=types.GenerateContentConfig(**generation_config),
        )
    except Exception as exc:
        raise RuntimeError(f"Gemini API error: {exc}") from exc

    elapsed_ms = int((time.perf_counter() - started) * 1000)

    # A missing/empty text is normalised to "" and rejected below.
    answer = (response.text or "").strip()
    if not answer:
        raise RuntimeError("Gemini returned an empty response.")

    logger.info("Gemini generated answer in %d ms (%d chars)", elapsed_ms, len(answer))
    return answer


# ---------------------------------------------------------------------------
# Ollama provider (kept as fallback)
# ---------------------------------------------------------------------------

def _generate_ollama(prompt: str, config: dict) -> str:
    """Send *prompt* to a local Ollama server and return the answer text.

    Reads from the ``llm`` section of *config*: ``base_url`` (default
    ``http://localhost:11434``), ``model`` (default ``mistral``),
    ``timeout_seconds`` (default 120) and ``generation_temperature``
    (default 0.7). The request is a non-streaming POST to ``/api/generate``
    with ``num_predict`` fixed at 512.

    Args:
        prompt: Fully built prompt.
        config: Effective configuration dict; only its ``llm`` section is read.

    Returns:
        The stripped ``response`` field of the JSON reply.

    Raises:
        RuntimeError: If the server is unreachable, times out, the request
            fails in any other way, the status is not 200, the body is not a
            JSON object, or the answer is empty.
    """
    import requests as _requests

    llm_cfg = config.get("llm", {})
    # Drop a trailing slash so the URL never contains "//api/generate".
    base_url = str(llm_cfg.get("base_url") or "http://localhost:11434").rstrip("/")
    model = llm_cfg.get("model", "mistral")
    timeout = llm_cfg.get("timeout_seconds", 120)
    temperature = llm_cfg.get("generation_temperature", 0.7)

    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": temperature, "num_predict": 512},
    }

    url = f"{base_url}/api/generate"
    logger.info("Calling Ollama (%s @ %s)...", model, base_url)
    t0 = time.perf_counter()

    try:
        resp = _requests.post(url, json=payload, timeout=timeout)
    except _requests.exceptions.ConnectionError as exc:
        raise RuntimeError(
            f"Ollama is not running at {base_url}. Start with: ollama serve"
        ) from exc
    except _requests.exceptions.Timeout as exc:
        raise RuntimeError(
            f"Ollama timed out after {timeout}s. Increase llm.timeout_seconds in config.yaml."
        ) from exc
    except _requests.exceptions.RequestException as exc:
        # Anything else (invalid URL, too many redirects, ...) must still
        # surface as RuntimeError, the only type callers are told to expect.
        raise RuntimeError(f"Ollama request failed: {exc}") from exc

    if resp.status_code != 200:
        raise RuntimeError(f"Ollama HTTP {resp.status_code}: {resp.text[:300]}")

    try:
        data = resp.json()
    except ValueError as exc:
        # Every JSON decode error requests can raise derives from ValueError.
        raise RuntimeError(f"Unexpected Ollama response: {exc}") from exc

    if not isinstance(data, dict):
        raise RuntimeError(
            f"Unexpected Ollama response: expected a JSON object, got {type(data).__name__}"
        )

    # A missing or null "response" field is treated as an empty answer.
    answer = str(data.get("response") or "").strip()
    if not answer:
        raise RuntimeError("Ollama returned an empty response.")

    elapsed = int((time.perf_counter() - t0) * 1000)
    logger.info("Ollama generated answer in %d ms (%d chars)", elapsed, len(answer))
    return answer


# ---------------------------------------------------------------------------
# Mistral provider
# ---------------------------------------------------------------------------

def _generate_mistral(prompt: str, config: dict) -> str:
    """Send *prompt* to the Mistral chat-completions endpoint and return the answer.

    ``llm.mistral_api_key`` is used when it is set and is not a ``${...}``
    placeholder; otherwise the ``MISTRAL_API_KEY`` environment variable is
    used. Also read from the ``llm`` section: ``model`` (default
    ``mistral-large-latest``), ``timeout_seconds`` (default 120) and
    ``generation_temperature`` (default 0.7).

    Args:
        prompt: Fully built prompt, sent as a single user message.
        config: Effective configuration dict; only its ``llm`` section is read.

    Returns:
        The stripped content of the first choice.

    Raises:
        RuntimeError: If no API key is available, the request fails, the
            status is not 200, the body has an unexpected shape, or the
            answer is empty.
    """
    import requests as _http

    settings = config.get("llm", {})

    # A literal key in config wins; an empty value or "${VAR}" placeholder
    # defers to the environment.
    configured_key = settings.get("mistral_api_key", "")
    if configured_key and not configured_key.startswith("${"):
        token = configured_key
    else:
        token = os.environ.get("MISTRAL_API_KEY")
    if not token:
        raise RuntimeError(
            "Mistral API key not found. Set MISTRAL_API_KEY in Backend/.env"
        )

    model_name = settings.get("model", "mistral-large-latest")
    wait_seconds = settings.get("timeout_seconds", 120)

    request_body = {
        "model": model_name,
        "messages": [{"role": "user", "content": prompt}],
        "temperature": settings.get("generation_temperature", 0.7),
        "max_tokens": 1024,
    }
    request_headers = {
        "Authorization": f"Bearer {token}",
        "Content-Type": "application/json",
    }
    endpoint = "https://api.mistral.ai/v1/chat/completions"

    logger.info("Calling Mistral API (model=%s, key=...***)", model_name)
    started = time.perf_counter()

    try:
        reply = _http.post(endpoint, json=request_body, headers=request_headers, timeout=wait_seconds)
    except Exception as exc:
        raise RuntimeError(f"Mistral API network error: {exc}") from exc

    if reply.status_code != 200:
        raise RuntimeError(f"Mistral HTTP {reply.status_code}: {reply.text[:300]}")

    try:
        parsed = reply.json()
        answer = parsed["choices"][0]["message"]["content"].strip()
    except Exception as exc:
        raise RuntimeError(f"Unexpected Mistral response: {exc}") from exc

    if not answer:
        raise RuntimeError("Mistral returned an empty response.")

    elapsed_ms = int((time.perf_counter() - started) * 1000)
    logger.info("Mistral generated answer in %d ms (%d chars)", elapsed_ms, len(answer))
    return answer


# ---------------------------------------------------------------------------
# Groq provider
# ---------------------------------------------------------------------------

def _generate_groq(prompt: str, config: dict) -> str:
    """Send *prompt* to Groq's OpenAI-compatible chat endpoint and return the answer.

    ``llm.groq_api_key`` is used when it is set and is not a ``${...}``
    placeholder; otherwise the ``GROQ_API_KEY`` environment variable is used.
    The model is ``llm.groq_model``, then ``llm.model``, defaulting to
    ``llama-3.3-70b-versatile``. ``timeout_seconds`` (default 120) and
    ``generation_temperature`` (default 0.7) are read from the same section.

    Args:
        prompt: Fully built prompt, sent as a single user message.
        config: Effective configuration dict; only its ``llm`` section is read.

    Returns:
        The stripped content of the first choice.

    Raises:
        RuntimeError: If no API key is available, the request fails, the
            status is not 200, the body has an unexpected shape, or the
            answer is empty.
    """
    import requests as _http

    options = config.get("llm", {})

    # Only a real (non-empty, non-placeholder) config value overrides the env.
    from_config = options.get("groq_api_key", "")
    use_env = not from_config or from_config.startswith("${")
    secret = os.environ.get("GROQ_API_KEY") if use_env else from_config
    if not secret:
        raise RuntimeError(
            "Groq API key not found. Set GROQ_API_KEY in Backend/.env"
        )

    chosen_model = options.get("groq_model") or options.get("model", "llama-3.3-70b-versatile")
    max_wait = options.get("timeout_seconds", 120)
    sampling_temperature = options.get("generation_temperature", 0.7)

    body = dict(
        model=chosen_model,
        messages=[dict(role="user", content=prompt)],
        temperature=sampling_temperature,
        max_tokens=1024,
    )
    auth_headers = {
        "Authorization": f"Bearer {secret}",
        "Content-Type": "application/json",
    }

    logger.info("Calling Groq API (model=%s, key=...***)", chosen_model)
    clock_start = time.perf_counter()

    try:
        http_response = _http.post(
            "https://api.groq.com/openai/v1/chat/completions",
            json=body,
            headers=auth_headers,
            timeout=max_wait,
        )
    except Exception as exc:
        raise RuntimeError(f"Groq API network error: {exc}") from exc

    if http_response.status_code != 200:
        raise RuntimeError(f"Groq HTTP {http_response.status_code}: {http_response.text[:300]}")

    try:
        first_choice = http_response.json()["choices"][0]
        answer = first_choice["message"]["content"].strip()
    except Exception as exc:
        raise RuntimeError(f"Unexpected Groq response: {exc}") from exc

    if not answer:
        raise RuntimeError("Groq returned an empty response.")

    took_ms = int((time.perf_counter() - clock_start) * 1000)
    logger.info("Groq generated answer in %d ms (%d chars)", took_ms, len(answer))
    return answer


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------

# Providers accepted for llm.provider / overrides["provider"].
_KNOWN_PROVIDERS = ("gemini", "openai", "mistral", "groq", "ollama")

# Key under the ``llm`` config section where each provider reads its API key.
# Ollama has no entry because it does not use one.
_PROVIDER_KEY_FIELDS = {
    "gemini": "gemini_api_key",
    "openai": "openai_api_key",
    "mistral": "mistral_api_key",
    "groq": "groq_api_key",
}

# Key under the ``llm`` config section where each provider reads its model name.
_PROVIDER_MODEL_FIELDS = {
    "gemini": "gemini_model",
    "openai": "openai_model",
    "mistral": "model",
    "groq": "groq_model",
    "ollama": "model",
}


def _resolve_llm_settings(
    config: Optional[dict],
    overrides: Optional[dict],
) -> tuple[str, dict]:
    """Merge per-request overrides into the config and pick the provider.

    Returns ``(provider, effective_config)``. The caller's *config* is not
    modified. ``api_key`` and ``model`` overrides are stored under the fields
    of the provider that will actually be called, whether it was chosen by the
    override or by the server config.

    Raises:
        RuntimeError: If the resulting provider name is not supported.
    """
    if config is None:
        config = _load_config() or {}
    overrides = overrides or {}

    # ``or {}`` also covers an ``llm:`` section that is present but empty.
    effective_llm = dict(config.get("llm") or {})

    if overrides.get("provider"):
        effective_llm["provider"] = overrides["provider"]
    provider = str(effective_llm.get("provider") or "gemini").lower()
    if provider not in _KNOWN_PROVIDERS:
        raise RuntimeError(
            f"Unknown LLM provider '{provider}'. "
            "Set llm.provider to 'gemini', 'openai', 'mistral', 'groq', or 'ollama'."
        )

    if overrides.get("api_key"):
        key_field = _PROVIDER_KEY_FIELDS.get(provider)
        if key_field:
            effective_llm[key_field] = overrides["api_key"]
    if overrides.get("model"):
        effective_llm[_PROVIDER_MODEL_FIELDS[provider]] = overrides["model"]
    if overrides.get("ollama_url"):
        effective_llm["base_url"] = overrides["ollama_url"]

    return provider, {**config, "llm": effective_llm}


def _run_provider(provider: str, prompt: str, effective_config: dict) -> str:
    """Call the generator function for an already validated *provider*."""
    handlers = {
        "gemini": _generate_gemini,
        "openai": _generate_openai,
        "ollama": _generate_ollama,
        "mistral": _generate_mistral,
        "groq": _generate_groq,
    }
    return handlers[provider](prompt, effective_config)


def generate_answer(
    question: str,
    context_chunks: list[dict],
    config: Optional[dict] = None,
    overrides: Optional[dict] = None,
) -> str:
    """
    Generate a grounded medical answer.

    Provider is selected from config.yaml → llm.provider, but can be
    overridden per-request via the `overrides` dict. This makes the eval
    engine portable — callers bring their own API key and model.

    Args:
        question       : User's medical question.
        context_chunks : Retrieved context chunks (dicts with 'text' key).
        config         : Config dict (loaded from config.yaml if None).
        overrides      : Per-request overrides. Supported keys:
                           provider      → "gemini", "openai", "mistral",
                                           "groq" or "ollama"
                           api_key       → API key for the provider in use
                           model         → model name for the provider in use
                           ollama_url    → Ollama base URL
                           system_prompt → replaces the persona system prompt
                           persona       → "patient" or "physician" (default)

    Returns:
        Generated answer string.

    Raises:
        RuntimeError   : If the provider is unknown, unreachable or returns
                         an error.
    """
    provider, effective_config = _resolve_llm_settings(config, overrides)
    request_options = overrides or {}

    prompt = _build_prompt(
        question,
        context_chunks,
        system_prompt=request_options.get("system_prompt"),
        persona=request_options.get("persona", "physician"),
    )
    return _run_provider(provider, prompt, effective_config)


def generate_strict_answer(
    question: str,
    context_chunks: list[dict],
    config: Optional[dict] = None,
    overrides: Optional[dict] = None,
) -> str:
    """
    Generate a STRICT context-only answer.
    Called when initial answer fails evaluation (HRS >= 60).
    The LLM is forbidden from using any training knowledge.

    Accepts the same `config` and provider-related `overrides` as
    `generate_answer`; `system_prompt` and `persona` are not used because the
    strict prompt is fixed.

    Raises:
        RuntimeError   : If the provider is unknown, unreachable or returns
                         an error.
    """
    provider, effective_config = _resolve_llm_settings(config, overrides)
    prompt = _build_strict_prompt(question, context_chunks)
    return _run_provider(provider, prompt, effective_config)


def generate_simple_prompt(
    prompt: str,
    config: Optional[dict] = None,
    overrides: Optional[dict] = None,
) -> str:
    """Execute a simple prompt on the active LLM provider without context formatting.

    The prompt is sent as given. `config` and the provider-related `overrides`
    behave as in `generate_answer`.

    Raises:
        RuntimeError   : If the provider is unknown, unreachable or returns
                         an error.
    """
    provider, effective_config = _resolve_llm_settings(config, overrides)
    return _run_provider(provider, prompt, effective_config)


def translate_hinglish_to_english(
    question: str,
    config: Optional[dict] = None,
    overrides: Optional[dict] = None,
) -> str:
    """Translate clinical query from Hinglish or standard Hindi to professional English.

    The translation is best-effort: the original question is returned
    unchanged when it is blank (no LLM call is made), when the provider call
    fails for any reason, or when the reply is empty once surrounding
    whitespace and quotes are removed.

    Args:
        question  : Query text, possibly in Hinglish or Hindi.
        config    : Config dict, passed through to `generate_simple_prompt`.
        overrides : Per-request overrides, passed through as well.

    Returns:
        The translated query, or the original `question` on fallback.
    """
    if not question or not question.strip():
        # Nothing to translate; avoid spending an LLM call on blank input.
        return question

    prompt = (
        "You are an expert bilingual clinical query translator. You will receive a medical question "
        "written in Hinglish (a mixture of Hindi and English written in the Latin alphabet) or standard Hindi. "
        "Convert the Hinglish/Hindi question into a clear, professional, grammatically correct English clinical query. "
        "If the input query is already completely in English, return it exactly as it is with no edits. "
        "Do NOT add any conversational preamble, greetings, explanation, or formatting. Only return the translated English query.\n\n"
        f"Query: {question}\n"
        "English Translation:"
    )
    try:
        translated = generate_simple_prompt(prompt, config=config, overrides=overrides)
        cleaned = translated.strip().strip('"').strip("'")
    except Exception as exc:
        # Deliberately broad: a failed translation must never block the query.
        logger.warning("Hinglish translation failed: %s. Using original query.", exc)
        return question

    if not cleaned:
        # e.g. the model answered with nothing but quote characters.
        logger.warning("Hinglish translation came back empty. Using original query.")
        return question
    return cleaned