import json
import os
import re
import tempfile
from urllib.parse import urlparse

import faiss
import fitz
import google.generativeai as genai
import numpy as np
import pytesseract
import requests
from docx import Document
from fastapi import FastAPI, Request
from PIL import Image
from sentence_transformers import SentenceTransformer

# ASGI application object; the endpoint in this module is registered on it.
app = FastAPI()
# Sentence-embedding model, instantiated once at import time and shared by
# the indexing and search helpers below.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Utility function: Download file from URL to temp directory
def download_file(url: str, dest_dir: str, timeout: float = 30.0) -> str:
    """Download ``url`` into ``dest_dir`` and return the local file path.

    The file extension is derived from the URL *path* only, so dots inside
    a query string or fragment (e.g. signed URLs) cannot corrupt it.

    Args:
        url: Location of the document to fetch.
        dest_dir: Existing directory the file is written into.
        timeout: Seconds to wait on the server before giving up, so a
            stalled server cannot hang the caller indefinitely.

    Returns:
        Path of the downloaded file inside ``dest_dir``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    url_path = urlparse(url).path
    ext = os.path.splitext(url_path)[1].lstrip(".").lower()

    file_name = f"file_{abs(hash(url))}"
    if ext:
        file_name = f"{file_name}.{ext}"
    local_path = os.path.join(dest_dir, file_name)

    # The context manager releases the streamed connection even when writing
    # the file fails part-way through.
    with requests.get(url, stream=True, timeout=timeout) as resp:
        resp.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in resp.iter_content(8192):
                if chunk:
                    f.write(chunk)
    return local_path

# Extract text from PDF, DOCX, or Images
def extract_text(file_path: str, max_pages: int = 3) -> str:
    """Return the plain text contained in a PDF, DOCX, or image file.

    Args:
        file_path: Path of the document; its extension (case-insensitive)
            selects the extraction backend.
        max_pages: For PDFs, how many leading pages are read (interpreted
            like the stop value of a slice). Ignored for DOCX and images.

    Returns:
        The extracted text. PDF pages and DOCX paragraphs are joined with
        newlines; images are passed through Tesseract OCR.

    Raises:
        ValueError: If the extension is not pdf, docx, jpg, jpeg, or png.
    """
    # splitext looks at the final path component only, so a dot in a
    # directory name is not mistaken for the extension separator.
    ext = os.path.splitext(file_path)[1].lstrip(".").lower()

    if ext == "pdf":
        # Close the document when done so the file handle is released.
        with fitz.open(file_path) as doc:
            # Resolve the page range ourselves with slice semantics; not all
            # PyMuPDF releases accept ``doc[:n]`` directly.
            page_numbers = range(*slice(max_pages).indices(len(doc)))
            return "\n".join(doc[n].get_text() for n in page_numbers)

    if ext == "docx":
        doc = Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)

    if ext in {"jpg", "jpeg", "png"}:
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)

    raise ValueError(f"Unsupported file type: {ext}")

# Extract parameters like age, gender, procedure, location, policy_duration from text
# A person's age such as "46-year-old" or "7 years old". The lookahead skips
# phrases that describe the policy ("3-year-old insurance ...") instead.
_AGE_RE = re.compile(
    r"\b(\d{1,3})[- ]?years?[- ]?old\b(?!\s+(?:insurance|policy))",
    re.IGNORECASE,
)
_GENDER_RE = re.compile(r"\b(male|female)\b", re.IGNORECASE)
_PROCEDURE_RE = re.compile(
    r"(\w+(?:\s\w+)*\s(?:surgery|replacement|operation|treatment))",
    re.IGNORECASE,
)
# Case-sensitive on purpose: the place name must be capitalised.
_LOCATION_RE = re.compile(r"\b(?:in|at)\s([A-Z][a-z]+(?:\s[A-Z][a-z]+)?)")
# "N-month-old ... insurance". The guarded gap refuses to run across another
# "N-<unit>-old" phrase, so the number closest to "insurance" is the one
# captured rather than an age mentioned earlier in the sentence.
_DURATION_RE = re.compile(
    r"\b(\d+)[- ]?(month|year)s?[- ]?old"
    r"(?:(?!\d+[- ]?(?:month|year)s?[- ]?old).)*?(?:insurance|policy)",
    re.IGNORECASE,
)


def extract_params(text: str) -> dict:
    """Pull claim parameters out of a free-text query with regexes.

    Args:
        text: The query, e.g. ``"46-year-old male, knee surgery in Pune,
            3-month-old insurance policy"``.

    Returns:
        A dict with the keys ``age`` (int), ``gender`` (lower-case str),
        ``procedure`` (str), ``location`` (str) and ``policy_duration``
        (str such as ``"3 months"``). A key is ``None`` when the
        corresponding pattern does not occur in ``text``.
    """
    age_m = _AGE_RE.search(text)
    gender_m = _GENDER_RE.search(text)
    proc_m = _PROCEDURE_RE.search(text)
    loc_m = _LOCATION_RE.search(text)
    dur_m = _DURATION_RE.search(text)

    policy_duration = None
    if dur_m:
        # Take the unit from its own capture group (lower-cased) instead of
        # scanning the whole match, which may contain other unit words.
        policy_duration = f"{dur_m.group(1)} {dur_m.group(2).lower()}s"

    return {
        "age": int(age_m.group(1)) if age_m else None,
        "gender": gender_m.group(1).lower() if gender_m else None,
        "procedure": proc_m.group(1).strip() if proc_m else None,
        "location": loc_m.group(1).strip() if loc_m else None,
        "policy_duration": policy_duration,
    }

# Chunk large text into overlapping pieces
def chunk_text(text: str, chunk_size: int = 500, overlap: int = 100) -> list:
    """Split ``text`` into overlapping chunks of whitespace-separated words.

    Args:
        text: Text to split.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        List of chunk strings (words re-joined with single spaces). Empty
        when ``text`` contains no words.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(" ".join(words[start:end]))
        # Once a chunk reaches the last word, any further window would only
        # repeat words already contained in this chunk.
        if end >= len(words):
            break
    return chunks

# Prepare FAISS index from list of policy document file paths
def prepare_policy_index(policy_file_paths: list) -> tuple:
    """Build a FAISS L2 index over the chunked text of the given documents.

    Args:
        policy_file_paths: Paths of the policy documents to index.

    Returns:
        A ``(chunks, chunk_sources, index)`` tuple: the chunk strings, the
        base file name each chunk came from (same order and length as
        ``chunks``), and the FAISS index holding one vector per chunk. When
        no text could be extracted the lists are empty and the index
        contains no vectors.

    NOTE: ``extract_text`` is called with its default arguments, so any page
    limit it applies by default also limits what gets indexed.
    """
    all_chunks, chunk_sources = [], []
    for path in policy_file_paths:
        chunks = chunk_text(extract_text(path))
        all_chunks.extend(chunks)
        chunk_sources.extend([os.path.basename(path)] * len(chunks))

    # Ask the model for its vector size instead of reading it off the
    # embeddings array, which has no second axis when nothing was encoded.
    dimension = embedding_model.get_sentence_embedding_dimension()
    index = faiss.IndexFlatL2(dimension)

    if all_chunks:
        embeddings = embedding_model.encode(all_chunks, show_progress_bar=True)
        # FAISS expects a C-contiguous float32 matrix.
        index.add(np.ascontiguousarray(embeddings, dtype=np.float32))
    return all_chunks, chunk_sources, index

# Semantic search over the FAISS index for a query string
def semantic_search(query: str, chunks: list, chunk_sources: list, index, top_k: int = 3) -> list:
    """Return the chunks whose embeddings are closest to ``query``.

    Args:
        query: Free-text search string.
        chunks: Chunk strings, in the order their vectors were added to
            ``index``.
        chunk_sources: Source label for each entry of ``chunks``.
        index: FAISS index built over the embeddings of ``chunks``.
        top_k: Maximum number of results.

    Returns:
        Up to ``top_k`` ``(chunk, source)`` tuples, nearest first. Empty when
        the index holds no vectors or ``top_k`` is not positive.
    """
    # Never request more neighbours than the index holds: FAISS pads the
    # missing slots with -1, which would silently select the *last* chunk.
    k = min(top_k, index.ntotal)
    if k <= 0:
        return []

    query_embedding = np.ascontiguousarray(
        embedding_model.encode([query]), dtype=np.float32
    )
    _distances, neighbour_ids = index.search(query_embedding, k)
    return [
        (chunks[int(i)], chunk_sources[int(i)])
        for i in neighbour_ids[0]
        if i >= 0
    ]

# Call Gemini LLM for final decision
def get_llm_decision_gemini(structured_json: dict, retrieved_clauses: list, gemini_api_key: str) -> str:
    """Ask Gemini for an approve/reject decision on a claim.

    Args:
        structured_json: JSON-serialisable claim information; embedded in
            the prompt verbatim.
        retrieved_clauses: Sequence of items whose first element is the
            clause text (e.g. ``(chunk, source)`` tuples). May be empty.
        gemini_api_key: API key used to configure the Gemini client.

    Returns:
        The raw text of the model response. The prompt asks for a JSON
        object with ``Decision``, ``Amount`` and ``Justification`` keys, but
        the text is returned without being parsed or validated.
    """
    genai.configure(api_key=gemini_api_key)
    llm = genai.GenerativeModel("gemini-1.5-flash")

    # Use however many clauses were retrieved (including none) rather than
    # indexing fixed positions, which fails on an empty result list.
    clauses_text = "\n".join(str(clause[0]) for clause in retrieved_clauses)
    if not clauses_text:
        clauses_text = "(no relevant policy clauses were retrieved)"

    prompt = f"""
You are an insurance claim decision model.

Claim Info:
{json.dumps(structured_json, indent=2)}

Relevant Policy Clauses:
{clauses_text}

Your task is to:
1. Decide if the claim should be approved or rejected
2. Mention amount if applicable (else null)
3. Give clear justification pointing to the relevant clauses

Respond only in JSON:
{{"Decision": "...", "Amount": "...", "Justification": "..."}}
"""
    response = llm.generate_content(prompt)
    return response.text

# The FastAPI /hackrx/run endpoint
@app.post("/hackrx/run")
async def hackrx_run(request: Request):
    """Answer questions about the policy documents named in the request.

    Expects a JSON object with ``documents`` (a URL or list of URLs) and
    ``questions`` (a question or list of questions). The documents are
    downloaded and indexed once per request, then every question is answered
    by retrieving the closest policy chunks and passing them to Gemini.

    Returns:
        ``{"answers": [...]}`` with one raw model response per question, or
        ``{"error": "..."}`` when the request is unusable, the API key is
        not configured, or the documents cannot be fetched or read.
    """
    data = await request.json()
    if not isinstance(data, dict):
        return {"error": "Request body must be a JSON object."}

    document_urls = data.get("documents")
    questions = data.get("questions") or []

    if not document_urls:
        return {"error": "No documents provided."}

    if isinstance(document_urls, str):
        document_urls = [document_urls]
    # Accept a single question the same way a single document URL is accepted.
    if isinstance(questions, str):
        questions = [questions]

    gemini_api_key = os.environ.get("GOOGLE_API_KEY")
    if not gemini_api_key:
        return {"error": "API key not configured in environment variables."}

    # NOTE: the helpers called below are synchronous, so this handler
    # occupies the event loop for the duration of the request.
    with tempfile.TemporaryDirectory() as tmpdir:
        try:
            # Download all policy docs, then build the index once per request.
            policy_paths = [download_file(url, tmpdir) for url in document_urls]
            chunks, chunk_sources, index = prepare_policy_index(policy_paths)
        except (requests.RequestException, ValueError) as exc:
            # Covers unreachable URLs and unsupported document types.
            return {"error": f"Failed to process documents: {exc}"}

        answers = []
        for question in questions:
            question = str(question)
            structured_query = extract_params(question)
            extracted = " ".join(str(v) for v in structured_query.values() if v)
            # Fall back to the raw question when no parameter was recognised;
            # otherwise the search would run on an empty string.
            query_text = extracted or question
            retrieved_clauses = semantic_search(query_text, chunks, chunk_sources, index)
            # Include the question itself so the model sees what was asked
            # even when every extracted field is null.
            claim_info = {"question": question, **structured_query}
            answers.append(
                get_llm_decision_gemini(claim_info, retrieved_clauses, gemini_api_key)
            )

    return {"answers": answers}