#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Gradio App — AI vs Human Document Classifier (Chunked Inference)
----------------------------------------------------------------
Features:
- Upload a document (TXT/MD/HTML/PDF), chunk if needed, classify each chunk, aggregate to document.
- UI includes:
  1) Probability bars with raw numbers (AI generated / Human written)
  2) Confidence badge ("Likely AI" / "Likely Human") with traffic-light color
  3) Tabs for Basic / Advanced controls
  4) Chunk details accordion with per-chunk probabilities
  5) NEW: Per-chunk **snippet** extracted using tokenizer offset_mapping
"""

import os
import io
import re
from typing import Dict, Any, List, Tuple

import numpy as np
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# -----------------------------
# Config
# -----------------------------
# All three settings can be overridden through environment variables
# (e.g. Space Variables); the literals below are the fallbacks.
MODEL_ID = os.environ.get("MODEL_ID", "bert-base-uncased")
MAX_LENGTH = int(os.environ.get("MAX_LENGTH", "512"))
STRIDE = int(os.environ.get("STRIDE", "128"))

# Device: prefer CUDA, then Apple MPS, and fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

if device.type == "mps":
    # Best-effort tuning only: a failure here must never stop the app from starting.
    try:
        torch.set_float32_matmul_precision("high")
    except Exception:
        pass

# Load the fast tokenizer and the float32 classifier once at import time,
# move the classifier to the selected device and switch it to inference mode.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,
)
model = model.to(device)
model.eval()

# -----------------------------
# Utilities
# -----------------------------
TEXT_EXTS = {".txt", ".md", ".rtf", ".html", ".htm"}
PDF_EXTS = {".pdf"}


def _read_decoded(file_obj) -> str:
    """Read *file_obj* to the end and return text.

    Bytes are decoded as UTF-8 with undecodable bytes dropped; a stream that
    already yields ``str`` is returned unchanged.
    """
    data = file_obj.read()
    if isinstance(data, bytes):
        data = data.decode("utf-8", errors="ignore")
    return data


def _html_to_text(markup: str) -> str:
    """Reduce HTML markup to its visible text on a single whitespace-normalised line."""
    import html  # local import, mirroring the lazy pypdf import used for PDFs

    # Script/style bodies are code, not prose: drop them together with their tags
    # so they do not end up in the classifier input.
    text = re.sub(r"(?is)<(script|style)\b[^>]*>.*?</\1\s*>", " ", markup)
    text = re.sub(r"(?s)<!--.*?-->", " ", text)
    text = re.sub(r"<[^>]+>", " ", text)
    # Unescape only after the tags are gone so that "&lt;b&gt;" stays literal text
    # instead of being stripped as a tag.
    text = html.unescape(text)
    return re.sub(r"\s+", " ", text).strip()


def read_text_from_file(file_obj) -> str:
    """
    Read text content from an uploaded file.
    Supports: .txt, .md, .rtf, .html, .htm, .pdf (via pypdf).

    Args:
        file_obj: Readable object. Its optional ``name`` attribute supplies the
            extension used to choose a parser; ``read()`` may return ``bytes``
            or ``str``.

    Returns:
        The extracted text. HTML is reduced to visible text with entities
        decoded; PDF pages are joined and whitespace-normalised. When PDF
        parsing fails (including a missing ``pypdf``) the string
        ``"[PDF parse error] <reason>"`` is returned instead of raising.
        Unknown extensions are read as UTF-8 text.
    """
    name = getattr(file_obj, "name", "") or ""
    ext = os.path.splitext(name)[-1].lower()

    if ext in PDF_EXTS:
        try:
            from pypdf import PdfReader
            reader = PdfReader(io.BytesIO(file_obj.read()))
            pages = []
            for page in reader.pages:
                # A single unreadable page should not discard the rest of the document.
                try:
                    pages.append(page.extract_text() or "")
                except Exception:
                    pages.append("")
            return re.sub(r"\s+", " ", "\n".join(pages)).strip()
        except Exception as e:
            return f"[PDF parse error] {e}"

    # Known text types and unknown extensions share the same read/decode path.
    # NOTE(review): .rtf is read verbatim, so RTF control words stay in the text.
    data = _read_decoded(file_obj)
    if ext in {".html", ".htm"}:
        return _html_to_text(data)
    return data


def _resolve_chunk_window(max_length: int, stride: int) -> Tuple[int, int]:
    """Clamp the requested window size and overlap to values the pipeline accepts.

    The window is capped at the smaller of the model's ``max_position_embeddings``
    and the tokenizer's ``model_max_length`` (when available), because a model
    with absolute position embeddings cannot process longer inputs. The overlap
    is kept strictly below the window size minus the special tokens, which is
    what the fast tokenizer requires when truncating with a stride.
    """
    special = tokenizer.num_special_tokens_to_add(pair=False)
    candidates = (
        getattr(model.config, "max_position_embeddings", None),
        getattr(tokenizer, "model_max_length", None),
    )
    limits = [int(v) for v in candidates if isinstance(v, int) and v > 0]
    upper = min(limits) if limits else int(max_length)
    window = max(special + 1, min(int(max_length), upper))
    overlap = max(0, min(int(stride), window - special - 1))
    return window, overlap


def _chunk_snippets(text: str, offsets, attention, preview: int = 120) -> List[str]:
    """Return one short, whitespace-normalised preview of *text* per chunk."""
    snippets: List[str] = []
    for offs, mask in zip(offsets.tolist(), attention.tolist()):
        # Keep attended tokens only; (0, 0) offsets mark special/padding tokens.
        spans = [(s, e) for (s, e), m in zip(offs, mask) if m == 1 and not (s == 0 and e == 0)]
        if not spans:
            snippets.append("")
            continue
        start = min(s for s, _ in spans)
        end = max(e for _, e in spans)
        snippet = " ".join(text[start:end].split())
        if len(snippet) > preview:
            snippet = snippet[:preview].rstrip() + "…"
        snippets.append(snippet)
    return snippets


def chunked_predict(text: str, max_length: int = 512, stride: int = 128, agg: str = "mean") -> Dict[str, Any]:
    """
    Chunk the document using tokenizer overflow, run classifier on each chunk,
    aggregate probabilities, and return both doc-level and chunk-level results,
    including a short snippet per chunk derived from offset_mapping.

    Args:
        text: Document text to classify.
        max_length: Requested tokens per chunk; clamped to what the model and
            tokenizer support.
        stride: Requested token overlap between consecutive chunks; clamped so
            it stays below the usable window size.
        agg: ``"max"`` reports the distribution of the chunk with the highest
            AI probability; any other value averages the chunk distributions.

    Returns:
        ``{"error": message}`` for an empty document or a model with fewer than
        two labels. Otherwise a dict with ``ai_prob``, ``human_prob``,
        ``num_chunks``, ``chunk_rows`` (``[chunk#, AI, Human, snippet]``) and
        the effective ``max_length`` / ``stride`` that were actually used.
    """
    if not text or not text.strip():
        return {"error": "Empty document."}

    # Out-of-range slider combinations (stride >= window, window > model limit)
    # would otherwise fail inside the tokenizer or the model.
    max_length, stride = _resolve_chunk_window(max_length, stride)

    enc = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        return_overflowing_tokens=True,
        stride=stride,
        padding=True,
        return_offsets_mapping=True,   # character offsets per token, used for snippets
        return_tensors="pt",
    )

    # Only forward what the model accepts; offsets and overflow maps stay on the CPU.
    allowed = {"input_ids", "attention_mask", "token_type_ids"}
    inputs = {k: v.to(model.device) for k, v in enc.items() if k in allowed}

    with torch.no_grad():
        # One chunk at a time keeps peak memory flat for very long documents.
        logits = torch.cat(
            [
                model(**{k: v[i:i + 1] for k, v in inputs.items()}).logits
                for i in range(inputs["input_ids"].size(0))
            ],
            dim=0,
        )  # [num_chunks, num_labels]
        probs = torch.softmax(logits, dim=-1).cpu().numpy()
    num_chunks = int(probs.shape[0])

    # By convention: 0 -> Human, 1 -> AI
    if probs.shape[1] < 2:
        return {"error": "Model must expose at least two labels (0 = Human, 1 = AI)."}

    # Aggregate
    if agg == "max":
        # Report the single most AI-like chunk. Taking the per-class maximum
        # independently would yield "probabilities" that can sum to more than 1.
        doc_probs = probs[int(probs[:, 1].argmax())]
    else:
        doc_probs = probs.mean(axis=0)

    prob_human = float(doc_probs[0])
    prob_ai = float(doc_probs[1])

    snippets = _chunk_snippets(text, enc["offset_mapping"], enc["attention_mask"])

    # Per-chunk rows: [chunk#, AI prob, Human prob, Snippet]
    chunk_rows: List[List[Any]] = [
        [i + 1, float(p[1]), float(p[0]), snippets[i]]
        for i, p in enumerate(probs)
    ]

    return {
        "ai_prob": prob_ai,
        "human_prob": prob_human,
        "num_chunks": num_chunks,
        "chunk_rows": chunk_rows,   # list of [chunk, AI, Human, Snippet]
        "max_length": max_length,
        "stride": stride,
    }


def predict_from_upload(file, aggregation, max_length, stride):
    """Read an uploaded document and classify it chunk by chunk.

    Args:
        file: Upload from the Gradio file input. Accepted forms are a
            filesystem path (``str`` / ``os.PathLike``), an object whose
            ``name`` attribute is a path, or a readable file-like object.
        aggregation: Chunk aggregation mode, forwarded to ``chunked_predict``.
        max_length: Tokens per chunk (coerced to ``int``).
        stride: Token overlap between chunks (coerced to ``int``).

    Returns:
        The result dict of ``chunked_predict``, or ``{"error": message}`` when
        no file was given, the file cannot be read, or PDF parsing failed.
    """
    if file is None:
        return {"error": "Please upload a file."}

    # Resolve an on-disk path when there is one; Gradio hands over a temp-file
    # path (or a wrapper exposing it as ``.name``) rather than an open stream.
    if isinstance(file, (str, os.PathLike)):
        path = os.fspath(file)
    else:
        name = getattr(file, "name", None)
        path = name if isinstance(name, str) else None

    if path is not None:
        try:
            with open(path, "rb") as f:
                source = io.BytesIO(f.read())
        except OSError as exc:
            return {"error": f"Could not read uploaded file: {exc}"}
        # Keep the original file name so the reader can pick a parser by extension.
        source.name = os.path.basename(path)
    else:
        source = file

    text = read_text_from_file(source)

    # The reader reports PDF failures in-band as text; surface them as an error
    # instead of classifying the error message itself.
    ext = os.path.splitext(getattr(source, "name", "") or "")[-1].lower()
    if ext == ".pdf" and text.startswith("[PDF parse error]"):
        return {"error": text}

    return chunked_predict(text, max_length=int(max_length), stride=int(stride), agg=aggregation)


# -----------------------------
# UI Helpers (HTML formatting)
# -----------------------------
def probability_bar_html(label: str, prob: float) -> str:
    """Render one probability row: bold label, percentage text and a filled bar.

    ``prob`` is a fraction; it is shown as a percentage with two decimals and
    also used as the CSS width of the bar fill.
    """
    percent_text = f"{prob * 100.0:.2f}"
    pieces = (
        "\n",
        f'      <div class="prob-row"><div class="prob-label"><b>{label}</b></div>\n',
        f'        <div class="prob-value">{percent_text}%</div>\n',
        '        <div class="prob-bar">\n',
        f'          <div class="prob-fill" style="width:{percent_text}%"></div>\n',
        "        </div>\n",
        "      </div>\n",
        "    ",
    )
    return "".join(pieces)

def verdict_badge_html(prob_ai: float, threshold: float = 0.5) -> str:
    """Return the verdict pill: red "Likely AI" at or above *threshold*, green "Likely Human" below."""
    if prob_ai >= threshold:
        label, color = "Likely AI", "#ef4444"      # red
    else:
        label, color = "Likely Human", "#10b981"   # green
    # The "22" suffix appended to the hex colour is an alpha channel for the background tint.
    style = "background:" + color + "22;color:" + color
    return "<span class='pill' style='" + style + "'>" + label + "</span>"

def format_outputs(result: Dict[str, Any], threshold: float = 0.5):
    """Turn a prediction result into the four values shown in the UI.

    Returns a tuple ``(verdict_html, probs_html, chunk_table_data, details_md)``.
    When *result* carries an ``"error"`` key, the message is rendered in red in
    the verdict slot and the other three outputs are empty.
    """
    if "error" in result:
        message = result["error"]
        return f"<span style='color:#ef4444'>{message}</span>", "", [], ""

    ai_prob = result["ai_prob"]
    human_prob = result["human_prob"]

    badge = verdict_badge_html(ai_prob, threshold=threshold)
    bars = "".join(
        probability_bar_html(caption, value)
        for caption, value in (("AI generated", ai_prob), ("Human written", human_prob))
    )

    # The chunk rows arrive ready for the table and are passed through untouched.
    rows = result["chunk_rows"]

    # Two trailing spaces before each newline force a Markdown line break.
    summary = "  \n".join(
        [
            f"**Chunks:** `{result['num_chunks']}`",
            f"**Tokens per chunk:** `{result['max_length']}`",
            f"**Stride:** `{result['stride']}`",
        ]
    )

    return badge, bars, rows, summary


# -----------------------------
# Gradio Interface
# -----------------------------
# Stylesheet passed to gr.Blocks(css=...). The class names match the HTML built
# by probability_bar_html / verdict_badge_html; the #chunkgroup and
# #details_note ids match the elem_id values set on the chunk table group and
# the details Markdown component.
CSS = """
.pill {padding:6px 12px; border-radius:999px; display:inline-block; margin: 6px 0; font-weight:600;}
.prob-row {display:flex; align-items:center; gap:10px; margin:6px 0;}
.prob-label {min-width:140px;}
.prob-value {min-width:80px; text-align:right; font-variant-numeric: tabular-nums;}
.prob-bar {flex:1; background:#e5e7eb; height:12px; border-radius:6px; overflow:hidden;}
.prob-fill {height:12px; background:#6366f1;}
.small-note {font-size:0.9rem; color:#6b7280;}
/* Wrap long snippet text within the DataFrame cells */
.gr-dataframe table td { white-space: normal; }
/* Scrollable chunk table container */
#chunkgroup { max-height: 260px; overflow: auto; }
#details_note { font-size: 0.9rem; color: #6b7280; }
"""

# Markdown shown at the top of the page. The two trailing spaces after the
# first sentence are a Markdown hard line break and must be preserved.
DESCRIPTION = """
### 🔎 AI vs Human — Document Classifier
Upload a file to get **document-level probabilities**.  
Long inputs are **chunked** into overlapping windows; chunk predictions are **aggregated**.
"""

# Build the Gradio UI. Components are laid out in the order they are created,
# so the statement order inside this block defines the page layout.
with gr.Blocks(
    title="AI vs Human Document Classifier",
    theme='Nymbo/rounded-gradient',
    css=CSS
) as demo:
    gr.Markdown(DESCRIPTION)

    with gr.Tabs():
        # Basic tab: upload, aggregation choice, run button and the results.
        with gr.Tab("Predict"):
            file_in = gr.File(label="Upload a document", file_types=[".txt", ".md", ".rtf", ".html", ".htm", ".pdf"])
            agg_in = gr.Radio(choices=["mean", "max"], value="mean", label="Aggregation over chunks")
            btn = gr.Button("Predict", variant="primary")
            verdict_html = gr.HTML(label="Verdict")
            probs_html = gr.HTML(label="Probabilities")

            # Collapsed by default; holds the per-chunk table and the run summary.
            with gr.Accordion("Chunk details", open=False):
                # elem_id ties this group to the scrollable #chunkgroup rule in CSS.
                with gr.Group(elem_id="chunkgroup"):
                    # Column order matches the rows built by chunked_predict:
                    # [chunk#, AI prob, Human prob, snippet].
                    chunk_table = gr.Dataframe(
                        headers=["Chunk", "AI generated", "Human written", "Snippet"],
                        datatype=["number", "number", "number", "str"],
                        label="Per-chunk probabilities",
                        wrap=True,
                        interactive=False,
                        row_count=(0, "dynamic"),
                        col_count=(4, "fixed"),
                    )
                details_md = gr.Markdown("", elem_id="details_note")

        # Advanced tab: chunking parameters, initialised from MAX_LENGTH / STRIDE.
        # NOTE(review): the slider ranges allow a stride larger than max_length
        # and a max_length of up to 1024 — confirm the chunking code tolerates
        # those combinations for the configured model.
        with gr.Tab("Advanced"):
            gr.Markdown("Adjust chunking parameters below.")
            max_len_in = gr.Slider(128, 1024, value=MAX_LENGTH, step=32, label="Tokens per chunk (max_length)")
            stride_in  = gr.Slider(0, 512, value=STRIDE, step=16, label="Stride / overlap")
            gr.Markdown("You can also set `MODEL_ID`, `MAX_LENGTH`, and `STRIDE` via Space Variables.")

    def predict_and_prettify(file, aggregation, max_length=MAX_LENGTH, stride=STRIDE):
        """Run the prediction for an upload and format it for the four output components.

        format_outputs is called without a threshold, so its default (0.5) decides the verdict.
        """
        res = predict_from_upload(file, aggregation, max_length, stride)
        return format_outputs(res)

    # The inputs list is positional: it must match predict_and_prettify's
    # parameter order, and outputs must match format_outputs' return order.
    btn.click(
        fn=predict_and_prettify,
        inputs=[file_in, agg_in, max_len_in, stride_in],
        outputs=[verdict_html, probs_html, chunk_table, details_md],
    )

# Start the web server only when executed as a script, not when imported.
if __name__ == "__main__":
    demo.launch()