# Start-up bootstrap: try to install `docling` with the running interpreter's
# pip before anything else is imported. `check=False` means a failed install
# does not raise here; `-q` keeps pip output quiet.
# NOTE(review): this runs on every start of the module and blocks until pip
# returns — presumably `prelabeler` needs docling; confirm and consider
# pinning it in the environment's requirements instead.
import subprocess, sys
subprocess.run([sys.executable, "-m", "pip", "install", "docling", "-q"], check=False)

import json
import sys  # already imported above; the repeat is harmless
from pathlib import Path

import gradio as gr

# Make the sibling `pipeline/` directory importable so `prelabeler` resolves.
sys.path.insert(0, str(Path(__file__).parent / "pipeline"))
from prelabeler import process_pdf, LABELS, LABEL_COLORS, detect_doc_type

# Choices offered by the "Document type" dropdown. "auto" is translated to
# None by run_prelabel before being handed to process_pdf.
DOC_TYPES = ["auto", "math", "newspaper", "research", "presentation"]

def run_prelabel(pdf_files, doc_type_choice):
    """Pre-label every uploaded PDF and build the two UI outputs.

    Args:
        pdf_files: Uploaded files as delivered by the file input; a falsy
            value (``None`` or an empty list) means nothing was uploaded.
        doc_type_choice: Selected dropdown entry. ``"auto"`` is forwarded to
            ``process_pdf`` as ``None``; any other value is passed through.

    Returns:
        A ``(summary_markdown, results_json)`` tuple. With no uploads this is
        ``("No files uploaded.", "{}")``; otherwise the JSON text is the
        pretty-printed list of per-file results from ``process_pdf``.
    """
    if not pdf_files:
        return "No files uploaded.", "{}"

    forced_type = doc_type_choice if doc_type_choice != "auto" else None

    # Process every file up front so the JSON output covers the whole batch.
    results = [process_pdf(path, forced_type) for path in pdf_files]

    report = []
    for res in results:
        file_name = res["file"]
        kind = res.get("doc_type", "auto")
        page_total = res["pages"]
        report.append(f"📄 **{file_name}** ({kind}) — {page_total} page(s)")

        err = res.get("error")
        if err and err != "docling_not_installed":
            # A real failure: show it instead of any label counts.
            report.append(f"  ⚠️ Error: {err}")
        else:
            if err == "docling_not_installed":
                report.append("  ⚠️ Running in demo mode — Docling still installing, try again in 2 mins")
            tallies = res.get("label_counts", {})
            # Follow LABELS order and list only labels that actually occurred.
            report.extend(
                f"  • **{name}**: {tallies[name]} region(s)"
                for name in LABELS
                if name in tallies and tallies[name] > 0
            )
        report.append("")  # blank separator line between files

    return "\n".join(report), json.dumps(results, indent=2)


def _is_blank(text):
    """Return True when *text* is None or a whitespace-only string."""
    return text is None or (isinstance(text, str) and not text.strip())


def _flatten_regions(payload):
    """Reduce a parsed JSON payload to the region entries it carries.

    Accepted shapes:
      * ``{"regions": [...]}`` — a single result object;
      * ``[{"regions": [...]}, ...]`` — a list of per-file result objects,
        the shape emitted by the pre-label tab;
      * anything else is returned unchanged and treated as already flat.

    Raises:
        ValueError: if a list of result objects contains a non-object item.
    """
    if isinstance(payload, dict) and "regions" in payload:
        return payload["regions"]
    looks_like_results = (
        isinstance(payload, list)
        and payload
        # Guard with isinstance: `"regions" in "some string"` would be a
        # substring test, not a key lookup.
        and isinstance(payload[0], dict)
        and "regions" in payload[0]
    )
    if looks_like_results:
        flat = []
        for entry in payload:
            if not isinstance(entry, dict):
                raise ValueError(
                    "expected every item to be an object with a 'regions' list"
                )
            flat.extend(entry.get("regions", []))
        return flat
    return payload


def score_gold(pred_json_text, gold_json_text):
    """Score pasted predictions against pasted gold annotations.

    Both inputs are JSON text and are normalised the same way (see
    ``_flatten_regions``), so either side may be a flat list of regions, a
    single ``{"regions": [...]}`` object, or a list of such objects.

    Args:
        pred_json_text: JSON text holding the predicted regions.
        gold_json_text: JSON text holding the gold regions.

    Returns:
        Markdown text: an overall precision/recall/F1 heading followed by a
        per-label table, or a single line starting with "⚠️" when the inputs
        are missing or cannot be parsed.
    """
    # An empty editor would otherwise surface as a cryptic decoder message.
    if _is_blank(pred_json_text) or _is_blank(gold_json_text):
        return "⚠️ Paste both the predictions JSON and the gold JSON before computing metrics."

    try:
        regions = _flatten_regions(json.loads(pred_json_text))
        gold = _flatten_regions(json.loads(gold_json_text))
    except (TypeError, ValueError, RecursionError) as e:
        # json.JSONDecodeError is a ValueError subclass.
        return f"⚠️ JSON parse error: {e}"

    # Imported only once the inputs are known to be usable.
    from prelabeler import score_against_gold
    m = score_against_gold(regions, gold)
    ov = m["overall"]
    lines = [
        f"## Overall: P={ov['precision']:.1%}  R={ov['recall']:.1%}  F1={ov['f1']:.1%}",
        "",
        "| Label | Precision | Recall | F1 |",
        "|---|---|---|---|",
    ]
    for lbl in LABELS:
        d = m["per_label"].get(lbl, {})
        precision = d.get("precision", 0)
        recall = d.get("recall", 0)
        f1 = d.get("f1", 0)
        # Only labels with a non-zero F1 or precision get a row.
        if f1 > 0 or precision > 0:
            lines.append(f"| {lbl} | {precision:.1%} | {recall:.1%} | {f1:.1%} |")
    return "\n".join(lines)


# Build the UI: one Blocks app with three tabs (pre-labeling, scoring
# against gold, and a static label guide). Components are created in the
# order they appear inside the nested `with` contexts.
with gr.Blocks(
    title="PreLabel · Document Layout Annotator",
    theme=gr.themes.Soft(primary_hue="blue"),
) as demo:

    # Page header shown above the tabs.
    gr.Markdown("""
# 🏷️ PreLabel — Document Layout Pre-Labeling
**iMerit Annotation Pipeline** · 10 label classes · Math · Newspaper · Presentation · Research

Automatically pre-labels PDF regions as: `Text` · `Equation` · `Title` · `Section Header` · `Table` · `Caption` · `Figure` · `Page Number` · `Footer` · `Page-footer`
    """)

    with gr.Tabs():
        # Tab 1: upload PDFs, pick a document type, run the pre-labeler.
        with gr.Tab("📄 Pre-Label PDFs"):
            with gr.Row():
                # Left column: inputs and the run button.
                with gr.Column(scale=1):
                    pdf_input = gr.File(
                        label="Upload PDF files",
                        file_types=[".pdf"],
                        file_count="multiple",
                    )
                    doc_type_input = gr.Dropdown(
                        choices=DOC_TYPES,
                        value="auto",
                        label="Document type",
                        info="'auto' detects from filename. Override if needed."
                    )
                    run_btn = gr.Button("▶ Run Pre-Labeling", variant="primary", size="lg")
                # Right column: Markdown summary returned by run_prelabel.
                with gr.Column(scale=2):
                    summary_out = gr.Markdown(value="*Results will appear here after running.*")
            gr.Markdown("### 📥 JSON output")
            json_out = gr.Code(label="Pre-label JSON", language="json", lines=20)
            # run_prelabel returns (summary, json text), matching the two outputs.
            run_btn.click(fn=run_prelabel, inputs=[pdf_input, doc_type_input], outputs=[summary_out, json_out])

        # Tab 2: paste predictions and gold JSON, get precision/recall/F1.
        with gr.Tab("📊 Score Against Gold"):
            gr.Markdown("""
Paste your pre-label JSON and gold JSON to compute Precision / Recall / F1.

**Gold JSON format:**
```json
[{"label": "Text", "bbox": {"x0":72,"y0":110,"x1":540,"y1":200}, "page": 1}]
```
            """)
            with gr.Row():
                pred_input = gr.Code(label="Predictions JSON", language="json", lines=12)
                gold_input = gr.Code(label="Gold JSON", language="json", lines=12)
            score_btn = gr.Button("📊 Compute Metrics", variant="primary")
            metrics_out = gr.Markdown()
            # score_gold returns Markdown text, shown in metrics_out.
            score_btn.click(fn=score_gold, inputs=[pred_input, gold_input], outputs=metrics_out)

        # Tab 3: static reference text describing the label schema.
        with gr.Tab("📖 Label Guide"):
            gr.Markdown("""
## Label schema

| Label | Description |
|---|---|
| **Text** | Body text, paragraphs, bullet points, list items |
| **Equation** | Standalone math formulas (inline equations in math/research docs are ignored) |
| **Title** | Document or article title |
| **Section Header** | Section/subsection headings |
| **Table** | Data tables (not captions) |
| **Caption** | Figure/table captions — starts with Fig., Table., etc. |
| **Figure** | Images, charts, diagrams, illustrations |
| **Page Number** | Standalone page numbers |
| **Footer** | Copyright lines, source notes at bottom |
| **Page-footer** | Running footer with page number / journal reference |

## Guidelines applied
- **Math & Research:** inline equations inside text are not labeled — only standalone equations
- **Newspapers:** multiple paragraphs in the same column/section can share one bounding box
- **Footer vs Page-footer:** these are two distinct labels, never merged
            """)

# NOTE(review): launch() is called unconditionally at module level (there is
# no `if __name__ == "__main__":` guard), so importing this module also
# launches the app.
demo.launch()