# =========================================================
# app.py (STABLE VERSION - FIXED TOKEN OVERFLOW)
# =========================================================

import gradio as gr
from transformers import pipeline
from pypdf import PdfReader
from pdf2image import convert_from_path
import pytesseract
import tempfile

# =========================================================
# Models
# =========================================================

# Maps the caption shown in the model dropdowns to the model identifier that is
# handed to the classification pipeline. Each caption embeds the identifier it
# maps to, so both are derived from a single (language, identifier) pair.
MODELS = {
    f"{language} model ({model_id})": model_id
    for language, model_id in (
        ("English", "ubffm/academic_text_classifier_en"),
        ("German", "ubffm/academic_text_classifier_de"),
    )
}

# Caption used as the initial value of the model dropdowns; must be a key of MODELS.
DEFAULT_MODEL = "English model (ubffm/academic_text_classifier_en)"

# =========================================================
# Labels
# =========================================================

# Labels offered in the "noise" checkbox groups of the UI.
# NOTE(review): presumably these match the label names the classifiers emit --
# verify against the model configuration.
LABELS = [
    "OUT OF SCOPE",
    "MAIN TEXT",
    "EXAMPLE",
    "REFERENCE",
]

# Labels ticked by default, i.e. lines predicted as one of these are dropped
# from the filtered output unless the user changes the selection.
DEFAULT_NOISE = [
    "OUT OF SCOPE",
    "REFERENCE",
]

# =========================================================
# Pipeline cache
# =========================================================

# Process-wide cache of already constructed pipelines, keyed by model identifier.
PIPELINES = {}

def get_classifier(model_display_name):
    """Return the text-classification pipeline for a dropdown caption.

    The caption is translated to a model identifier through ``MODELS``. The
    pipeline for that identifier is built on first use and then served from
    the ``PIPELINES`` cache on every later call.

    Args:
        model_display_name: A key of ``MODELS``.

    Returns:
        The cached pipeline object for the selected model.

    Raises:
        KeyError: If ``model_display_name`` is not a key of ``MODELS``.
    """
    checkpoint = MODELS[model_display_name]

    try:
        return PIPELINES[checkpoint]
    except KeyError:
        pass  # not built yet -- construct it below

    # NOTE(review): recent transformers releases deprecate return_all_scores in
    # favour of top_k=None -- confirm against the pinned version before changing.
    loaded = pipeline(
        "text-classification",
        model=checkpoint,
        tokenizer=checkpoint,
        return_all_scores=True
    )
    PIPELINES[checkpoint] = loaded

    return loaded

# =========================================================
# Safe prediction (IMPORTANT FIX)
# =========================================================

def get_best_prediction(classifier, text, max_length=512):
    """Classify *text* and return the top-scoring entry plus all scores.

    Args:
        classifier: Callable text-classification pipeline. It is invoked as
            ``classifier(text, truncation=True, max_length=max_length)`` and
            must return either a list of ``{"label": ..., "score": ...}``
            dicts or such a list wrapped in an outer list.
        text: The string to classify. Only its first 2000 characters are used.
        max_length: Token limit forwarded to the pipeline's tokenizer so that
            over-long input is truncated instead of overflowing the model.

    Returns:
        A ``(best, scores)`` tuple: ``best`` is the dict with the highest
        ``"score"`` and ``scores`` is the full list of score dicts.

    Raises:
        ValueError: If the classifier returns no scores at all.
    """
    # Cheap character cap so a pathological line is never tokenized in full.
    text = text[:2000]

    # A character cap does not bound the number of tokens (2000 characters
    # can tokenize to more than the model window), so the tokenizer is asked
    # to truncate as well.
    result = classifier(text, truncation=True, max_length=max_length)

    # Unwrap the nested [[...]] shape; the flat [...] shape is used as-is.
    if isinstance(result, list) and result and isinstance(result[0], list):
        result = result[0]

    return max(result, key=lambda x: x["score"]), result

# =========================================================
# SAFE CHUNKING (ROBUST FIX)
# =========================================================

def safe_chunk_text(text, tokenizer, max_tokens=480):
    """Split *text* into ordered chunks of at most *max_tokens* tokens each.

    The text is split on ``"\\n"``. Consecutive lines are grouped into one
    chunk (re-joined with ``"\\n"``) while the sum of their token counts stays
    within ``max_tokens``. A single line that is longer than ``max_tokens``
    is cut into token windows of ``max_tokens`` and each window is decoded
    back to text as its own chunk.

    Chunks are always emitted in document order: lines that are still pending
    are flushed before the pieces of an oversized line are appended.

    Note:
        Token counts are summed per line, so the newline joiners are not
        counted. The default of 480 is meant to leave headroom (the original
        author's note mentions room for XLM-R special tokens).

    Args:
        text: The full input text.
        tokenizer: Object providing ``encode(str, add_special_tokens=False)``
            returning a sliceable token sequence, and ``decode(tokens)``
            returning a string.
        max_tokens: Maximum number of tokens per chunk; must be >= 1.

    Returns:
        A list of chunk strings.

    Raises:
        ValueError: If ``max_tokens`` is smaller than 1.
    """
    if max_tokens < 1:
        raise ValueError("max_tokens must be a positive integer")

    chunks = []
    current = []
    current_len = 0

    for sent in text.split("\n"):

        sent_tokens = tokenizer.encode(sent, add_special_tokens=False)
        sent_len = len(sent_tokens)

        # Case 1: a single line exceeds the budget -> hard split by tokens.
        if sent_len > max_tokens:
            # Flush pending lines first, otherwise the pieces of this line
            # would be emitted ahead of text that precedes it.
            if current:
                chunks.append("\n".join(current))
                current = []
                current_len = 0

            chunks.extend(
                tokenizer.decode(sent_tokens[i:i + max_tokens])
                for i in range(0, sent_len, max_tokens)
            )
            continue

        # Case 2: adding this line would overflow the current chunk.
        if current_len + sent_len > max_tokens:
            chunks.append("\n".join(current))
            current = [sent]
            current_len = sent_len
        else:
            current.append(sent)
            current_len += sent_len

    if current:
        chunks.append("\n".join(current))

    return chunks

# =========================================================
# Empty line cleanup
# =========================================================

def normalize_empty_lines(lines):
    """Collapse every run of blank lines down to its first member.

    A line counts as blank when it is empty or whitespace-only. Non-blank
    lines and the first blank line of each run are kept unchanged and in
    their original order.

    Args:
        lines: Iterable of strings.

    Returns:
        A new list with consecutive blank lines reduced to one.
    """
    result = []
    in_blank_run = False

    for entry in lines:
        is_blank = not entry.strip()

        # Keep everything except a blank line that follows another blank line.
        if not (is_blank and in_blank_run):
            result.append(entry)
            in_blank_run = is_blank

    return result

# =========================================================
# CORE PIPELINE (FIXED)
# =========================================================

def process_text_input(text, noise_labels, selected_model):
    """Classify *text* line by line and drop lines labelled as noise.

    The text is chunked with ``safe_chunk_text``, each chunk is split back
    into lines, and every non-blank line is classified individually. Lines
    whose predicted label is in ``noise_labels`` are removed; blank lines are
    kept (runs of them are collapsed afterwards). The filtered text is also
    written to a temporary ``.txt`` file that is NOT deleted automatically.

    Args:
        text: Raw input text. ``None`` or whitespace-only input short-circuits.
        noise_labels: Collection of label names to remove; ``None`` is
            treated as "remove nothing".
        selected_model: Key of ``MODELS`` choosing the classifier.

    Returns:
        A 4-tuple ``(log, filtered_text, stats, file_path)``. For missing or
        blank input this is ``("", "", "", None)``.
    """
    # Treat a missing value like blank text instead of failing on ``.strip()``.
    if not text or not text.strip():
        return "", "", "", None

    classifier = get_classifier(selected_model)
    chunks = safe_chunk_text(text, classifier.tokenizer)

    # Set membership is O(1) per line and tolerates ``noise_labels`` being None.
    noise = set(noise_labels or ())

    logs = []
    kept = []
    removed_count = 0
    line_counter = 0

    for c_id, chunk in enumerate(chunks):
        for line in chunk.splitlines():

            line_counter += 1

            # Blank lines are never classified; they are kept as separators.
            if not line.strip():
                kept.append("")
                continue

            pred, _ = get_best_prediction(classifier, line)

            logs.append(
                f"[Chunk {c_id}] Line {line_counter} | "
                f"{pred['label']} ({pred['score']:.4f})\n{line}\n"
            )

            if pred["label"] in noise:
                removed_count += 1
            else:
                kept.append(line)

    kept = normalize_empty_lines(kept)
    filtered = "\n".join(kept)

    # delete=False: the path is returned to the caller, so the file must
    # outlive this function. The context manager still guarantees the handle
    # is closed even if the write fails.
    with tempfile.NamedTemporaryFile(
        delete=False,
        suffix=".txt",
        mode="w",
        encoding="utf-8"
    ) as tmp:
        tmp.write(filtered)

    stats = (
        f"Chunks: {len(chunks)}\n"
        f"Total lines: {line_counter}\n"
        f"Removed: {removed_count}\n"
        f"Remaining: {len(kept)}"
    )

    return "\n".join(logs), filtered, stats, tmp.name

# =========================================================
# TXT FILE
# =========================================================

def process_document_file(file, noise_labels, selected_model):
    """Read an uploaded text file and run it through ``process_text_input``.

    Args:
        file: The upload, either an object exposing the file path as
            ``.name`` or a plain path string; ``None`` when nothing was
            uploaded.
        noise_labels: Label names to remove (forwarded unchanged).
        selected_model: Key of ``MODELS`` (forwarded unchanged).

    Returns:
        The 4-tuple produced by ``process_text_input``, or
        ``("", "", "", None)`` when ``file`` is ``None``.
    """
    if file is None:
        return "", "", "", None

    # Accept both an upload object with ``.name`` and a bare path string.
    path = getattr(file, "name", file)

    # "utf-8-sig" strips a leading byte-order mark so it does not end up in
    # the first classified line; errors="replace" keeps a file containing a
    # few non-UTF-8 bytes processable instead of aborting the whole request.
    with open(path, "r", encoding="utf-8-sig", errors="replace") as f:
        text = f.read()

    return process_text_input(text, noise_labels, selected_model)

# =========================================================
# PDF EXTRACTION (DIGITAL + OCR)
# =========================================================

def extract_text_from_pdf(pdf_file):
    """Extract the text of a PDF, falling back to OCR when none is embedded.

    Step 1 reads the embedded text layer page by page with ``PdfReader``.
    Step 2 runs only when step 1 produced no text at all: every page is
    rendered at 300 dpi and passed through ``pytesseract.image_to_string``.

    Args:
        pdf_file: Either an object exposing the file path as ``.name`` or a
            plain path string.

    Returns:
        The extracted text; pages are joined with ``"\\n"``.
    """
    import logging  # local import keeps this block self-contained

    # Accept both an upload object with ``.name`` and a bare path string.
    path = getattr(pdf_file, "name", pdf_file)

    text_parts = []

    # 1. try digital extraction
    try:
        reader = PdfReader(path)

        for page in reader.pages:
            t = page.extract_text()
            if t:
                text_parts.append(t)

    except Exception:
        # Deliberate best effort: a broken text layer must not abort the
        # request. Text gathered before the failure is kept, and OCR runs
        # below if nothing was gathered. Unlike a bare ``except``, this lets
        # KeyboardInterrupt/SystemExit propagate and records the failure.
        logging.getLogger(__name__).warning(
            "Digital text extraction failed for %s", path, exc_info=True
        )

    text = "\n".join(text_parts).strip()

    # 2. OCR fallback
    if not text:
        pages = convert_from_path(path, dpi=300)
        text = "\n".join(pytesseract.image_to_string(page) for page in pages)

    return text

# =========================================================
# PDF PIPELINE
# =========================================================

def process_pdf_file(file, noise_labels, selected_model):
    """Extract the text of an uploaded PDF and filter it.

    Args:
        file: The uploaded PDF as accepted by ``extract_text_from_pdf``, or
            ``None`` when nothing was uploaded.
        noise_labels: Label names to remove (forwarded unchanged).
        selected_model: Key of ``MODELS`` (forwarded unchanged).

    Returns:
        The 4-tuple produced by ``process_text_input``, or
        ``("", "", "", None)`` when ``file`` is ``None``.
    """
    if file is not None:
        extracted = extract_text_from_pdf(file)
        return process_text_input(extracted, noise_labels, selected_model)

    # Nothing uploaded: empty log, empty text, empty stats, no download file.
    return "", "", "", None

# =========================================================
# UI
# =========================================================

# Gradio UI: one tab per input source (pasted text, .txt upload, PDF upload).
# Every tab has the same shape -- model dropdown, input, noise-label checkboxes,
# a button, and four outputs that receive the handler's 4-tuple in order:
# log, filtered text, statistics, path of the downloadable result file.
# The short names (m, t, f, n, btn, o1..o4) are rebound in each tab; each
# click handler is wired before the names are reused.
with gr.Blocks(title="Stable Academic Text Filter") as demo:

    gr.Markdown("""
    # Academic Text Filter (FIXED VERSION)

    ✔ No tokenizer crashes  
    ✔ OCR + PDF support  
    ✔ Safe chunking (XLM-R compatible)  
    ✔ Robust long-document handling  
    """)

    # --- Tab 1: text pasted directly into a textbox -----------------------
    with gr.Tab("Text"):
        m = gr.Dropdown(list(MODELS.keys()), value=DEFAULT_MODEL)  # model choice
        t = gr.Textbox(lines=20)  # raw input text
        n = gr.CheckboxGroup(LABELS, value=DEFAULT_NOISE)  # labels to remove

        btn = gr.Button("Process")

        o1 = gr.Textbox(lines=15)  # per-line classification log
        o2 = gr.Textbox(lines=15)  # filtered text
        o3 = gr.Textbox()  # statistics
        o4 = gr.File()  # filtered text as a file

        # Input order matches process_text_input(text, noise_labels, selected_model).
        btn.click(process_text_input, [t, n, m], [o1, o2, o3, o4])

    # --- Tab 2: uploaded plain-text file -----------------------------------
    with gr.Tab("TXT"):
        m = gr.Dropdown(list(MODELS.keys()), value=DEFAULT_MODEL)  # model choice
        f = gr.File(file_types=[".txt"])  # upload restricted to .txt
        n = gr.CheckboxGroup(LABELS, value=DEFAULT_NOISE)  # labels to remove

        btn = gr.Button("Process")

        o1 = gr.Textbox(lines=15)  # per-line classification log
        o2 = gr.Textbox(lines=15)  # filtered text
        o3 = gr.Textbox()  # statistics
        o4 = gr.File()  # filtered text as a file

        # Input order matches process_document_file(file, noise_labels, selected_model).
        btn.click(process_document_file, [f, n, m], [o1, o2, o3, o4])

    # --- Tab 3: uploaded PDF (embedded text layer, OCR as fallback) --------
    with gr.Tab("PDF"):
        m = gr.Dropdown(list(MODELS.keys()), value=DEFAULT_MODEL)  # model choice
        f = gr.File(file_types=[".pdf"])  # upload restricted to .pdf
        n = gr.CheckboxGroup(LABELS, value=DEFAULT_NOISE)  # labels to remove

        btn = gr.Button("Process PDF")

        o1 = gr.Textbox(lines=15)  # per-line classification log
        o2 = gr.Textbox(lines=15)  # filtered text
        o3 = gr.Textbox()  # statistics
        o4 = gr.File()  # filtered text as a file

        # Input order matches process_pdf_file(file, noise_labels, selected_model).
        btn.click(process_pdf_file, [f, n, m], [o1, o2, o3, o4])

# Launch the Gradio app only when this file is executed as a script, so that
# importing the module (e.g. to reuse the helpers) does not start a server.
if __name__ == "__main__":
    demo.launch()