from docx import Document
import pytesseract
from PIL import Image
import fitz
import gradio as gr
import threading
import pathlib
import os

# --------------------------------------------------
# TOKEN RESOLUTION
# --------------------------------------------------


def resolve_token(ui_token):
    """Resolve the Hugging Face token to use for model downloads.

    Precedence: non-blank UI value, then the ``hgface_tok`` environment
    variable, then empty string (anonymous access).
    """
    # Guard against None — Gradio may pass no value for an untouched field.
    if ui_token and ui_token.strip():
        return ui_token.strip()
    env_token = os.getenv("hgface_tok")
    if env_token:
        return env_token.strip()
    return ""


# --------------------------------------------------
# FILE TEXT EXTRACTION
# --------------------------------------------------

SUPPORTED_EXT = (
    ".pdf", ".docx", ".txt",
    ".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tiff",
)


def extract_text_from_file(filepath):
    """Extract plain text from an uploaded file.

    Accepts a filesystem path or a Gradio file object (anything with a
    ``.name`` attribute). Supports PDF (PyMuPDF), DOCX (python-docx),
    TXT, and common image formats via Tesseract OCR.

    Returns the extracted text, or a human-readable error string on
    failure — callers treat the return value as display text either way.
    """
    if not filepath:
        return ""
    # Gradio's File component hands us an object whose .name is the path.
    if hasattr(filepath, "name"):
        filepath = filepath.name
    ext = pathlib.Path(filepath).suffix.lower()
    try:
        if ext == ".pdf":
            # Context manager ensures the PyMuPDF document is closed
            # (the original leaked the handle on every call).
            with fitz.open(filepath) as doc:
                return "\n".join(page.get_text() for page in doc)
        elif ext == ".docx":
            doc = Document(filepath)
            return "\n".join(p.text for p in doc.paragraphs)
        elif ext == ".txt":
            with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
                return f.read()
        elif ext in (".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tiff"):
            try:
                img = Image.open(filepath)
                return pytesseract.image_to_string(img)
            except Exception as e:
                # OCR failures (e.g. tesseract binary missing) are reported
                # inline rather than raised, matching the rest of this helper.
                return "OCR failed: " + str(e)
        else:
            return "Unsupported file type: " + ext
    except Exception as e:
        return "Could not read file: " + str(e)


# --------------------------------------------------
# MODELS
# --------------------------------------------------

# Display label -> Hugging Face model id. Labels include approximate
# download size and an intended-use hint for the dropdown.
MODELS = {
    "Gemma 3 270M [0.6GB | Lightning-fast Edge]": "google/gemma-3-270m-it",
    "Qwen 3 0.6B GGUF [0.5GB | Classroom Assistant]": "Qwen/Qwen3-0.6B-GGUF",
    "TinyLlama 1.1B [0.5GB]": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "Qwen 3.5 2B [2.4GB | The Student Tutor]": "Qwen/Qwen3.5-2B",
    "Phi-4 Mini [1.8GB | Logical Powerhouse]": "microsoft/Phi-4-mini-instruct",
    "Gemma 3 1B [2.1GB | Stable & Coherent]": "google/gemma-3-1b-it",
    "Qwen 3.5 9B [7.8GB | BEST FOR LESSON PLANS]": "Qwen/Qwen3.5-9B",
    "Llama 3.1 8B [5.2GB | Industry Standard]": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "Mistral Small 3 [7.1GB | Concise & Accurate]": "mistralai/Mistral-Small-3-Instruct",
    "Gemma 3 9B [6.3GB | Creative & Safe]": "google/gemma-3-9b-it",
    "Mistral Small 12B [9.5GB | Perfect VRAM Balance]": "mistralai/Mistral-Nemo-Instruct-2407",
    "Qwen 3.5 27B [18GB | Dense Curriculum Architect]": "Qwen/Qwen3.5-27B",
}

ALL_MODEL_NAMES = list(MODELS.keys())

# --------------------------------------------------
# PIPELINE CACHE
# --------------------------------------------------

# Cache of model_id -> transformers pipeline, guarded by a lock so
# concurrent Gradio requests don't load the same model twice.
_pipeline_cache = {}
_pipeline_lock = threading.Lock()


def get_pipeline(model_id, hf_token):
    """Return ``(pipeline, error)`` for *model_id*, loading and caching it.

    On success the second element is None; on load failure the first is
    None and the second is the error message. Heavy transformers imports
    are deferred to first use so the UI starts quickly.
    """
    from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

    with _pipeline_lock:
        if model_id not in _pipeline_cache:
            try:
                kwargs = {"trust_remote_code": True}
                if hf_token:
                    kwargs["token"] = hf_token
                tokenizer = AutoTokenizer.from_pretrained(model_id, **kwargs)
                # device_map="cpu" keeps loading deterministic on hosts
                # without a GPU.
                model = AutoModelForCausalLM.from_pretrained(
                    model_id,
                    device_map="cpu",
                    **kwargs,
                )
                pipe = pipeline(
                    "text-generation",
                    model=model,
                    tokenizer=tokenizer,
                )
                _pipeline_cache[model_id] = pipe
            except Exception as e:
                return None, str(e)
        return _pipeline_cache[model_id], None


# --------------------------------------------------
# INFERENCE
# --------------------------------------------------

SYSTEM_MSG = "You are an expert educational assistant. Use markdown."
def ask_llm(model_label, prompt, hf_token=""):
    """Run one generation against the model selected by *model_label*.

    Prepends SYSTEM_MSG to the prompt, strips the echoed prompt from the
    pipeline output, and returns the completion. Load or inference
    failures are returned as readable error strings (never raised) so
    they can be shown directly in the UI.
    """
    token = resolve_token(hf_token)
    model_id = MODELS[model_label]
    pipe, err = get_pipeline(model_id, token)
    if err:
        return "Model load error:\n" + err
    try:
        combined = SYSTEM_MSG + "\n\n" + prompt
        out = pipe(
            combined,
            max_new_tokens=2048,
            do_sample=True,
            temperature=0.6,
            top_p=0.9,
            repetition_penalty=1.15,
            no_repeat_ngram_size=3,
        )
        text = out[0]["generated_text"]
        # text-generation pipelines echo the prompt; drop it if present.
        if text.startswith(combined):
            text = text[len(combined):]
        return text.strip()
    except Exception as e:
        return "Inference error:\n" + str(e)


# --------------------------------------------------
# PROMPTS
# --------------------------------------------------


def make_prompts(topic):
    """Build the four generation prompts (lesson, qa, mcq, summary) for *topic*."""
    return {
        "lesson": "Create a structured lesson plan for classroom teaching.\n"
                  "Include:\n"
                  "- Learning objectives\n"
                  "- Introduction\n"
                  "- Concept explanation\n"
                  "- Examples\n"
                  "- Case study\n"
                  "- Classroom activity\n"
                  "- Assessment\n\n"
                  "Topic:\n" + topic,
        "qa": "Generate 10 exam questions with answers.\n\nTopic:\n" + topic,
        "mcq": "Generate 10 MCQs with 4 options and answers.\n\nTopic:\n" + topic,
        "summary": "Summarize the topic in 250-300 words.\n\nTopic:\n" + topic,
    }


def generate_content(text, file, model_label, token):
    """Generator wired to the Generate button.

    Yields 4-tuples matching the four output tabs (lesson, qa, mcq,
    summary): first a row of placeholders, then one updated row after
    each section finishes. The original yielded 5-tuples against 4
    Gradio outputs, which errored at runtime.
    """
    file_text = extract_text_from_file(file) if file else ""
    # text may be None if the textbox was never touched.
    syllabus = ((text or "") + "\n\n" + file_text).strip()
    if not syllabus:
        yield ("Provide topic or file", "", "", "")
        return
    prompts = make_prompts(syllabus)
    WAIT = "Generating..."
    order = ["lesson", "qa", "mcq", "summary"]
    # One slot per output tab, filled in sequentially.
    results = [WAIT] * len(order)
    yield tuple(results)
    for i, key in enumerate(order):
        results[i] = ask_llm(model_label, prompts[key], token)
        yield tuple(results)


# --------------------------------------------------
# UI
# --------------------------------------------------

CSS = """
body,.gradio-container{
    font-family:Inter,sans-serif!important;
}
"""

# theme and css are gr.Blocks constructor parameters, not launch()
# parameters — passing them to launch() raises TypeError.
with gr.Blocks(
    theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple"),
    css=CSS,
) as demo:
    gr.Markdown("# 🎓 AI Study Material Generator")

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                placeholder="Paste syllabus or topic",
                lines=6,
            )
            file_input = gr.File(
                label="Upload syllabus file",
            )
        with gr.Column():
            model_selector = gr.Dropdown(
                choices=ALL_MODEL_NAMES,
                value=ALL_MODEL_NAMES[0],
                label="Model",
            )
            token_box = gr.Textbox(
                label="HF Token (optional)",
                type="password",
            )
            btn = gr.Button("Generate")

    with gr.Tabs():
        with gr.TabItem("Lesson Plan"):
            lesson = gr.Markdown()
        with gr.TabItem("Q&A"):
            qa = gr.Markdown()
        with gr.TabItem("MCQ"):
            mcq = gr.Markdown()
        with gr.TabItem("Summary"):
            summary = gr.Markdown()

    btn.click(
        fn=generate_content,
        inputs=[text_input, file_input, model_selector, token_box],
        outputs=[lesson, qa, mcq, summary],
    )

demo.launch()