"""Long-text summarizer + AI study assistant (Gradio app, CPU-only).

Pipeline: optional PDF extraction -> heading heuristics -> token-aware
chunking -> per-chunk DistilBART summaries -> flan-t5 study tips ->
downloadable .txt result.
"""

import re
import tempfile

import gradio as gr
from PyPDF2 import PdfReader
from transformers import AutoTokenizer, pipeline

# =========================
# Model setup (CPU-safe)
# =========================
# Use smaller, faster models to speed up processing on free CPU hardware.
MODEL_NAME = "sshleifer/distilbart-cnn-6-6"  # Smaller than 12-6, faster on CPU

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
summarizer = pipeline(
    "summarization",
    model=MODEL_NAME,
    tokenizer=tokenizer,
    device=-1,  # CPU only
)

# Smaller flan-t5-small for faster advice generation.
advice_generator = pipeline(
    "text2text-generation",
    model="google/flan-t5-small",
    device=-1,  # CPU only
)

CHUNK_SIZE = 900  # safe margin under the model's typical max input length


# =========================
# Utilities
# =========================
def clean_text(text: str) -> str:
    """Normalize quotes/spacing, collapse repeated punctuation, and
    drop duplicate sentences (case-insensitive) from *text*."""
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    text = text.replace("\u201c", '"').replace("\u201d", '"')
    text = re.sub(r"[.]{2,}", ".", text)
    text = re.sub(r"[']{2,}", "'", text)
    text = re.sub(r"\s+", " ", text)

    # De-duplicate sentences: summarization models sometimes repeat themselves.
    sentences = re.split(r"(?<=[.!?])\s+", text)
    seen = set()
    result = []
    for s in sentences:
        key = s.strip().lower()
        if key and key not in seen:
            seen.add(key)
            result.append(s.strip())
    return " ".join(result)


def chunk_text(text: str) -> list:
    """Split *text* into token-bounded chunks so no chunk overflows the
    summarization model's input limit.

    Returns a list of decoded text chunks (may be empty for empty input).
    """
    tokens = tokenizer.encode(text, add_special_tokens=False)
    chunks = []
    for i in range(0, len(tokens), CHUNK_SIZE):
        window = tokens[i:i + CHUNK_SIZE]
        # NOTE: decoded local renamed — the original shadowed the function name.
        decoded = tokenizer.decode(window, skip_special_tokens=True)
        chunks.append(decoded)
    return chunks


def generate_ai_advice(summary: str) -> str:
    """Generate a Markdown section with up to 5 study tips based on *summary*.

    The summary is truncated to 1000 chars to keep the prompt inside
    flan-t5-small's context window.
    """
    truncated_summary = summary[:1000]
    prompt = (
        f"Read this summary of a technical paper: '{truncated_summary}'\n\n"
        "Generate exactly 5 practical study tips for a student to better understand and retain this content. "
        "Focus on active learning techniques, like practice, visualization, or connections to real-world applications. "
        "Make each tip start with a verb (e.g., 'Review...', 'Apply...') and keep them concise. "
        "Output only the 5 tips as bullet points, nothing else."
    )

    # do_sample=False => greedy decoding; temperature would be ignored, so
    # it is intentionally not passed (the original passed both, which only
    # triggers a transformers warning).
    generated = advice_generator(
        prompt,
        max_length=250,
        num_return_sequences=1,
        do_sample=False,
    )[0]["generated_text"]

    # Parse into tips with a deliberate fallback cascade:
    # 1) lines that look like bullets, 2) any nonempty lines, 3) sentences.
    # (The original's condition `startswith('-') or tip.strip()` accepted
    # every nonempty line, making the bullet check dead code.)
    lines = [ln.strip() for ln in generated.split("\n")]
    tips = [ln for ln in lines if ln.startswith("-")]
    if len(tips) < 3:
        tips = [ln for ln in lines if ln]
    if len(tips) < 3:
        tips = [s.strip() for s in generated.split(".") if s.strip()]

    advice_md = "\n\n---\n\n### \U0001F4DA AI-Generated Study Tips\n\n"
    for tip in tips[:5]:
        clean_tip = tip.lstrip("- ").strip()
        advice_md += f"- {clean_tip}\n"
    advice_md += (
        "\n**Pro tip**: Combine these with spaced repetition (Anki / Quizlet) "
        "for long-term retention!"
    )
    return advice_md


def extract_possible_headings(text: str) -> str:
    """Heuristically extract candidate titles/subtitles from raw text.

    A line is considered a heading when it is short (< 80 chars) and is
    either ALL CAPS, starts with a section number ("1. ..."), or looks like
    Title Case ("Word Another").  Returns a Markdown section, or "" when
    nothing matched.
    """
    headings = []
    for line in text.split("\n"):
        stripped = line.strip()
        if not stripped or len(stripped) >= 80:
            continue
        if (
            stripped.isupper()
            or re.match(r"^\d+\.?\s", stripped)
            or re.match(r"^[A-Z][a-z]+\s[A-Z]", stripped)
        ):
            headings.append(stripped)
    if headings:
        return (
            "### Extracted Possible Headings/Subtitles\n\n"
            + "\n- ".join([""] + headings)
            + "\n\n---\n\n"
        )
    return ""


def summarize_long_text(text: str, progress=gr.Progress()) -> str:
    """Summarize long *text* chunk-by-chunk and append AI study advice.

    Returns Markdown: optional headings section + bullet-point chunk
    summaries + generated study tips.
    """
    if not text or not text.strip():
        return "No text provided."

    progress(0, desc="Extracting headings...")
    headings_section = extract_possible_headings(text)

    progress(0.1, desc="Chunking text...")
    chunks = chunk_text(text)
    summaries = []

    progress(0.2, desc="Summarizing chunks...")
    for i in progress.tqdm(range(len(chunks))):
        chunk = chunks[i]
        try:
            summary = summarizer(
                chunk,
                max_length=200,  # compromise between summary length and CPU time
                min_length=60,
                do_sample=False,
            )[0]["summary_text"]
            cleaned = clean_text(summary)
            summaries.append(f"**Chunk {i+1} Summary:** {cleaned}")
        except Exception:
            # Best-effort: a failing chunk must not abort the whole run,
            # but the gap is now recorded instead of silently dropped.
            summaries.append(
                f"**Chunk {i+1} Summary:** (skipped — summarization failed)"
            )

    summary_md = "### Detailed Summary (in Bullet Points)\n\n"
    for s in summaries:
        summary_md += f"- {s}\n"

    progress(0.8, desc="Generating AI advice...")
    # The bulleted summary (not the raw text) seeds the advice generation.
    ai_advice = generate_ai_advice(summary_md)

    progress(1, desc="Done!")
    return headings_section + summary_md + ai_advice


def read_pdf(file) -> str:
    """Extract text from a PDF, returning an error string on failure
    (callers treat the result as plain text either way)."""
    try:
        reader = PdfReader(file)
        pages = [page.extract_text() or "" for page in reader.pages]
        # Join with newlines to preserve line breaks for heading detection.
        return "\n".join(pages)
    except Exception as e:
        return f"PDF read error: {str(e)}"


# =========================
# Download helper
# =========================
def create_download_file(content: str) -> str:
    """Write *content* to a temp .txt file and return its path for the
    Gradio file-download component.  delete=False so Gradio can serve it."""
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".txt", mode="w", encoding="utf-8"
    ) as tmp:
        tmp.write(content)
        return tmp.name


# =========================
# Main handler
# =========================
def process_input(text: str, file, progress=gr.Progress()):
    """Gradio click handler: prefer the uploaded PDF, fall back to pasted
    text.  Returns (markdown result, download path | None)."""
    input_text = ""
    progress(0, desc="Reading input...")

    if file is not None:
        input_text = read_pdf(file)
    elif text and text.strip():
        # Guard against text=None (the original crashed on None.strip()).
        input_text = text
    else:
        return "Please paste some text or upload a PDF.", None

    result = summarize_long_text(input_text, progress)
    download_path = create_download_file(result)
    return result, download_path


# =========================
# Gradio UI
# =========================
with gr.Blocks() as demo:
    gr.Markdown("# \U0001F4C4 Long Text Summarizer + AI Study Assistant")
    gr.Markdown(
        "\u2022 Handles very long documents (thousands of words)\n"
        "\u2022 Supports **PDF** upload or direct paste\n"
        "\u2022 Runs on CPU \u2013 works on free hardware\n"
        "\u2022 Gives you **longer, bullet-point summaries** with possible headings/subtitles\n"
        "\u2022 Includes **5 AI-generated study tips** tailored to the content\n"
        "\u2022 Download result as .txt file\n"
        "**Note**: Processing may take time for long documents on CPU (initial model load + inference). Please be patient!"
    )

    with gr.Row():
        text_input = gr.Textbox(
            lines=10,
            label="Paste your text here (optional)",
            placeholder="Paste lecture notes, article, book chapter...",
        )
        file_input = gr.File(
            label="Or upload a PDF",
            file_types=[".pdf"],
        )

    summarize_btn = gr.Button("Summarize & Get Study Tips", variant="primary")
    output = gr.Textbox(
        lines=16,
        label="Summary + AI-generated study advice",
        interactive=False,
    )
    download_output = gr.File(
        label="Download full result (.txt)",
        interactive=False,
    )

    summarize_btn.click(
        fn=process_input,
        inputs=[text_input, file_input],
        outputs=[output, download_output],
    )

# Launch only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()