"""Gradio app: compare an old vs. new course handout (PDFs) against a list of
learning outcomes (LOs) using TF-IDF cosine similarity, and report per-LO
alignment, overall content change, and word-count drift."""

import io
import re

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of *pdf_file* (raw PDF bytes).

    Returns an empty string if the PDF cannot be parsed or yields no text
    (e.g. a scanned/image-only document).
    """
    try:
        reader = PdfReader(io.BytesIO(pdf_file))
        full_text = ""
        for page in reader.pages:
            text = page.extract_text()
            # extract_text() may return None/"" for image-only pages.
            if text:
                full_text += text
        return full_text.strip()
    except Exception as e:  # PyPDF2 raises assorted parse errors on bad PDFs.
        print("PDF extraction error:", e)
        return ""


def semantic_match(lo_list, content):
    """Score each learning outcome in *lo_list* against *content*.

    Fits a TF-IDF model over the content plus all non-blank LOs, then returns
    one cosine-similarity score (0..1) per LO, in input order.
    """
    lo_texts = [lo for lo in lo_list if lo.strip()]
    if not lo_texts:
        return []
    # One vectorized similarity call on the sparse matrix instead of a
    # per-LO Python loop over densified rows.
    tfidf_matrix = TfidfVectorizer().fit_transform([content] + lo_texts)
    scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]
    return list(scores)


def compare_all(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs against a learning-outcomes text file.

    Args:
        old_pdf: raw bytes of the old handout PDF.
        new_pdf: raw bytes of the new handout PDF.
        lo_file: raw bytes of a UTF-8 text file, one learning outcome per line.

    Returns:
        (summary_markdown, dataframe, matplotlib_figure, status_message); on
        failure the first element is an error string and the rest are None.
    """
    try:
        los = lo_file.decode("utf-8", errors="ignore").splitlines()
        los = [lo.strip() for lo in los if lo.strip()]
    except Exception:
        return "❌ Could not read learning outcomes file.", None, None, None

    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)
    if not old_text or not new_text:
        return "❌ Could not extract text from one or both PDFs.", None, None, None

    old_scores = semantic_match(los, old_text)
    new_scores = semantic_match(los, new_text)
    labels = [f"LO{i+1}" for i in range(len(los))]
    x = np.arange(len(labels))

    # Grouped bar chart: old vs. new match score per learning outcome.
    fig, ax = plt.subplots()
    ax.bar(x - 0.2, old_scores, width=0.4, label="Old", align='center')
    ax.bar(x + 0.2, new_scores, width=0.4, label="New", align='center')
    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=45)
    ax.set_ylabel("Semantic Match Score")
    ax.set_title("Learning Outcomes Comparison")
    ax.legend()

    # Per-LO comparison table (percentages).
    data = {
        "Learning Outcome": labels,
        "Old Match (%)": [round(s * 100, 2) for s in old_scores],
        "New Match (%)": [round(s * 100, 2) for s in new_scores],
        "Change (%)": [round((new - old) * 100, 2)
                       for new, old in zip(new_scores, old_scores)],
    }
    df = pd.DataFrame(data)

    # Overall content similarity between the two handouts.
    tfidf = TfidfVectorizer().fit_transform([old_text, new_text])
    cosine_sim = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0] * 100
    content_diff = 100 - round(cosine_sim, 2)

    # Word-count drift; guard against a punctuation-only old handout
    # (non-empty text but zero \w+ tokens) to avoid ZeroDivisionError.
    len_old = len(re.findall(r'\w+', old_text))
    len_new = len(re.findall(r'\w+', new_text))
    if len_old:
        word_change_percent = round(((len_new - len_old) / len_old) * 100, 2)
    else:
        word_change_percent = 0.0

    summary = f"""
📘 **Summary of Comparison**

📈 **Overall Content Change**: {content_diff:.2f}%
🔍 This is based on TF-IDF cosine similarity between old and new handouts.

📝 **Text Length Difference**: {'+' if word_change_percent >= 0 else ''}{word_change_percent:.2f}%
Compared by total number of words in both handouts.

đŸŽ¯ **Learning Outcome Matches**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}
✅ New handout appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with stated outcomes.
"""
    return summary.strip(), df, fig, "✅ Comparison completed successfully."


iface = gr.Interface(
    fn=compare_all,
    inputs=[
        gr.File(label="Old Handout PDF", type='binary'),
        gr.File(label="New Handout PDF", type='binary'),
        gr.File(label="Learning Outcomes (Text File)", type='binary'),
    ],
    outputs=[
        gr.Textbox(label="📘 Summary & Insights", lines=20, max_lines=25),
        gr.Dataframe(label="📊 LO-wise Comparison Table"),
        gr.Plot(label="📈 LO Visual Comparison"),
        gr.Textbox(label="â„šī¸ Status", lines=1)
    ],
    title="📘 Handout Comparator + LO Analyzer",
    description="Upload OLD and NEW handouts in PDF format along with a TXT file of Learning Outcomes. The app compares content changes and evaluates alignment with LOs visually and in table format."
)

iface.launch()