Deevyankar committed on
Commit
3c108e3
Β·
verified Β·
1 Parent(s): a4cd443

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -98
app.py CHANGED
@@ -1,117 +1,64 @@
1
 
2
 
3
  import gradio as gr
4
- from sklearn.feature_extraction.text import TfidfVectorizer
5
- from sklearn.metrics.pairwise import cosine_similarity
6
- from sentence_transformers import SentenceTransformer, util
7
  import fitz # PyMuPDF
8
- import docx
9
  import matplotlib.pyplot as plt
10
- import pandas as pd
 
11
 
12
- # Load transformer model
13
  model = SentenceTransformer("all-MiniLM-L6-v2")
14
 
15
- def extract_text_from_pdf(file):
16
- text = ""
17
  try:
18
- with fitz.open(stream=file, filetype="pdf") as doc:
 
19
  for page in doc:
20
  text += page.get_text()
 
21
  except Exception as e:
22
- print(f"Error extracting PDF text: {e}")
23
- return text
24
-
25
- def extract_text_from_docx(file):
26
- doc = docx.Document(file)
27
- return "\n".join([para.text for para in doc.paragraphs])
28
 
29
- def semantic_match(lo_texts, content):
30
- embeddings = model.encode([content] + lo_texts, convert_to_tensor=True)
31
- content_embedding = embeddings[0]
32
- lo_embeddings = embeddings[1:]
33
- similarities = util.pytorch_cos_sim(content_embedding, lo_embeddings)[0]
34
- return similarities.tolist()
35
 
36
- def compare_handouts(old_pdf, new_pdf, lo_file):
37
- old_text = extract_text_from_pdf(old_pdf)
38
- new_text = extract_text_from_pdf(new_pdf)
39
- if not old_text.strip() or not new_text.strip():
40
- return "❌ Could not extract text from one or both PDFs.", None, None
41
 
42
- lo_text = extract_text_from_docx(lo_file)
43
- lo_list = [line.strip() for line in lo_text.split("\n") if line.strip()]
44
- if not lo_list:
45
- return "⚠️ No learning outcomes detected in uploaded DOCX file.", None, None
46
 
47
- old_scores = semantic_match(lo_list, old_text)
48
- new_scores = semantic_match(lo_list, new_text)
49
 
50
- avg_old = sum(old_scores) / len(old_scores)
51
- avg_new = sum(new_scores) / len(new_scores)
52
- change = round(((avg_new - avg_old) / avg_old) * 100, 2) if avg_old != 0 else 100.0
53
 
54
- matched = sum([1 for o, n in zip(old_scores, new_scores) if n >= o])
55
- summary = f"πŸ“ˆ Content Change: {change:.2f}%\n🎯 Matched LOs: {matched} of {len(lo_list)}"
56
- if change > 10:
57
- summary += "\n🟒 New content appears more detailed and informative."
58
- elif change < -10:
59
- summary += "\nπŸ”΄ Content may have been reduced or simplified."
60
  else:
61
- summary += "\n🟑 Only minor updates detected."
62
-
63
- los = [f"LO{i+1}" for i in range(len(lo_list))]
64
- percentage_change = [round(((n - o) / o) * 100, 2) if o else 100.0 for o, n in zip(old_scores, new_scores)]
65
- df = pd.DataFrame({
66
- "Learning Outcome": los,
67
- "Old Score": [round(s, 3) for s in old_scores],
68
- "New Score": [round(s, 3) for s in new_scores],
69
- "% Change": percentage_change
70
- })
71
-
72
- # Table image
73
- fig, ax = plt.subplots(figsize=(9, 3))
74
- ax.axis('tight')
75
- ax.axis('off')
76
- table = ax.table(cellText=df.values, colLabels=df.columns, cellLoc='center', loc='center')
77
- table.auto_set_font_size(False)
78
- table.set_fontsize(10)
79
- table.scale(1.2, 1.2)
80
- table_path = "/mnt/data/lo_comparison_table.png"
81
- plt.savefig(table_path, bbox_inches='tight', dpi=300)
82
- plt.close()
83
-
84
- # Bar chart
85
- fig, ax = plt.subplots(figsize=(10, 4))
86
- bar_width = 0.35
87
- index = range(len(los))
88
- ax.bar(index, old_scores, bar_width, label='Old', alpha=0.7)
89
- ax.bar([i + bar_width for i in index], new_scores, bar_width, label='New', alpha=0.7)
90
- ax.set_xticks([i + bar_width / 2 for i in index])
91
- ax.set_xticklabels(los)
92
- ax.set_ylabel('Semantic Match (0-1)')
93
- ax.set_title('Learning Outcome Comparison')
94
- ax.legend()
95
- chart_path = "/mnt/data/lo_score_chart.png"
96
- plt.tight_layout()
97
- plt.savefig(chart_path)
98
- plt.close()
99
-
100
- return summary, table_path, chart_path
101
-
102
- # Gradio interface
103
- with gr.Blocks() as demo:
104
- gr.Markdown("## 🧠 Transformer-Based Course Content Comparator")
105
- with gr.Row():
106
- old_pdf_input = gr.File(label="πŸ“‚ Old Handout (PDF)", file_types=[".pdf"])
107
- new_pdf_input = gr.File(label="πŸ“‚ New Handout (PDF)", file_types=[".pdf"])
108
- lo_input = gr.File(label="πŸ“„ Learning Outcomes (DOCX)", file_types=[".docx"])
109
- submit_btn = gr.Button("πŸ” Analyze")
110
- summary_output = gr.Textbox(label="Summary")
111
- lo_table_output = gr.Image(label="πŸ“‹ LO Change Table")
112
- lo_chart_output = gr.Image(label="πŸ“ˆ LO Match Chart")
113
-
114
- submit_btn.click(fn=compare_handouts, inputs=[old_pdf_input, new_pdf_input, lo_input],
115
- outputs=[summary_output, lo_table_output, lo_chart_output])
116
-
117
- demo.launch()
 
1
 
2
 
3
  import gradio as gr
 
 
 
4
  import fitz # PyMuPDF
5
+ from sentence_transformers import SentenceTransformer, util
6
  import matplotlib.pyplot as plt
7
+ import numpy as np
8
+ import os
9
 
10
# Load the sentence-embedding model once at module import so every
# comparison request reuses the same instance instead of reloading it.
model = SentenceTransformer("all-MiniLM-L6-v2")
12
 
13
def extract_text_pdf(file_obj):
    """Extract all text from a PDF supplied as a bytes-like stream.

    Args:
        file_obj: Raw PDF data (bytes or any stream ``fitz.open`` accepts).

    Returns:
        The concatenated text of every page, or ``None`` when the PDF
        cannot be opened/parsed or yields only whitespace.
    """
    try:
        with fitz.open(stream=file_obj, filetype="pdf") as doc:
            # join once instead of quadratic `text += ...` concatenation
            text = "".join(page.get_text() for page in doc)
        # whitespace-only extraction is treated the same as a parse failure
        return text if text.strip() else None
    except Exception:
        # Deliberate best-effort: callers check for None and report the
        # failure to the user instead of crashing the UI.
        return None
 
 
 
 
 
22
 
23
def semantic_similarity(text1, text2):
    """Return the cosine similarity of two texts as a plain float.

    Each text is embedded with the module-level SentenceTransformer
    ``model``; the 1x1 similarity matrix is unwrapped to a scalar.
    """
    first_embedding = model.encode([text1], convert_to_tensor=True)
    second_embedding = model.encode([text2], convert_to_tensor=True)
    score_matrix = util.pytorch_cos_sim(first_embedding, second_embedding)
    return float(score_matrix[0][0])
 
 
27
 
28
def compare_docs(old_pdf, new_pdf):
    """Compare two PDF handouts and summarise how much the content changed.

    Args:
        old_pdf: Bytes/stream of the previous handout.
        new_pdf: Bytes/stream of the updated handout.

    Returns:
        A ``(summary, plot)`` pair: a human-readable summary string and a
        plot placeholder that is currently always ``None``.
    """
    previous_text = extract_text_pdf(old_pdf)
    current_text = extract_text_pdf(new_pdf)

    # Guard clause: either extraction failing aborts the comparison.
    if not previous_text or not current_text:
        return "❌ Could not extract text from one or both PDFs.", None

    sim_score = semantic_similarity(previous_text, current_text)
    change_percent = round((1 - sim_score) * 100, 2)

    # Pick the qualitative verdict first, then assemble the summary once.
    if change_percent < 10:
        verdict = "✅ Minor updates detected, mostly similar content."
    elif change_percent < 40:
        verdict = "🔄 Moderate content updates detected."
    else:
        verdict = "🆕 Major revisions and new content identified."

    summary = (
        f"📈 Estimated Content Change: {change_percent}%\n\n"
        f"🧠 Semantic Similarity Score: {sim_score:.2f}\n"
        f"{verdict}"
    )
    return summary, None
49
+
50
# Wire the comparison function into a simple two-file Gradio UI.
pdf_inputs = [
    gr.File(label="Upload Old Handout (PDF)", file_types=[".pdf"]),
    gr.File(label="Upload New Handout (PDF)", file_types=[".pdf"]),
]
result_outputs = [
    gr.Textbox(label="Comparison Summary"),
    gr.Plot(label="(Coming Soon) Visual Summary"),
]

iface = gr.Interface(
    fn=compare_docs,
    inputs=pdf_inputs,
    outputs=result_outputs,
    title="📘 Course Handout Comparator with Semantic AI",
    description="Upload old and new PDFs to see how much content has changed. Uses transformer model for expert-like judgment.",
)

iface.launch()