Spaces:

Deevyankar
/

Handouts

Sleeping

App Files Community

Deevyankar committed on Sep 21, 2025

Commit

07a846d

verified ·

1 Parent(s): d1a8361

Update app.py

Browse files

Files changed (1) hide show

app.py +59 -98

app.py CHANGED Viewed

@@ -1,131 +1,92 @@
 import gradio as gr
-import fitz  # PyMuPDF
-import docx
-import io
-import re
-import os
 import matplotlib.pyplot as plt
-import numpy as np
 import pandas as pd
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sentence_transformers import SentenceTransformer, util
-from difflib import SequenceMatcher
-model = SentenceTransformer('all-MiniLM-L6-v2')
-def extract_text_from_pdf(pdf_file):
     try:
-        pdf_reader = fitz.open(stream=pdf_file, filetype="pdf")
-        text = ""
-        for page in pdf_reader:
-            text += page.get_text()
-        pdf_reader.close()
-        return text.strip()
-    except Exception as e:
         return ""
-def normalize_text(text):
-    return re.sub(r'\s+', ' ', text.strip().lower())
-def extract_text_from_docx(docx_file):
-    try:
-        doc = docx.Document(io.BytesIO(docx_file))
-        full_text = []
-        for para in doc.paragraphs:
-            if para.text.strip():
-                full_text.append(para.text.strip())
-        return full_text
-    except:
-        return []
-def semantic_match(lo_list, content):
     scores = []
-    for lo in lo_list:
-        try:
-            lo_embed = model.encode(lo, convert_to_tensor=True)
-            content_embed = model.encode(content, convert_to_tensor=True)
-            sim = util.pytorch_cos_sim(lo_embed, content_embed).item()
-            scores.append(round(sim, 2))
-        except:
-            scores.append(0.0)
     return scores
-def content_change_score(text1, text2):
-    try:
-        sim = SequenceMatcher(None, normalize_text(text1), normalize_text(text2)).ratio()
-        return round((1 - sim) * 100, 2)
-    except:
-        return 100.0
 def compare_handouts(old_pdf, new_pdf, lo_file):
     old_text = extract_text_from_pdf(old_pdf)
     new_text = extract_text_from_pdf(new_pdf)
-    if len(old_text.strip()) < 200 or len(new_text.strip()) < 200:
-        return "⚠️ Could not extract meaningful content from one or both PDFs.", None, None
-    lo_list = extract_text_from_docx(lo_file)
-    if not lo_list:
-        return "⚠️ No learning outcomes detected.", None, None
     old_scores = semantic_match(lo_list, old_text)
     new_scores = semantic_match(lo_list, new_text)
-    change_percent = content_change_score(old_text, new_text)
-    improved_count = sum([n > o for n, o in zip(new_scores, old_scores)])
-    matched_los = sum([n >= o for n, o in zip(new_scores, old_scores)])
-    summary = f"📈 Content Change Estimate: {change_percent}%\n"
-    summary += f"🧠 LO Alignment: {matched_los} of {len(lo_list)} learning outcomes matched\n"
-    if improved_count > 0:
-        summary += "🟢 Summary: New handout has improved structure and added clarity."
-    else:
-        summary += "⚠️ Summary: No significant improvement in LO alignment."
-    # Create comparison table
     df = pd.DataFrame({
-        "Learning Outcome": [f"LO{i+1}" for i in range(len(lo_list))],
         "Old Match Score": old_scores,
         "New Match Score": new_scores,
-        "Improvement": np.array(new_scores) - np.array(old_scores)
     })
-    #excel_path = "D:/result/LO_Comparison_Report.xlsx"
-    excel_path = "/mnt/data/LO_Comparison_Report.xlsx"
-    df.to_excel(excel_path, index=False)
-    # Plot
     x = np.arange(len(lo_list))
-    width = 0.35
-    fig, ax = plt.subplots()
-    ax.bar(x - width/2, old_scores, width, label='Old')
-    ax.bar(x + width/2, new_scores, width, label='New')
-    ax.set_ylabel('Match Score (0-1)')
-    ax.set_title('LO-wise Match Score: Old vs New')
     ax.set_xticks(x)
-    ax.set_xticklabels([f"LO{i+1}" for i in range(len(lo_list))], rotation=45)
     ax.legend()
     plt.tight_layout()
-    return summary, fig, excel_path
-with gr.Blocks() as demo:
-    gr.Markdown("📘 **Educational Content Comparator**")
-    gr.Markdown("Upload 2 handouts and a .docx file of Learning Outcomes to compare changes and alignment.")
-    with gr.Row():
-        old_pdf = gr.File(label="📂 Upload Old PDF", file_types=[".pdf"], type="binary")
-        new_pdf = gr.File(label="📂 Upload New PDF", file_types=[".pdf"], type="binary")
-        lo_file = gr.File(label="📂 Upload Learning Outcomes (.docx)", file_types=[".docx"], type="binary")
-    with gr.Row():
-        btn = gr.Button("Submit")
-        clear_btn = gr.Button("Clear")
-    output_text = gr.Textbox(label="📋 Summary", lines=5, interactive=False)
-    output_plot = gr.Plot(label="📊 LO Match Chart")
-    output_excel = gr.File(label="📄 Download Excel Report")
-    btn.click(fn=compare_handouts, inputs=[old_pdf, new_pdf, lo_file], outputs=[output_text, output_plot, output_excel])
-    clear_btn.click(fn=lambda: ("", None, None), inputs=[], outputs=[output_text, output_plot, output_excel])
-demo.launch()

 import gradio as gr
+from PyPDF2 import PdfReader
+from sentence_transformers import SentenceTransformer, util
 import matplotlib.pyplot as plt
 import pandas as pd
+import numpy as np
+from io import BytesIO
+from tempfile import NamedTemporaryFile
+model = SentenceTransformer("all-MiniLM-L6-v2")
+def extract_text_from_pdf(pdf_bytes):
     try:
+        reader = PdfReader(BytesIO(pdf_bytes))
+        return "\n".join([page.extract_text() or "" for page in reader.pages])
+    except Exception:
         return ""
+def semantic_match(lo_texts, content):
     scores = []
+    for lo in lo_texts:
+        emb1 = model.encode(lo, convert_to_tensor=True)
+        emb2 = model.encode(content, convert_to_tensor=True)
+        score = util.pytorch_cos_sim(emb1, emb2).item()
+        scores.append(score)
     return scores
 def compare_handouts(old_pdf, new_pdf, lo_file):
     old_text = extract_text_from_pdf(old_pdf)
     new_text = extract_text_from_pdf(new_pdf)
+    if not old_text.strip() or not new_text.strip():
+        return "❌ Could not extract text from one or both PDFs.", None, None
+    lo_doc = lo_file.read().decode("utf-8") if isinstance(lo_file, bytes) else lo_file.read()
+    lo_list = [line.strip() for line in lo_doc.splitlines() if line.strip()]
     old_scores = semantic_match(lo_list, old_text)
     new_scores = semantic_match(lo_list, new_text)
+    improvement = np.array(new_scores) - np.array(old_scores)
     df = pd.DataFrame({
+        "Learning Outcome": lo_list,
         "Old Match Score": old_scores,
         "New Match Score": new_scores,
+        "Improvement": improvement
     })
+    fig, ax = plt.subplots(figsize=(10, 4))
     x = np.arange(len(lo_list))
+    ax.bar(x - 0.2, old_scores, width=0.4, label="Old")
+    ax.bar(x + 0.2, new_scores, width=0.4, label="New")
     ax.set_xticks(x)
+    ax.set_xticklabels([f"LO{i+1}" for i in range(len(lo_list))])
+    ax.set_ylim(0, 1)
+    ax.set_ylabel("Semantic Similarity")
+    ax.set_title("LO Match Comparison")
     ax.legend()
     plt.tight_layout()
+    content_change = (1 - (util.pytorch_cos_sim(model.encode(old_text, convert_to_tensor=True),
+                                                model.encode(new_text, convert_to_tensor=True)).item())) * 100
+    matched = sum(1 for o, n in zip(old_scores, new_scores) if n >= 0.6)
+    summary = f"📈 Content Change Estimate: {content_change:.2f}%\n"
+    summary += f"🧠 LO Alignment: {matched} of {len(lo_list)} learning outcomes matched\n"
+    summary += "🟢 Summary: New handout has improved structure and added clarity."
+    # Save to a temporary file
+    temp_file = NamedTemporaryFile(delete=False, suffix=".xlsx")
+    excel_path = Path(temp_file.name)
+    df.to_excel(excel_path, index=False)
+    return summary, fig, excel_path
+iface = gr.Interface(
+    fn=compare_handouts,
+    inputs=[
+        gr.File(label="📄 Upload OLD Handout PDF", type="binary"),
+        gr.File(label="📄 Upload NEW Handout PDF", type="binary"),
+        gr.File(label="📄 Upload Learning Outcomes (TXT)", type="file")
+    ],
+    outputs=[
+        gr.Textbox(label="📋 Summary & Insights"),
+        gr.Plot(label="📊 Learning Outcome Match Chart"),
+        gr.File(label="📥 Download Excel Report")
+    ],
+    title="📚 Handout Comparator with LO Analysis",
+    description="Upload old & new handouts + a list of learning outcomes to get content change %, alignment, and a downloadable report."
+)
+if __name__ == "__main__":
+    iface.launch()