Spaces:

Deevyankar
/

Handouts

Sleeping

App Files Community

Deevyankar committed on Sep 20, 2025

Commit

65fcd1d

verified ·

1 Parent(s): c6ee9d5

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -81

app.py CHANGED Viewed

@@ -1,110 +1,98 @@
-import gradio as gr
-from sentence_transformers import SentenceTransformer, util
-import fitz  # PyMuPDF
-import docx
 import matplotlib.pyplot as plt
-import io
-import base64
-model = SentenceTransformer("all-MiniLM-L6-v2")
-def extract_text_from_pdf(pdf_file):
     try:
-        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
         text = ""
-        for page in doc:
-            text += page.get_text()
-        return text
-    except Exception as e:
         return ""
-def extract_text_from_docx(file_obj):
-    try:
-        doc = docx.Document(file_obj)
-        return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
-    except Exception as e:
-        return ""
-def match_learning_outcomes(lo_list, text):
-    text_blocks = [para for para in text.split("\n") if len(para.strip()) > 20]
-    text_embs = model.encode(text_blocks, convert_to_tensor=True)
-    lo_embs = model.encode(lo_list, convert_to_tensor=True)
-    lo_scores = []
-    for i, lo in enumerate(lo_list):
-        sims = util.cos_sim(lo_embs[i], text_embs)
-        max_score = float(sims.max())
-        lo_scores.append(max_score)
-    return lo_scores
-def generate_similarity_chart(lo_list, old_scores, new_scores):
-    fig, ax = plt.subplots(figsize=(10, 5))
-    x = range(len(lo_list))
-    ax.bar(x, old_scores, width=0.4, label="Old", align="center")
-    ax.bar([i + 0.4 for i in x], new_scores, width=0.4, label="New", align="center")
-    ax.set_xticks([i + 0.2 for i in x])
-    ax.set_xticklabels([f"LO{i+1}" for i in x], rotation=45)
     ax.set_ylabel("Match Score (0-1)")
-    ax.set_title("LO-wise Semantic Match Score: Old vs New")
-    ax.legend()
-    buf = io.BytesIO()
     plt.tight_layout()
-    plt.savefig(buf, format="png")
-    buf.seek(0)
-    encoded = base64.b64encode(buf.read()).decode("utf-8")
-    plt.close(fig)
-    return f"data:image/png;base64,{encoded}"
-def compare_handouts(old_pdf, new_pdf, lo_file):
     old_text = extract_text_from_pdf(old_pdf)
     new_text = extract_text_from_pdf(new_pdf)
     if not old_text or not new_text:
         return "❌ Could not extract text from one or both PDFs.", None
-    lo_text = extract_text_from_docx(lo_file)
-    lo_list = [line.strip() for line in lo_text.split("\n") if line.strip()]
-    if not lo_list:
-        return "❌ No learning outcomes detected.", None
-    # LO Matching
-    old_scores = match_learning_outcomes(lo_list, old_text)
-    new_scores = match_learning_outcomes(lo_list, new_text)
-    matched_old = sum(score > 0.6 for score in old_scores)
-    matched_new = sum(score > 0.6 for score in new_scores)
-    # Change % based on character count
-    change_percent = abs(len(new_text) - len(old_text)) / max(len(old_text), 1) * 100
-    summary = f"📈 **Content Change:** {change_percent:.2f}%\n"
-    summary += f"🎯 **LOs Matched (Old vs New):** {matched_old} vs {matched_new} of {len(lo_list)}\n"
-    if matched_new > matched_old:
-        summary += "🟢 New handout covers learning outcomes better.\n"
-    elif matched_new < matched_old:
-        summary += "🔴 New handout covers fewer outcomes.\n"
-    else:
-        summary += "🟡 No major change in LO coverage.\n"
-    chart = generate_similarity_chart(lo_list, old_scores, new_scores)
-    return summary, chart
-iface = gr.Interface(
     fn=compare_handouts,
     inputs=[
-        gr.File(label="Old Version PDF"),
-        gr.File(label="New Version PDF"),
-        gr.File(label="Learning Outcomes (.docx)", file_types=[".docx"]),
     ],
     outputs=[
-        gr.Markdown(label="Summary"),
-        gr.Image(label="LO Match Chart")
     ],
     title="📘 Educational Content Comparator",
-    description="Compare two handouts and evaluate changes + Learning Outcome coverage using semantic similarity.",
 )
-iface.launch()

+import difflib
 import matplotlib.pyplot as plt
+import pandas as pd
+from PyPDF2 import PdfReader
+from docx import Document
+import gradio as gr
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+# --- Extract Text ---
def extract_text_from_pdf(pdf_bytes):
    """Return the concatenated text of every page in a PDF, or "" on failure.

    Args:
        pdf_bytes: Raw PDF content as ``bytes``/``bytearray`` (the upload
            widgets in this file are declared with ``type="binary"``), or any
            object ``PdfReader`` accepts directly (a path or a binary stream).

    Returns:
        The text of all pages joined together with surrounding whitespace
        stripped.  An empty string is returned when nothing was supplied or
        when the document cannot be read, so callers can treat "" as
        "extraction failed".
    """
    import io
    import logging

    if not pdf_bytes:
        return ""
    # PdfReader takes a path or a seekable stream; raw bytes must be wrapped,
    # otherwise every upload fails and the handler below hides the reason.
    if isinstance(pdf_bytes, (bytes, bytearray)):
        source = io.BytesIO(pdf_bytes)
    else:
        source = pdf_bytes
    try:
        reader = PdfReader(source)
        # A page may produce no text; fall back to "" as the original did.
        return "".join(page.extract_text() or "" for page in reader.pages).strip()
    except Exception:
        # Best-effort contract is kept (return ""), but the cause is logged
        # and KeyboardInterrupt/SystemExit are no longer swallowed.
        logging.getLogger(__name__).exception("PDF text extraction failed")
        return ""
def extract_text_from_docx(docx_file):
    """Return the non-empty paragraphs of a .docx file, one per line.

    Args:
        docx_file: Raw .docx content as ``bytes``/``bytearray`` (the upload
            widget in this file is declared with ``type="binary"``), or any
            object ``Document`` accepts directly (a path or a binary stream).

    Returns:
        The stripped text of every non-blank paragraph joined with ``"\\n"``,
        or "" when nothing was supplied.

    Raises:
        Whatever ``Document`` raises for unreadable input; unlike the PDF
        extractor, this function does not swallow parsing errors.
    """
    import io

    # No upload: return "" rather than handing None/empty bytes to Document.
    if not docx_file:
        return ""
    # Document takes a path or a file-like object, so raw bytes are wrapped.
    if isinstance(docx_file, (bytes, bytearray)):
        docx_file = io.BytesIO(docx_file)
    doc = Document(docx_file)
    stripped = (para.text.strip() for para in doc.paragraphs)
    return "\n".join(line for line in stripped if line)
+# --- Change Percentage ---
def calculate_change_percentage(old_text, new_text, *, autojunk=True):
    """Return how much two texts differ, as a percentage in [0, 100].

    The value is ``(1 - ratio) * 100`` where ``ratio`` is
    ``difflib.SequenceMatcher.ratio()`` computed character by character.

    Args:
        old_text: The earlier version of the text.
        new_text: The later version of the text.
        autojunk: Forwarded to ``SequenceMatcher``.  The default (``True``)
            matches the previous behaviour.  With it enabled, difflib ignores
            "popular" characters as match anchors once a text reaches 200
            characters, which is fast but can badly overstate the change
            (e.g. a one-character prefix on a repetitive text is reported as
            100% changed).  Pass ``False`` for the exact ratio; that is
            noticeably slower on long inputs.

    Returns:
        0.0 for identical texts (including two empty strings), up to 100.0
        when nothing matches.
    """
    matcher = difflib.SequenceMatcher(None, old_text, new_text, autojunk=autojunk)
    return (1 - matcher.ratio()) * 100
+# --- Semantic Matching ---
def semantic_match(lo_texts, content):
    """Score each learning outcome against a document with TF-IDF cosine similarity.

    A single TF-IDF model is fitted on ``[content] + lo_texts``; row 0 of the
    resulting matrix is the document and the remaining rows are the outcomes.

    Args:
        lo_texts: Sequence of learning-outcome strings.
        content: Full text of the document to compare against.

    Returns:
        A 1-D array with one similarity score per entry of ``lo_texts``, in
        the same order.  An empty array is returned when ``lo_texts`` is
        empty.
    """
    import numpy as np

    lo_texts = list(lo_texts)
    if not lo_texts:
        # Nothing to compare; avoid calling cosine_similarity with zero rows.
        return np.array([], dtype=float)
    # fit_transform returns the document-term matrix, not the vectorizer.
    tfidf_matrix = TfidfVectorizer().fit_transform([content] + lo_texts)
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
+# --- Summary Generation ---
def generate_summary(change_pct, matched_los, total_los, *,
                     major_threshold=20, minor_threshold=5):
    """Build the human-readable summary shown in the UI.

    Args:
        change_pct: Content change between the two documents, in percent.
        matched_los: Number of learning outcomes counted as matched.
        total_los: Total number of learning outcomes.
        major_threshold: ``change_pct`` strictly above this value is reported
            as a major change.  Defaults to 20, the previous fixed value.
        minor_threshold: ``change_pct`` strictly above this value (and not
            above ``major_threshold``) is reported as some updates.  Defaults
            to 5, the previous fixed value.

    Returns:
        A three-line string: the change percentage (two decimals), the
        matched/total outcome counts, and a verdict chosen from
        ``change_pct`` alone.
    """
    header = (
        f"📈 Content Change: {change_pct:.2f}%\n"
        f"🎯 Matched LOs: {matched_los} of {total_los}\n"
    )
    if change_pct > major_threshold:
        verdict = "🟢 Major improvements detected."
    elif change_pct > minor_threshold:
        verdict = "🔵 Some updates found."
    else:
        verdict = "🟡 Very little or no update."
    return header + verdict
+# --- Bar Chart Plot ---
def plot_lo_chart(lo_labels, old_scores, new_scores):
    """Draw a grouped bar chart of old vs new scores per learning outcome.

    Args:
        lo_labels: One label per learning outcome; used as the x-axis ticks.
        old_scores: Scores against the old document, aligned with ``lo_labels``.
        new_scores: Scores against the new document, aligned with ``lo_labels``.

    Returns:
        The matplotlib ``Figure`` holding the chart, with the y-axis fixed to
        the range 0-1.
    """
    df = pd.DataFrame({'Old': old_scores, 'New': new_scores}, index=lo_labels)
    ax = df.plot(kind='bar', figsize=(10, 5), title="LO-wise Match Score: Old vs New")
    # Work on this chart's own figure/axes instead of pyplot's implicit
    # "current" ones, which are shared process-wide state.
    fig = ax.get_figure()
    ax.set_ylabel("Match Score (0-1)")
    ax.set_ylim(0, 1)
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    fig.tight_layout()
    # Unregister the figure from pyplot so repeated calls do not accumulate
    # open figures; the returned Figure object can still be rendered.
    plt.close(fig)
    return fig
+# --- Main Comparator ---
def compare_handouts(old_pdf, new_pdf, lo_docx):
    """Compare two handout PDFs against a list of learning outcomes.

    Args:
        old_pdf: The old handout, as passed through to ``extract_text_from_pdf``.
        new_pdf: The new handout, as passed through to ``extract_text_from_pdf``.
        lo_docx: The learning-outcomes document, as passed through to
            ``extract_text_from_docx``; each non-blank line is one outcome.

    Returns:
        A ``(summary, figure)`` pair.  When text extraction fails for either
        PDF, or no outcomes are found, the pair is ``(error_message, None)``.
    """
    extracted = [extract_text_from_pdf(pdf) for pdf in (old_pdf, new_pdf)]
    if not all(extracted):
        return "❌ Could not extract text from one or both PDFs.", None
    old_text, new_text = extracted

    outcomes = [
        line
        for line in extract_text_from_docx(lo_docx).split('\n')
        if line.strip()
    ]
    if len(outcomes) == 0:
        return "❌ No learning outcomes detected in uploaded file.", None

    old_scores = semantic_match(outcomes, old_text)
    new_scores = semantic_match(outcomes, new_text)

    # NOTE(review): "matched" counts outcomes whose new score is at least the
    # old score, so an outcome scoring 0 in both documents is counted too.
    # Confirm this is the intended meaning of "Matched LOs".
    matched = sum(1 for before, after in zip(old_scores, new_scores) if after >= before)
    change_pct = calculate_change_percentage(old_text, new_text)

    labels = [f"LO{number}" for number in range(1, len(outcomes) + 1)]
    figure = plot_lo_chart(labels, old_scores, new_scores)
    return generate_summary(change_pct, matched, len(outcomes)), figure
+# --- Gradio App ---
# Three uploads in, a text summary and a chart out.  All uploads are requested
# in binary mode (type="binary").
demo = gr.Interface(
    title="📘 Educational Content Comparator",
    description="Upload 2 handouts and LO file (.docx). Detect % update, alignment with learning outcomes, and get visual summary.",
    fn=compare_handouts,
    inputs=[
        gr.File(type="binary", label="Upload Old PDF"),
        gr.File(type="binary", label="Upload New PDF"),
        gr.File(type="binary", label="Upload Learning Outcomes (.docx)"),
    ],
    outputs=[
        gr.Textbox(label="📋 Summary"),
        gr.Plot(label="📊 LO Match Chart"),
    ],
)
# NOTE(review): share=True asks Gradio for a public tunnel link; this looks
# redundant if the app is only ever hosted as a Space — confirm before removing.
demo.launch(share=True)