Deevyankar committed on
Commit
c6ee9d5
Β·
verified Β·
1 Parent(s): 99d0d6d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -105
app.py CHANGED
@@ -1,126 +1,110 @@
1
 
2
 
3
  import gradio as gr
 
4
  import fitz # PyMuPDF
5
- from docx import Document
6
- import io
7
- import re
8
- import difflib
9
  import matplotlib.pyplot as plt
10
- import numpy as np
11
- from sklearn.feature_extraction.text import TfidfVectorizer
12
- from sklearn.metrics.pairwise import cosine_similarity
13
-
14
- def extract_text_from_pdf(uploaded_file_bytes):
15
- doc = fitz.open(stream=uploaded_file_bytes, filetype="pdf")
16
- text = ""
17
- for page in doc:
18
- page_text = page.get_text()
19
- if page_text.strip():
20
- text += page_text + "\n"
21
- return text.strip()
22
-
23
- def extract_los(file_bytes, filename=""):
24
- ext = filename.lower().split('.')[-1]
25
- if ext == "txt":
26
- return file_bytes.decode("utf-8").splitlines()
27
- elif ext == "docx":
28
- file_stream = io.BytesIO(file_bytes)
29
- doc = Document(file_stream)
30
- return [p.text.strip() for p in doc.paragraphs if p.text.strip()]
31
- return []
32
-
33
- def quality_check(new_text):
34
- words = new_text.split()
35
- if len(words) > 400:
36
- return "🟒 New content appears more detailed and informative."
37
- elif len(words) > 200:
38
- return "🟑 New content is moderately improved."
39
- else:
40
- return "πŸ”΄ New content may need more detail."
41
-
42
- def find_relevant_los(content, los):
43
- if not los:
44
- return [], 0, [], []
45
- vectorizer = TfidfVectorizer().fit_transform([content] + los)
46
- similarities = cosine_similarity(vectorizer[0:1], vectorizer[1:]).flatten()
47
- matched = []
48
- scores_old = [round(np.random.uniform(1, 3), 1) for _ in los]
49
- scores_new = []
50
-
51
- for i, score in enumerate(similarities):
52
- if score > 0.2:
53
- matched.append(f"βœ“ {los[i]} (Match: {score:.2f})")
54
- scores_new.append(round(score * 5, 1)) # normalize to 5
55
-
56
- return matched, len(matched), scores_old, scores_new
57
-
58
- def summarize_added_lines(old_text, new_text):
59
- old_lines = set(old_text.splitlines())
60
- new_lines = set(new_text.splitlines())
61
- added_lines = list(new_lines - old_lines)
62
- summary = []
63
- for line in added_lines:
64
- line_clean = re.sub(r'[^a-zA-Z0-9., ]', '', line).strip()
65
- if len(line_clean.split()) >= 5:
66
- summary.append("- " + line_clean)
67
- return summary, len(added_lines), len(new_lines)
68
-
69
- def create_bar_chart(los, scores_old, scores_new):
70
- index = np.arange(len(los))
71
- bar_width = 0.35
72
  fig, ax = plt.subplots(figsize=(10, 5))
73
- ax.bar(index, scores_old, bar_width, label="Old")
74
- ax.bar(index + bar_width, scores_new, bar_width, label="New")
75
- ax.set_xlabel("Learning Outcomes")
76
- ax.set_ylabel("Match Score (0-5)")
77
- ax.set_title("LO-wise Match Score: Old vs New")
78
- ax.set_xticks(index + bar_width / 2)
79
- ax.set_xticklabels(los, rotation=45, ha="right")
80
  ax.legend()
81
- ax.grid(True, linestyle="--", alpha=0.4)
82
- fig.tight_layout()
83
- return fig
84
-
85
- def compare_handouts(old_pdf_bytes, new_pdf_bytes, lo_file_bytes, lo_filename):
86
- old_text = extract_text_from_pdf(old_pdf_bytes)
87
- new_text = extract_text_from_pdf(new_pdf_bytes)
88
- los = extract_los(lo_file_bytes, lo_filename)
89
 
 
 
 
 
 
 
 
 
 
 
 
90
  if not old_text or not new_text:
91
- return "❗ Error in file(s)", "", "", None
 
 
 
 
 
92
 
93
- added_summary, added_lines, total_lines = summarize_added_lines(old_text, new_text)
94
- percent_change = (added_lines / max(total_lines, 1)) * 100
95
- lo_matches, matched_count, scores_old, scores_new = find_relevant_los(new_text, los)
96
- quality = quality_check(new_text)
97
 
98
- summary_output = "\n".join(added_summary)
99
- lo_output = "\n".join(lo_matches)
100
- stats = (
101
- f"πŸ“ˆ Content Change: {percent_change:.2f}%\n"
102
- f"🎯 Matched LOs: {matched_count} of {len(los)}\n"
103
- f"{quality}"
104
- )
 
 
 
 
 
 
 
 
105
 
106
- chart = create_bar_chart(los, scores_old, scores_new)
107
- return summary_output, lo_output, stats, chart
108
 
109
  iface = gr.Interface(
110
- fn=lambda old_pdf, new_pdf, lo_file: compare_handouts(old_pdf, new_pdf, lo_file, "learning_outcomes.docx"),
111
  inputs=[
112
- gr.File(label="πŸ“€ Old Handout PDF", type="binary"),
113
- gr.File(label="πŸ“₯ New Handout PDF", type="binary"),
114
- gr.File(label="πŸ“š Learning Outcomes (.docx or .txt)", type="binary")
115
  ],
116
  outputs=[
117
- gr.Textbox(label="πŸ†• New Content Summary", lines=10),
118
- gr.Textbox(label="🎯 LO Matches", lines=10),
119
- gr.Textbox(label="πŸ“Š Stats & Quality", lines=5),
120
- gr.Plot(label="πŸ“‰ LO Match Score Chart")
121
  ],
122
- title="πŸ“˜ Handout Comparator (Binary Safe)",
123
- description="Upload old/new handouts + LO file. Detects changes, LO match, and generates update chart."
124
  )
125
 
126
  iface.launch()
 
1
 
2
 
3
  import gradio as gr
4
+ from sentence_transformers import SentenceTransformer, util
5
  import fitz # PyMuPDF
6
+ import docx
 
 
 
7
  import matplotlib.pyplot as plt
8
+ import io
9
+ import base64
10
+
11
# Sentence-embedding model used for semantic LO matching; loaded once at import
# time (the first run downloads the weights from the Hugging Face hub).
model = SentenceTransformer("all-MiniLM-L6-v2")
12
+
13
def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page of an uploaded PDF.

    Args:
        pdf_file: A file-like object positioned at the start of a PDF
            (e.g. the object Gradio passes for a ``gr.File`` input).

    Returns:
        The concatenated text of all pages (no separator between pages,
        matching ``page.get_text()`` output), or ``""`` when the file
        cannot be opened or parsed. Errors are deliberately swallowed so
        the caller can report a friendly message instead of crashing.
    """
    try:
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
        try:
            # "".join avoids quadratic += concatenation on large PDFs.
            return "".join(page.get_text() for page in doc)
        finally:
            doc.close()  # release the underlying MuPDF resources
    except Exception:
        # Best-effort by design: any open/parse failure yields "".
        return ""
22
+
23
def extract_text_from_docx(file_obj):
    """Extract non-empty paragraph text from a ``.docx`` file.

    Args:
        file_obj: A path or file-like object accepted by ``docx.Document``.

    Returns:
        The paragraphs joined with newlines, blank paragraphs dropped,
        or ``""`` on any failure (best-effort, like the PDF extractor).
    """
    try:
        doc = docx.Document(file_obj)
        # Generator feeds join directly; skip whitespace-only paragraphs.
        return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
    except Exception:
        # Swallow parse errors deliberately; the caller reports the
        # empty result to the user.
        return ""
29
+
30
def match_learning_outcomes(lo_list, text):
    """Score how well each learning outcome is covered by *text*.

    Every newline-separated paragraph of *text* longer than 20 characters
    is embedded with the module-level sentence-transformer; an outcome's
    score is its best cosine similarity against any paragraph.

    Args:
        lo_list: Non-empty list of learning-outcome strings.
        text: Document text with paragraphs separated by newlines.

    Returns:
        A list of floats, one per LO in the same order — the maximum
        cosine similarity (typically in [0, 1] for natural text).
    """
    text_blocks = [para for para in text.split("\n") if len(para.strip()) > 20]
    if not text_blocks:
        # Nothing substantial to match against — avoid encoding an empty
        # batch (which would raise) and report zero coverage instead.
        return [0.0] * len(lo_list)

    text_embs = model.encode(text_blocks, convert_to_tensor=True)
    lo_embs = model.encode(lo_list, convert_to_tensor=True)

    # Best paragraph-level similarity for each learning outcome.
    return [float(util.cos_sim(lo_embs[i], text_embs).max())
            for i in range(len(lo_list))]
41
+
42
def generate_similarity_chart(lo_list, old_scores, new_scores):
    """Render a grouped bar chart of old vs. new LO match scores.

    Args:
        lo_list: Learning-outcome strings (used only for the bar count).
        old_scores: Per-LO scores for the old handout.
        new_scores: Per-LO scores for the new handout.

    Returns:
        The chart as a ``data:image/png;base64,...`` URI string.
    """
    fig, ax = plt.subplots(figsize=(10, 5))

    positions = range(len(lo_list))
    shifted = [pos + 0.4 for pos in positions]
    ax.bar(positions, old_scores, width=0.4, label="Old", align="center")
    ax.bar(shifted, new_scores, width=0.4, label="New", align="center")

    # Center the tick between the paired bars; generic LO1..LOn labels.
    ax.set_xticks([pos + 0.2 for pos in positions])
    ax.set_xticklabels([f"LO{pos+1}" for pos in positions], rotation=45)
    ax.set_ylabel("Match Score (0-1)")
    ax.set_title("LO-wise Semantic Match Score: Old vs New")
    ax.legend()

    # Serialize the figure to an in-memory PNG and close it to free memory.
    buf = io.BytesIO()
    plt.tight_layout()
    plt.savefig(buf, format="png")
    buf.seek(0)
    encoded = base64.b64encode(buf.read()).decode("utf-8")
    plt.close(fig)
    return f"data:image/png;base64,{encoded}"
60
+
61
def compare_handouts(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs against a set of learning outcomes.

    Args:
        old_pdf: Uploaded old-version PDF (Gradio file object).
        new_pdf: Uploaded new-version PDF (Gradio file object).
        lo_file: Uploaded ``.docx`` with one learning outcome per line.

    Returns:
        ``(summary_markdown, chart)`` — chart is ``None`` when either PDF
        or the LO file cannot be processed.
    """
    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)
    if not (old_text and new_text):
        return "❌ Could not extract text from one or both PDFs.", None

    lo_lines = extract_text_from_docx(lo_file).split("\n")
    lo_list = [lo.strip() for lo in lo_lines if lo.strip()]
    if not lo_list:
        return "❌ No learning outcomes detected.", None

    # Semantic LO coverage for each version.
    old_scores = match_learning_outcomes(lo_list, old_text)
    new_scores = match_learning_outcomes(lo_list, new_text)

    # An LO counts as "matched" above a 0.6 cosine-similarity threshold.
    matched_old = sum(1 for score in old_scores if score > 0.6)
    matched_new = sum(1 for score in new_scores if score > 0.6)

    # Change % is based on character count (guard against division by zero).
    change_percent = abs(len(new_text) - len(old_text)) / max(len(old_text), 1) * 100

    verdict = (
        "🟒 New handout covers learning outcomes better.\n"
        if matched_new > matched_old
        else "πŸ”΄ New handout covers fewer outcomes.\n"
        if matched_new < matched_old
        else "🟑 No major change in LO coverage.\n"
    )
    summary = (
        f"πŸ“ˆ **Content Change:** {change_percent:.2f}%\n"
        f"🎯 **LOs Matched (Old vs New):** {matched_old} vs {matched_new} of {len(lo_list)}\n"
        + verdict
    )

    chart = generate_similarity_chart(lo_list, old_scores, new_scores)
    return summary, chart
94
 
95
# Gradio UI wiring: three file uploads in, a markdown summary and a chart out.
iface = gr.Interface(
    fn=compare_handouts,
    inputs=[
        gr.File(label="Old Version PDF"),
        gr.File(label="New Version PDF"),
        # Restrict the picker to .docx, the only format extract_text_from_docx reads.
        gr.File(label="Learning Outcomes (.docx)", file_types=[".docx"]),
    ],
    outputs=[
        gr.Markdown(label="Summary"),
        # Receives the base64 PNG data URI produced by generate_similarity_chart.
        gr.Image(label="LO Match Chart")
    ],
    title="πŸ“˜ Educational Content Comparator",
    description="Compare two handouts and evaluate changes + Learning Outcome coverage using semantic similarity.",
)

iface.launch()