Spaces:

Deevyankar
/

Handouts

Sleeping

App Files Files Community

Deevyankar committed on Sep 21, 2025

Commit

4bbb77a

verified ·

1 Parent(s): ac4e81e

Update app.py

Browse files

Files changed (1) hide show

app.py +92 -50

app.py CHANGED Viewed

@@ -1,91 +1,133 @@
 import gradio as gr
 import docx
 import io
-import pdfplumber
 from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.metrics.pairwise import cosine_similarity
 import matplotlib.pyplot as plt
 import numpy as np
-def extract_text_from_pdf(file):
-    try:
-        text = ""
-        with pdfplumber.open(file.name) as pdf:
-            for page in pdf.pages:
-                text += page.extract_text() or ""
-        return text
-    except Exception as e:
-        return ""
-def extract_text_from_docx(file):
     try:
-        doc = docx.Document(file.name)
-        full_text = [para.text for para in doc.paragraphs]
-        return "\n".join(full_text)
     except Exception as e:
         return ""
-def calculate_similarity(text1, text2):
-    vectorizer = TfidfVectorizer().fit_transform([text1, text2])
-    similarity = cosine_similarity(vectorizer[0:1], vectorizer[1:2])
-    return float(similarity[0][0]) * 100  # return as percentage
-def semantic_match(lo_texts, content):
     scores = []
-    vectorizer = TfidfVectorizer().fit_transform([content] + lo_texts)
-    content_vec = vectorizer[0]
-    for i in range(1, len(lo_texts) + 1):
-        score = cosine_similarity(content_vec, vectorizer[i])[0][0]
-        scores.append(score)
     return scores
 def compare_handouts(old_pdf, new_pdf, lo_file):
     old_text = extract_text_from_pdf(old_pdf)
     new_text = extract_text_from_pdf(new_pdf)
-    lo_text = extract_text_from_docx(lo_file)
-    if not old_text or not new_text:
-        return "❌ Could not extract text from one or both PDFs.", None
-    similarity = calculate_similarity(old_text, new_text)
-    lo_list = [line.strip() for line in lo_text.split("\n") if line.strip()]
     old_scores = semantic_match(lo_list, old_text)
     new_scores = semantic_match(lo_list, new_text)
-    matched_outcomes = sum(n >= o for o, n in zip(old_scores, new_scores))
-    # Generate a bar plot for each LO
-    labels = [f"LO{i+1}" for i in range(len(lo_list))]
-    x = np.arange(len(labels))
     width = 0.35
     fig, ax = plt.subplots()
     ax.bar(x - width/2, old_scores, width, label='Old')
     ax.bar(x + width/2, new_scores, width, label='New')
-    ax.set_ylabel('Semantic Match Score')
-    ax.set_title('Learning Outcome Coverage')
     ax.set_xticks(x)
-    ax.set_xticklabels(labels)
     ax.legend()
     plt.tight_layout()
-    summary = f"📈 Content Change Estimate: {similarity:.2f}%\n"
-    summary += f"🎯 Matched LOs: {matched_outcomes} of {len(lo_list)}\n"
-    summary += "🟢 Summary: New handout has improved structure and added clarity." if similarity > 50 else "⚠️ Summary: Minimal updates or low improvement detected."
     return summary, fig
 with gr.Blocks() as demo:
-    gr.Markdown("# 📚 Handout Comparator with LO Alignment")
     with gr.Row():
-        old_pdf = gr.File(label="📄 Upload OLD Handout (PDF)")
-        new_pdf = gr.File(label="🆕 Upload NEW Handout (PDF)")
-        lo_file = gr.File(label="📘 Upload Learning Outcomes (DOCX)")
-    compare_btn = gr.Button("🔍 Analyze and Compare")
-    output_text = gr.Textbox(label="📊 Summary")
-    output_plot = gr.Plot(label="📈 Learning Outcome Comparison")
-    compare_btn.click(fn=compare_handouts, inputs=[old_pdf, new_pdf, lo_file], outputs=[output_text, output_plot])
 demo.launch()

 import gradio as gr
+import fitz  # PyMuPDF
 import docx
 import io
+import re
 from sklearn.feature_extraction.text import TfidfVectorizer
+from sentence_transformers import SentenceTransformer, util
 import matplotlib.pyplot as plt
 import numpy as np
+from difflib import SequenceMatcher
+model = SentenceTransformer('all-MiniLM-L6-v2')
def extract_text_from_pdf(pdf_file):
    """Return the plain text of a PDF supplied as raw bytes.

    Args:
        pdf_file: PDF content as a bytes-like object (the upload widgets in
            this file are declared with ``type="binary"``). ``None`` or
            empty input, i.e. nothing uploaded, is accepted.

    Returns:
        The text of all pages concatenated in page order with leading and
        trailing whitespace removed, or ``""`` when nothing was supplied or
        the document could not be read.
    """
    if not pdf_file:
        # Nothing uploaded: answer directly instead of relying on the
        # parser to raise on empty input.
        return ""
    try:
        # The context manager closes the document even when a page fails to
        # extract; an explicit close() after the loop is skipped on errors.
        with fitz.open(stream=pdf_file, filetype="pdf") as pdf_reader:
            return "".join(page.get_text() for page in pdf_reader).strip()
    except Exception:
        # Best-effort contract: callers treat "" as "could not extract".
        return ""
def normalize_text(text):
    """Lower-case *text* and collapse each whitespace run to one space.

    Leading and trailing whitespace is removed before collapsing, so the
    result never starts or ends with a space.
    """
    trimmed = text.strip().lower()
    return re.sub(r'\s+', ' ', trimmed)
def extract_text_from_docx(docx_file):
    """Return the non-empty paragraphs of a .docx supplied as raw bytes.

    Args:
        docx_file: Document content as bytes. ``None`` or empty input,
            i.e. nothing uploaded, is accepted.

    Returns:
        A list with the stripped text of every paragraph that contains
        something other than whitespace, in document order. An empty list
        is returned when nothing was supplied or the document could not be
        parsed.
    """
    if not docx_file:
        return []
    try:
        doc = docx.Document(io.BytesIO(docx_file))
        stripped = (para.text.strip() for para in doc.paragraphs)
        return [line for line in stripped if line]
    except Exception:
        # Best-effort: an unreadable document counts as "no outcomes".
        # ``Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        return []
def semantic_match(lo_list, content):
    """Score how closely each learning outcome matches *content*.

    Each outcome and the content are embedded with the module-level
    sentence-transformer ``model`` and compared by cosine similarity.

    Args:
        lo_list: Learning-outcome strings.
        content: The handout text to compare against.

    Returns:
        One score per outcome, in the same order, rounded to two decimals.
        An outcome whose embedding or comparison fails scores ``0.0``; if
        the content itself cannot be embedded every outcome scores ``0.0``.

    NOTE(review): the encoder presumably truncates long inputs to its
    maximum sequence length, so only the beginning of a long handout may
    influence the score - confirm, and consider chunking the content.
    """
    if not lo_list:
        return []
    try:
        # The content embedding is the same for every outcome, so compute
        # it once instead of re-encoding the whole handout per outcome.
        content_embed = model.encode(content, convert_to_tensor=True)
    except Exception:
        return [0.0] * len(lo_list)

    scores = []
    for lo in lo_list:
        try:
            lo_embed = model.encode(lo, convert_to_tensor=True)
            sim = util.pytorch_cos_sim(lo_embed, content_embed).item()
            scores.append(round(sim, 2))
        except Exception:
            # Keep the list aligned with lo_list even if one outcome fails.
            scores.append(0.0)
    return scores
def content_change_score(text1, text2):
    """Estimate, as a percentage, how much *text2* differs from *text1*.

    Both texts are normalised with ``normalize_text`` and compared
    character by character with ``difflib.SequenceMatcher``; the result is
    ``(1 - ratio) * 100`` rounded to two decimals, so ``0.0`` means
    identical after normalisation.

    Returns:
        The change percentage, or ``100.0`` if the comparison fails (for
        example when an argument is not a string).

    NOTE(review): SequenceMatcher runs with its default ``autojunk``
    heuristic, which may distort the ratio on long inputs, and a
    character-level comparison of whole documents can be slow - confirm
    that this is acceptable for typical handout sizes.
    """
    try:
        matcher = SequenceMatcher(
            None, normalize_text(text1), normalize_text(text2)
        )
        return round((1 - matcher.ratio()) * 100, 2)
    except Exception:
        # ``Exception`` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        return 100.0
def structure_score(text):
    """Return a 0-10 heuristic score for how structured *text* looks.

    Two independent signals are worth 5 points each:

    * the phrase "table of contents" appears (case-insensitive);
    * more than 10 lines start with a bullet marker (``-``, ``•`` or
      ``*``), optionally preceded by whitespace.
    """
    toc_score = 5 if "table of contents" in text.lower() else 0
    # Anchor on line starts (MULTILINE) rather than on a preceding "\n",
    # so a bullet on the very first line of the text is counted too.
    bullet_lines = re.findall(r"^\s*[-•*]", text, flags=re.MULTILINE)
    bullet_score = 5 if len(bullet_lines) > 10 else 0
    return toc_score + bullet_score
def calculate_weighted_score(content_diff, improved_los, total_los, struct_score=10, struct_max=10):
    """Combine the three sub-scores into one 0-100 weighted percentage.

    Weights: 60% share of improved learning outcomes, 30% content change,
    10% structure.

    Args:
        content_diff: Content change estimate on a 0-100 scale.
        improved_los: Number of learning outcomes that improved.
        total_los: Total number of learning outcomes; ``0`` makes the
            learning-outcome component ``0``.
        struct_score: Structure score on a ``0..struct_max`` scale.
        struct_max: Upper bound of ``struct_score`` (default 10).

    Returns:
        The weighted score rounded to two decimals.
    """
    lo_component = (improved_los / total_los) * 100 if total_los > 0 else 0
    # Bring the structure score onto the same 0-100 scale as the other two
    # components; used raw, its 10% weight could add at most 1 point and
    # the total could never exceed 91.
    struct_component = (struct_score / struct_max) * 100 if struct_max > 0 else 0
    return round(0.6 * lo_component + 0.3 * content_diff + 0.1 * struct_component, 2)
 def _plot_lo_scores(old_scores, new_scores):
     """Build the grouped Old-vs-New bar chart, one pair of bars per LO."""
     x = np.arange(len(old_scores))
     width = 0.35
     fig, ax = plt.subplots()
     ax.bar(x - width/2, old_scores, width, label='Old')
     ax.bar(x + width/2, new_scores, width, label='New')
     ax.set_ylabel('Match Score (0-1)')
     ax.set_title('LO-wise Match Score: Old vs New')
     ax.set_xticks(x)
     ax.set_xticklabels([f"LO{i+1}" for i in range(len(old_scores))], rotation=45)
     ax.legend()
     fig.tight_layout()
     # pyplot keeps every figure it creates registered until it is closed;
     # in a long-running app that leaks one figure per request. Closing
     # only deregisters it - the Figure object itself is still returned.
     plt.close(fig)
     return fig


 def compare_handouts(old_pdf, new_pdf, lo_file):
     """Compare two handout PDFs against a list of learning outcomes.

     Args:
         old_pdf: Raw bytes of the old handout PDF.
         new_pdf: Raw bytes of the new handout PDF.
         lo_file: Raw bytes of the .docx holding the learning outcomes,
             one per non-empty paragraph.

     Returns:
         A ``(summary, figure)`` tuple. ``summary`` is the multi-line text
         report. ``figure`` is a bar chart of the per-outcome match scores
         for both handouts, or ``None`` when fewer than 200 characters
         could be extracted from either PDF or no outcomes were found.
     """
     old_text = extract_text_from_pdf(old_pdf)
     new_text = extract_text_from_pdf(new_pdf)
     if len(old_text.strip()) < 200 or len(new_text.strip()) < 200:
         return "⚠️ Could not extract meaningful content from one or both PDFs.", None

     lo_list = extract_text_from_docx(lo_file)
     if not lo_list:
         return "⚠️ No learning outcomes detected.", None

     old_scores = semantic_match(lo_list, old_text)
     new_scores = semantic_match(lo_list, new_text)
     content_diff = content_change_score(old_text, new_text)
     # An outcome counts as improved only when its score strictly increased.
     improved_count = sum(n > o for n, o in zip(new_scores, old_scores))
     struct_score = structure_score(new_text)
     weighted_score = calculate_weighted_score(content_diff, improved_count, len(lo_list), struct_score)

     summary = f"🧠 Improved LOs: {improved_count} / {len(lo_list)}\n"
     summary += f"📄 Content Change Estimate: {content_diff}%\n"
     summary += f"🏗️ Structure Score: {struct_score}/10\n"
     summary += f"🔢 Final Weighted Score: {weighted_score}%\n"
     if improved_count > 0:
         summary += "\n🟢 Summary: New handout better aligns with LOs and has improved clarity."
     else:
         summary += "\n⚠️ Summary: No significant LO improvement detected."

     return summary, _plot_lo_scores(old_scores, new_scores)
 # Gradio UI: three uploads in, one text summary and one chart out.
 with gr.Blocks() as demo:
     # Heading and a one-line usage hint.
     gr.Markdown("📘 **Educational Content Comparator - Weighted Analysis**")
     gr.Markdown("Upload old & new handouts + Learning Outcomes (.docx) to evaluate change & alignment.")

     # Uploads are handed to the callback as raw bytes (type="binary").
     with gr.Row():
         old_pdf = gr.File(
             label="📂 Upload Old PDF",
             file_types=[".pdf"],
             type="binary",
         )
         new_pdf = gr.File(
             label="📂 Upload New PDF",
             file_types=[".pdf"],
             type="binary",
         )
         lo_file = gr.File(
             label="📂 Upload Learning Outcomes (.docx)",
             file_types=[".docx"],
             type="binary",
         )

     # Action buttons.
     with gr.Row():
         btn = gr.Button("Submit")
         clear_btn = gr.Button("Clear")

     # Result widgets.
     output_text = gr.Textbox(label="📋 Summary", lines=6)
     output_plot = gr.Plot(label="📊 LO Match Chart")

     # Submit runs the comparison; Clear blanks the two result widgets only
     # (the uploaded files are left in place).
     btn.click(
         fn=compare_handouts,
         inputs=[old_pdf, new_pdf, lo_file],
         outputs=[output_text, output_plot],
     )
     clear_btn.click(
         fn=lambda: ("", None),
         inputs=[],
         outputs=[output_text, output_plot],
     )

 demo.launch()