Spaces:

Deevyankar
/

Handouts

Sleeping

App Files Community

Deevyankar committed on Sep 21, 2025

Commit

a481d53

verified ·

1 Parent(s): ba00bbc

Update app.py

Browse files

Files changed (1) hide show

app.py +50 -40

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import fitz  # PyMuPDF
 import docx
 import io
 import re
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sentence_transformers import SentenceTransformer, util
 import matplotlib.pyplot as plt
@@ -20,7 +21,7 @@ def extract_text_from_pdf(pdf_file):
             text += page.get_text()
         pdf_reader.close()
         return text.strip()
-    except Exception as e:
         return ""
 def normalize_text(text):
@@ -29,24 +30,17 @@ def normalize_text(text):
 def extract_text_from_docx(docx_file):
     try:
         doc = docx.Document(io.BytesIO(docx_file))
-        full_text = []
-        for para in doc.paragraphs:
-            if para.text.strip():
-                full_text.append(para.text.strip())
-        return full_text
     except:
         return []
 def semantic_match(lo_list, content):
     scores = []
     for lo in lo_list:
-        try:
-            lo_embed = model.encode(lo, convert_to_tensor=True)
-            content_embed = model.encode(content, convert_to_tensor=True)
-            sim = util.pytorch_cos_sim(lo_embed, content_embed).item()
-            scores.append(round(sim, 2))
-        except:
-            scores.append(0.0)
     return scores
 def content_change_score(text1, text2):
@@ -61,62 +55,78 @@ def compare_handouts(old_pdf, new_pdf, lo_file):
     new_text = extract_text_from_pdf(new_pdf)
     if len(old_text.strip()) < 200 or len(new_text.strip()) < 200:
-        return "⚠️ Could not extract meaningful content from one or both PDFs.", None
     lo_list = extract_text_from_docx(lo_file)
     if not lo_list:
-        return "⚠️ No learning outcomes detected.", None
     old_scores = semantic_match(lo_list, old_text)
     new_scores = semantic_match(lo_list, new_text)
-    content_diff_percent = content_change_score(old_text, new_text)
-    lo_improvements = [round(new - old, 2) for new, old in zip(new_scores, old_scores)]
-    improved_count = sum([diff > 0.01 for diff in lo_improvements])
-    improved_total = sum([diff for diff in lo_improvements if diff > 0])
-    lo_change_percent = round((improved_total / len(lo_list)) * 100, 2)
-    summary = f"🧠 Improved LOs: {improved_count} / {len(lo_list)} (Total improvement score: {improved_total})\n"
-    summary += f"📄 Content Change Estimate: {content_diff_percent}%\n"
-    summary += f"🎯 LO Change Estimate: {lo_change_percent}%\n"
     if improved_count > 0:
         summary += "🟢 Summary: New handout better aligns with LOs and has improved clarity."
     else:
         summary += "⚠️ Summary: No significant improvement in LO alignment."
-    # Plot
     x = np.arange(len(lo_list))
     width = 0.35
-    fig, ax = plt.subplots()
     ax.bar(x - width/2, old_scores, width, label='Old')
     ax.bar(x + width/2, new_scores, width, label='New')
     ax.set_ylabel('Match Score (0-1)')
-    ax.set_title('LO-wise Match Score: Old vs New')
     ax.set_xticks(x)
     ax.set_xticklabels([f"LO{i+1}" for i in range(len(lo_list))], rotation=45)
     ax.legend()
     plt.tight_layout()
-    return summary, fig
 with gr.Blocks() as demo:
-    gr.Markdown("📘 **Educational Content Comparator**")
-    gr.Markdown("Upload 2 handouts and a .docx file of Learning Outcomes to compare changes and alignment.")
     with gr.Row():
-        old_pdf = gr.File(label="📂 Upload Old PDF", file_types=[".pdf"], type="binary")
-        new_pdf = gr.File(label="📂 Upload New PDF", file_types=[".pdf"], type="binary")
-        lo_file = gr.File(label="📂 Upload Learning Outcomes (.docx)", file_types=[".docx"], type="binary")
     with gr.Row():
-        btn = gr.Button("Submit")
-        clear_btn = gr.Button("Clear")
-    output_text = gr.Textbox(label="📋 Summary", lines=6, interactive=False)
-    output_plot = gr.Plot(label="📊 LO Match Chart")
-    btn.click(fn=compare_handouts, inputs=[old_pdf, new_pdf, lo_file], outputs=[output_text, output_plot])
-    clear_btn.click(fn=lambda: ("", None), inputs=[], outputs=[output_text, output_plot])
 demo.launch()

 import docx
 import io
 import re
+import pandas as pd
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sentence_transformers import SentenceTransformer, util
 import matplotlib.pyplot as plt
             text += page.get_text()
         pdf_reader.close()
         return text.strip()
+    except Exception:
         return ""
 def normalize_text(text):
 def extract_text_from_docx(docx_file):
     try:
         doc = docx.Document(io.BytesIO(docx_file))
+        return [p.text.strip() for p in doc.paragraphs if p.text.strip()]
     except:
         return []
 def semantic_match(lo_list, content):
     scores = []
+    content_embed = model.encode(content, convert_to_tensor=True)
     for lo in lo_list:
+        lo_embed = model.encode(lo, convert_to_tensor=True)
+        sim = util.pytorch_cos_sim(lo_embed, content_embed).item()
+        scores.append(round(sim, 3))
     return scores
 def content_change_score(text1, text2):
     new_text = extract_text_from_pdf(new_pdf)
     if len(old_text.strip()) < 200 or len(new_text.strip()) < 200:
+        return "❌ Could not extract text from one or both PDFs.", None, None
     lo_list = extract_text_from_docx(lo_file)
     if not lo_list:
+        return "⚠️ No learning outcomes detected.", None, None
     old_scores = semantic_match(lo_list, old_text)
     new_scores = semantic_match(lo_list, new_text)
+    improvement = [round(n - o, 3) for n, o in zip(new_scores, old_scores)]
+    improved_count = sum([i > 0 for i in improvement])
+    # Prepare Excel output
+    df = pd.DataFrame({
+        "Learning Outcome": lo_list,
+        "Old Match Score": old_scores,
+        "New Match Score": new_scores,
+        "Improvement": improvement
+    })
+    excel_path = "/mnt/data/LO_Comparison_Report.xlsx"
+    df.to_excel(excel_path, index=False)
+    # Scores
+    content_diff = content_change_score(old_text, new_text)
+    lo_improvement_percent = round((sum(improvement) / len(lo_list)) * 100, 2)
+    summary = (
+        f"🧠 Improved LOs: {improved_count} / {len(lo_list)}\n"
+        f"📄 Content Change Estimate: {content_diff}%\n"
+        f"📊 Avg LO Improvement Score: {lo_improvement_percent}%\n\n"
+    )
     if improved_count > 0:
         summary += "🟢 Summary: New handout better aligns with LOs and has improved clarity."
     else:
         summary += "⚠️ Summary: No significant improvement in LO alignment."
+    # Bar chart
     x = np.arange(len(lo_list))
     width = 0.35
+    fig, ax = plt.subplots(figsize=(10, 5))
     ax.bar(x - width/2, old_scores, width, label='Old')
     ax.bar(x + width/2, new_scores, width, label='New')
     ax.set_ylabel('Match Score (0-1)')
+    ax.set_title('LO-wise Match Score Comparison')
     ax.set_xticks(x)
     ax.set_xticklabels([f"LO{i+1}" for i in range(len(lo_list))], rotation=45)
     ax.legend()
     plt.tight_layout()
+    return summary, fig, excel_path
 with gr.Blocks() as demo:
+    gr.Markdown("### 📘 Educational Handout Comparison Tool")
+    gr.Markdown("Upload an old and new handout PDF, along with Learning Outcomes (.docx), to compare updates.")
     with gr.Row():
+        old_pdf = gr.File(label="📂 Old Handout", file_types=[".pdf"], type="binary")
+        new_pdf = gr.File(label="📂 New Handout", file_types=[".pdf"], type="binary")
+        lo_file = gr.File(label="📂 Learning Outcomes (.docx)", file_types=[".docx"], type="binary")
     with gr.Row():
+        btn = gr.Button("🔍 Compare")
+        clear_btn = gr.Button("♻️ Clear")
+    summary_out = gr.Textbox(label="📋 Summary", lines=6, interactive=False)
+    plot_out = gr.Plot(label="📊 LO Score Chart")
+    download_link = gr.File(label="📥 Download Excel Report")
+    btn.click(fn=compare_handouts, inputs=[old_pdf, new_pdf, lo_file],
+              outputs=[summary_out, plot_out, download_link])
+    clear_btn.click(fn=lambda: ("", None, None),
+                    inputs=[], outputs=[summary_out, plot_out, download_link])
 demo.launch()