Spaces:

Deevyankar
/

Handouts

Sleeping

App Files Files Community

Deevyankar committed on Sep 22, 2025

Commit

8751b01

verified ·

1 Parent(s): bc78525

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -35

app.py CHANGED Viewed

@@ -1,13 +1,17 @@
 import gradio as gr
 from PyPDF2 import PdfReader
 import io
 import matplotlib.pyplot as plt
 import pandas as pd
-from sentence_transformers import SentenceTransformer, util
-# Load sentence transformer model
-model = SentenceTransformer("all-MiniLM-L6-v2")
 def extract_text_from_pdf(pdf_file):
     try:
@@ -17,22 +21,28 @@ def extract_text_from_pdf(pdf_file):
             text = page.extract_text()
             if text:
                 full_text += text
-        return full_text
     except Exception as e:
         print("Text extraction failed:", e)
-        return ""
-def semantic_similarity_score(text1, text2):
-    embeddings = model.encode([text1, text2], convert_to_tensor=True)
-    score = util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()
-    return score
-def semantic_match_los(lo_list, content):
-    lo_scores = []
-    for lo in lo_list:
-        score = semantic_similarity_score(lo, content)
-        lo_scores.append(score)
-    return lo_scores
 def compare_all(old_pdf, new_pdf, lo_file):
     try:
@@ -47,12 +57,22 @@ def compare_all(old_pdf, new_pdf, lo_file):
     if not old_text.strip() or not new_text.strip():
         return "❌ Could not extract text from one or both PDFs.", None, None
-    old_scores = semantic_match_los(los, old_text)
-    new_scores = semantic_match_los(los, new_text)
     labels = [f"LO{i+1}" for i in range(len(los))]
     x = range(len(labels))
-    fig, ax = plt.subplots()
     ax.bar(x, old_scores, width=0.4, label="Old", align='center')
     ax.bar([i + 0.4 for i in x], new_scores, width=0.4, label="New", align='center')
     ax.set_xticks([i + 0.2 for i in x])
@@ -61,22 +81,32 @@ def compare_all(old_pdf, new_pdf, lo_file):
     ax.set_title("Learning Outcomes Comparison")
     ax.legend()
-    df = pd.DataFrame({
         "Learning Outcome": labels,
         "Old Match": [round(s*100, 2) for s in old_scores],
         "New Match": [round(s*100, 2) for s in new_scores],
-        "Change (%)": [round((n - o)*100, 2) for o, n in zip(old_scores, new_scores)]
-    })
-    content_sim = semantic_similarity_score(old_text, new_text)
-    content_change_pct = round((1 - content_sim) * 100, 2)
-    summary = f"📘 **Summary of Comparison**\n\n"
-    summary += f"📈 **Overall Content Change**: {content_change_pct}%\n"
-    summary += f"🔍 This percentage shows how much the content has changed using a semantic model (SentenceTransformer).\n\n"
-    summary += f"🎯 **Learning Outcome Matches**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}\n"
-    summary += f"🧠 The system checks how well the new content aligns with each LO (threshold: 0.5 semantic match).\n\n"
-    summary += f"🟢 **Insight**: New content appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with the learning outcomes.\n"
     return summary, df, fig
@@ -88,13 +118,12 @@ iface = gr.Interface(
         gr.File(label="Learning Outcomes (Text File)", type='binary'),
     ],
     outputs=[
-        gr.Textbox(label="Summary & Insights", lines=10, max_lines=30),
         gr.Dataframe(label="LO-wise Comparison Table"),
         gr.Plot(label="Visual Comparison Chart")
     ],
-    title="📘 Handout Comparator + LO Analysis (Semantic Transformer)",
-    description="Upload old/new handouts + Learning Outcomes (TXT). Compares content and outcome match using a transformer model (MiniLM)."
 )
 iface.launch()

 import gradio as gr
 from PyPDF2 import PdfReader
+from pdf2image import convert_from_bytes
+import pytesseract
+from PIL import Image
 import io
+from sklearn.feature_extraction.text import TfidfVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
 import matplotlib.pyplot as plt
 import pandas as pd
+from sentence_transformers import SentenceTransformer
+model = SentenceTransformer('all-MiniLM-L6-v2')
 def extract_text_from_pdf(pdf_file):
     try:
             text = page.extract_text()
             if text:
                 full_text += text
+        if full_text.strip():
+            return full_text
     except Exception as e:
         print("Text extraction failed:", e)
+    try:
+        images = convert_from_bytes(pdf_file)
+        text = ""
+        for img in images:
+            text += pytesseract.image_to_string(img)
+        return text
+    except Exception as e:
+        print("OCR failed:", e)
+        return ""
+def semantic_match(lo_list, content):
+    lo_texts = [lo for lo in lo_list if lo.strip()]
+    vectorizer = TfidfVectorizer().fit_transform([content] + lo_texts)
+    vectors = vectorizer.toarray()
+    content_vec = vectors[0]
+    scores = [cosine_similarity([content_vec], [vec])[0][0] for vec in vectors[1:]]
+    return scores
 def compare_all(old_pdf, new_pdf, lo_file):
     try:
     if not old_text.strip() or not new_text.strip():
         return "❌ Could not extract text from one or both PDFs.", None, None
+    # Similarity Calculations
+    tfidf_vectorizer = TfidfVectorizer().fit_transform([old_text, new_text])
+    tfidf_score = cosine_similarity([tfidf_vectorizer.toarray()[0]], [tfidf_vectorizer.toarray()[1]])[0][0] * 100
+    embed_old = model.encode(old_text, convert_to_tensor=True)
+    embed_new = model.encode(new_text, convert_to_tensor=True)
+    semantic_score = float(cosine_similarity([embed_old], [embed_new])[0][0]) * 100
+    # LO Scores
+    old_scores = semantic_match(los, old_text)
+    new_scores = semantic_match(los, new_text)
+    # Bar Plot
     labels = [f"LO{i+1}" for i in range(len(los))]
     x = range(len(labels))
+    fig, ax = plt.subplots(figsize=(10, 5))
     ax.bar(x, old_scores, width=0.4, label="Old", align='center')
     ax.bar([i + 0.4 for i in x], new_scores, width=0.4, label="New", align='center')
     ax.set_xticks([i + 0.2 for i in x])
     ax.set_title("Learning Outcomes Comparison")
     ax.legend()
+    # Table
+    data = {
         "Learning Outcome": labels,
         "Old Match": [round(s*100, 2) for s in old_scores],
         "New Match": [round(s*100, 2) for s in new_scores],
+        "Change (%)": [round((new - old)*100, 2) for new, old in zip(new_scores, old_scores)]
+    }
+    df = pd.DataFrame(data)
+    # Insight Generation
+    lo_diff = sum(new_scores) - sum(old_scores)
+    if abs(lo_diff) < 0.01:
+        insight = "⚪ No significant change in alignment with learning outcomes."
+    elif lo_diff > 0:
+        insight = "🟢 New content appears more aligned with outcomes."
+    else:
+        insight = "🔴 New content appears less aligned with outcomes."
+    matched_lo = sum(1 for s in new_scores if s >= 0.5)
+    total_lo = len(los)
+    summary = f"📘 Summary of Comparison\n\n"
+    summary += f"🔹 Semantic Similarity (Transformer): {round(semantic_score, 2)}%\n"
+    summary += f"🔹 Structural Similarity (TF-IDF): {round(tfidf_score, 2)}%\n\n"
+    summary += f"🎯 Learning Outcome Matches: {matched_lo} of {total_lo}\n"
+    summary += insight
     return summary, df, fig
         gr.File(label="Learning Outcomes (Text File)", type='binary'),
     ],
     outputs=[
+        gr.Textbox(label="Summary & Insights", lines=10),
         gr.Dataframe(label="LO-wise Comparison Table"),
         gr.Plot(label="Visual Comparison Chart")
     ],
+    title="📘 Handout Comparator + LO Analysis (Dual Similarity)",
+    description="Upload old/new handouts + Learning Outcomes file (TXT). See content diff, LO alignment, and dual similarity scoring (TF-IDF + Transformers)."
 )
 iface.launch()