Deevyankar committed on
Commit
bc78525
·
verified ·
1 Parent(s): ef62d76

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -61
app.py CHANGED
@@ -1,16 +1,13 @@
1
 
2
  import gradio as gr
3
  from PyPDF2 import PdfReader
4
- from pdf2image import convert_from_bytes
5
- import pytesseract
6
- from PIL import Image
7
  import io
8
- from sklearn.feature_extraction.text import TfidfVectorizer
9
- from sklearn.metrics.pairwise import cosine_similarity
10
  import matplotlib.pyplot as plt
 
11
  from sentence_transformers import SentenceTransformer, util
12
 
13
- import pandas as pd
 
14
 
15
  def extract_text_from_pdf(pdf_file):
16
  try:
@@ -20,35 +17,22 @@ def extract_text_from_pdf(pdf_file):
20
  text = page.extract_text()
21
  if text:
22
  full_text += text
23
- if full_text.strip():
24
- return full_text
25
  except Exception as e:
26
  print("Text extraction failed:", e)
27
-
28
- try:
29
- images = convert_from_bytes(pdf_file)
30
- text = ""
31
- for img in images:
32
- text += pytesseract.image_to_string(img)
33
- return text
34
- except Exception as e:
35
- print("OCR failed:", e)
36
  return ""
37
 
38
- def semantic_match(lo_list, content):
39
- lo_texts = [lo for lo in lo_list if lo.strip()]
40
- vectorizer = TfidfVectorizer().fit_transform([content] + lo_texts)
41
- vectors = vectorizer.toarray()
42
- content_vec = vectors[0]
43
- scores = [cosine_similarity([content_vec], [vec])[0][0] for vec in vectors[1:]]
44
- return scores
45
-
46
def compute_change_percentage(old_text, new_text):
    """Return how much *new_text* differs from *old_text*, as a percentage.

    Computed as ``(1 - cosine_similarity) * 100`` over TF-IDF vectors,
    rounded to two decimals: 0.0 for identical content, approaching
    100.0 as the two texts diverge completely.
    """
    tfidf = TfidfVectorizer().fit_transform([old_text, new_text])
    similarity = cosine_similarity(tfidf[0], tfidf[1])[0][0]
    return round((1 - similarity) * 100, 2)
52
 
53
  def compare_all(old_pdf, new_pdf, lo_file):
54
  try:
@@ -63,8 +47,8 @@ def compare_all(old_pdf, new_pdf, lo_file):
63
  if not old_text.strip() or not new_text.strip():
64
  return "❌ Could not extract text from one or both PDFs.", None, None
65
 
66
- old_scores = semantic_match(los, old_text)
67
- new_scores = semantic_match(los, new_text)
68
 
69
  labels = [f"LO{i+1}" for i in range(len(los))]
70
  x = range(len(labels))
@@ -77,35 +61,22 @@ def compare_all(old_pdf, new_pdf, lo_file):
77
  ax.set_title("Learning Outcomes Comparison")
78
  ax.legend()
79
 
80
- data = {
81
  "Learning Outcome": labels,
82
  "Old Match": [round(s*100, 2) for s in old_scores],
83
  "New Match": [round(s*100, 2) for s in new_scores],
84
- "Change (%)": [round((new - old)*100, 2) for new, old in zip(new_scores, old_scores)]
85
- }
86
- df = pd.DataFrame(data)
87
 
88
- change_percentage = compute_change_percentage(old_text, new_text)
89
- matched_los = sum(1 for s in new_scores if s >= 0.5)
90
 
91
-
92
- # Overall content change using cosine similarity
93
- vectorizer = TfidfVectorizer()
94
- vectors = vectorizer.fit_transform([old_text, new_text])
95
- cos_sim = cosine_similarity(vectors[0], vectors[1])[0][0]
96
- change_percentage = round((1 - cos_sim) * 100, 2)
97
-
98
- summary = f"""πŸ“˜ **Summary of Comparison**
99
-
100
- πŸ“ˆ **Overall Content Change**: {change_percentage}%
101
- πŸ” This percentage shows how much the content has changed from old to new handouts using TF-IDF vector similarity (cosine distance).
102
-
103
- 🎯 **Learning Outcome Matches**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}
104
- 🧠 The system checks how well the new content aligns with each LO (threshold: 0.5 semantic match).
105
-
106
- 🟒 **Insight**: New content appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with the learning outcomes.
107
-
108
- πŸ’‘ **Note**: Content change > 40% usually indicates significant modifications (new examples, topics, etc.). Review the table and chart below for LO-specific changes."""
109
 
110
  return summary, df, fig
111
 
@@ -117,12 +88,13 @@ iface = gr.Interface(
117
  gr.File(label="Learning Outcomes (Text File)", type='binary'),
118
  ],
119
  outputs=[
120
- gr.Textbox(label="Summary & Insights", lines=15, max_lines=20),
121
  gr.Dataframe(label="LO-wise Comparison Table"),
122
  gr.Plot(label="Visual Comparison Chart")
123
  ],
124
- title="πŸ“˜ Handout Comparator + LO Analysis",
125
- description="Upload old/new handouts + Learning Outcomes file (TXT). See content diff, visual LO match, and semantic change."
126
  )
127
 
128
- iface.launch()
 
 
1
 
2
  import gradio as gr
3
  from PyPDF2 import PdfReader
 
 
 
4
  import io
 
 
5
  import matplotlib.pyplot as plt
6
+ import pandas as pd
7
  from sentence_transformers import SentenceTransformer, util
8
 
9
+ # Load sentence transformer model
10
+ model = SentenceTransformer("all-MiniLM-L6-v2")
11
 
12
  def extract_text_from_pdf(pdf_file):
13
  try:
 
17
  text = page.extract_text()
18
  if text:
19
  full_text += text
20
+ return full_text
 
21
  except Exception as e:
22
  print("Text extraction failed:", e)
 
 
 
 
 
 
 
 
 
23
  return ""
24
 
25
def semantic_similarity_score(text1, text2):
    """Cosine similarity of *text1* and *text2* under the module-level
    SentenceTransformer ``model``; returns a plain float."""
    vecs = model.encode([text1, text2], convert_to_tensor=True)
    return util.pytorch_cos_sim(vecs[0], vecs[1]).item()
29
+
30
def semantic_match_los(lo_list, content):
    """Return one semantic-similarity score (vs. *content*) per learning
    outcome in *lo_list*, preserving input order."""
    return [semantic_similarity_score(lo, content) for lo in lo_list]
 
 
 
36
 
37
  def compare_all(old_pdf, new_pdf, lo_file):
38
  try:
 
47
  if not old_text.strip() or not new_text.strip():
48
  return "❌ Could not extract text from one or both PDFs.", None, None
49
 
50
+ old_scores = semantic_match_los(los, old_text)
51
+ new_scores = semantic_match_los(los, new_text)
52
 
53
  labels = [f"LO{i+1}" for i in range(len(los))]
54
  x = range(len(labels))
 
61
  ax.set_title("Learning Outcomes Comparison")
62
  ax.legend()
63
 
64
+ df = pd.DataFrame({
65
  "Learning Outcome": labels,
66
  "Old Match": [round(s*100, 2) for s in old_scores],
67
  "New Match": [round(s*100, 2) for s in new_scores],
68
+ "Change (%)": [round((n - o)*100, 2) for o, n in zip(old_scores, new_scores)]
69
+ })
 
70
 
71
+ content_sim = semantic_similarity_score(old_text, new_text)
72
+ content_change_pct = round((1 - content_sim) * 100, 2)
73
 
74
+ summary = f"πŸ“˜ **Summary of Comparison**\n\n"
75
+ summary += f"πŸ“ˆ **Overall Content Change**: {content_change_pct}%\n"
76
+ summary += f"πŸ” This percentage shows how much the content has changed using a semantic model (SentenceTransformer).\n\n"
77
+ summary += f"🎯 **Learning Outcome Matches**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}\n"
78
+ summary += f"🧠 The system checks how well the new content aligns with each LO (threshold: 0.5 semantic match).\n\n"
79
+ summary += f"🟒 **Insight**: New content appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with the learning outcomes.\n"
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
  return summary, df, fig
82
 
 
88
  gr.File(label="Learning Outcomes (Text File)", type='binary'),
89
  ],
90
  outputs=[
91
+ gr.Textbox(label="Summary & Insights", lines=10, max_lines=30),
92
  gr.Dataframe(label="LO-wise Comparison Table"),
93
  gr.Plot(label="Visual Comparison Chart")
94
  ],
95
+ title="πŸ“˜ Handout Comparator + LO Analysis (Semantic Transformer)",
96
+ description="Upload old/new handouts + Learning Outcomes (TXT). Compares content and outcome match using a transformer model (MiniLM)."
97
  )
98
 
99
+ iface.launch()
100
+