Deevyankar commited on
Commit
9af4462
·
verified ·
1 Parent(s): 3378b7b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -63
app.py CHANGED
@@ -1,20 +1,14 @@
1
 
2
-
3
  import gradio as gr
4
  from PyPDF2 import PdfReader
5
- from pdf2image import convert_from_bytes
6
- import pytesseract
7
- from PIL import Image
8
  import io
9
  from sklearn.feature_extraction.text import TfidfVectorizer
10
  from sklearn.metrics.pairwise import cosine_similarity
11
- from transformers import pipeline
12
  import matplotlib.pyplot as plt
13
  import pandas as pd
14
- from difflib import SequenceMatcher
 
15
 
16
- # Load transformer model for semantic similarity
17
- semantic_pipeline = pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2")
18
 
19
  def extract_text_from_pdf(pdf_file):
20
  try:
@@ -24,21 +18,12 @@ def extract_text_from_pdf(pdf_file):
24
  text = page.extract_text()
25
  if text:
26
  full_text += text
27
- if full_text.strip():
28
- return full_text
29
- except Exception as e:
30
- print("Text extraction failed:", e)
31
-
32
- try:
33
- images = convert_from_bytes(pdf_file)
34
- text = ""
35
- for img in images:
36
- text += pytesseract.image_to_string(img)
37
- return text
38
  except Exception as e:
39
- print("OCR failed:", e)
40
  return ""
41
 
 
42
  def semantic_match(lo_list, content):
43
  lo_texts = [lo for lo in lo_list if lo.strip()]
44
  vectorizer = TfidfVectorizer().fit_transform([content] + lo_texts)
@@ -47,43 +32,31 @@ def semantic_match(lo_list, content):
47
  scores = [cosine_similarity([content_vec], [vec])[0][0] for vec in vectors[1:]]
48
  return scores
49
 
50
- def compute_difference_and_text_change(old_text, new_text):
51
- similarity = SequenceMatcher(None, old_text, new_text).ratio()
52
- difference_percentage = round((1 - similarity) * 100, 2)
53
-
54
- len_old = len(old_text.split())
55
- len_new = len(new_text.split())
56
- length_change = round(((len_new - len_old) / len_old) * 100, 2)
57
- return difference_percentage, length_change
58
-
59
- def transformer_similarity(text1, text2):
60
- emb1 = semantic_pipeline(text1)[0][0]
61
- emb2 = semantic_pipeline(text2)[0][0]
62
- sim = cosine_similarity([emb1], [emb2])[0][0]
63
- return round(sim * 100, 2)
64
 
65
  def compare_all(old_pdf, new_pdf, lo_file):
66
  try:
67
  los = lo_file.decode("utf-8", errors="ignore").splitlines()
68
  los = [lo.strip() for lo in los if lo.strip()]
69
  except:
70
- return "❌ Could not read learning outcomes file.", None, None
71
 
72
  old_text = extract_text_from_pdf(old_pdf)
73
  new_text = extract_text_from_pdf(new_pdf)
74
 
75
- if not old_text.strip() or not new_text.strip():
76
- return "❌ Could not extract text from one or both PDFs.", None, None
77
 
78
  old_scores = semantic_match(los, old_text)
79
  new_scores = semantic_match(los, new_text)
80
 
81
  labels = [f"LO{i+1}" for i in range(len(los))]
82
- x = range(len(labels))
 
 
83
  fig, ax = plt.subplots()
84
- ax.bar(x, old_scores, width=0.4, label="Old", align='center')
85
- ax.bar([i + 0.4 for i in x], new_scores, width=0.4, label="New", align='center')
86
- ax.set_xticks([i + 0.2 for i in x])
87
  ax.set_xticklabels(labels, rotation=45)
88
  ax.set_ylabel("Semantic Match Score")
89
  ax.set_title("Learning Outcomes Comparison")
@@ -92,35 +65,37 @@ def compare_all(old_pdf, new_pdf, lo_file):
92
  # Table
93
  data = {
94
  "Learning Outcome": labels,
95
- "Old Match": [round(s*100, 2) for s in old_scores],
96
- "New Match": [round(s*100, 2) for s in new_scores],
97
- "Change (%)": [round((new - old)*100, 2) for new, old in zip(new_scores, old_scores)]
98
  }
99
  df = pd.DataFrame(data)
100
 
101
- # Calculate metrics
102
- tfidf_similarity = round(cosine_similarity(
103
- [TfidfVectorizer().fit_transform([old_text, new_text]).toarray()[0]],
104
- [TfidfVectorizer().fit_transform([old_text, new_text]).toarray()[1]]
105
- )[0][0] * 100, 2)
106
 
107
- diff_pct, length_delta = compute_difference_and_text_change(old_text, new_text)
108
- transformer_sim = transformer_similarity(old_text, new_text)
 
 
109
 
110
  summary = f"""
111
  📘 **Summary of Comparison**
112
 
113
- 📈 **TF-IDF Similarity**: {tfidf_similarity}%
114
- 🤖 **Transformer Similarity**: {transformer_sim}%
115
- 🔄 **Textual Change** (Diff-based): {diff_pct}%
116
- πŸ“ **Text Length Change**: {length_delta}% (words)
 
117
 
118
  🎯 **Learning Outcome Matches**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}
119
- 🧠 **Insight**: New content appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with the learning outcomes.
120
- 💬 **Tip**: Diff > 30% or word increase > 20% generally reflects real updates.
121
  """
122
 
123
- return summary.strip(), df, fig
 
124
 
125
  iface = gr.Interface(
126
  fn=compare_all,
@@ -130,12 +105,14 @@ iface = gr.Interface(
130
  gr.File(label="Learning Outcomes (Text File)", type='binary'),
131
  ],
132
  outputs=[
133
- gr.Textbox(label="📊 Summary Report", lines=12),
134
- gr.Dataframe(label="📋 LO-wise Comparison Table"),
135
- gr.Plot(label="📈 LO Match Chart")
 
136
  ],
137
- title="📘 Handout Comparator + LO Analyzer (with AI)",
138
- description="Compare two handouts and learning outcomes. View similarity via TF-IDF and Transformers. Bar chart and table included."
139
  )
140
 
141
  iface.launch()
 
 
1
 
 
2
  import gradio as gr
3
  from PyPDF2 import PdfReader
 
 
 
4
  import io
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
 
7
  import matplotlib.pyplot as plt
8
  import pandas as pd
9
+ import numpy as np
10
+ import re
11
 
 
 
12
 
13
  def extract_text_from_pdf(pdf_file):
14
  try:
 
18
  text = page.extract_text()
19
  if text:
20
  full_text += text
21
+ return full_text.strip()
 
 
 
 
 
 
 
 
 
 
22
  except Exception as e:
23
+ print("PDF extraction error:", e)
24
  return ""
25
 
26
+
27
  def semantic_match(lo_list, content):
28
  lo_texts = [lo for lo in lo_list if lo.strip()]
29
  vectorizer = TfidfVectorizer().fit_transform([content] + lo_texts)
 
32
  scores = [cosine_similarity([content_vec], [vec])[0][0] for vec in vectors[1:]]
33
  return scores
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  def compare_all(old_pdf, new_pdf, lo_file):
37
  try:
38
  los = lo_file.decode("utf-8", errors="ignore").splitlines()
39
  los = [lo.strip() for lo in los if lo.strip()]
40
  except:
41
+ return "❌ Could not read learning outcomes file.", None, None, None
42
 
43
  old_text = extract_text_from_pdf(old_pdf)
44
  new_text = extract_text_from_pdf(new_pdf)
45
 
46
+ if not old_text or not new_text:
47
+ return "❌ Could not extract text from one or both PDFs.", None, None, None
48
 
49
  old_scores = semantic_match(los, old_text)
50
  new_scores = semantic_match(los, new_text)
51
 
52
  labels = [f"LO{i+1}" for i in range(len(los))]
53
+ x = np.arange(len(labels))
54
+
55
+ # Plot
56
  fig, ax = plt.subplots()
57
+ ax.bar(x - 0.2, old_scores, width=0.4, label="Old", align='center')
58
+ ax.bar(x + 0.2, new_scores, width=0.4, label="New", align='center')
59
+ ax.set_xticks(x)
60
  ax.set_xticklabels(labels, rotation=45)
61
  ax.set_ylabel("Semantic Match Score")
62
  ax.set_title("Learning Outcomes Comparison")
 
65
  # Table
66
  data = {
67
  "Learning Outcome": labels,
68
+ "Old Match (%)": [round(s * 100, 2) for s in old_scores],
69
+ "New Match (%)": [round(s * 100, 2) for s in new_scores],
70
+ "Change (%)": [round((new - old) * 100, 2) for new, old in zip(new_scores, old_scores)]
71
  }
72
  df = pd.DataFrame(data)
73
 
74
+ # Content similarity
75
+ tfidf = TfidfVectorizer().fit_transform([old_text, new_text])
76
+ cosine_sim = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0] * 100
77
+ content_diff = 100 - round(cosine_sim, 2)
 
78
 
79
+ # Text size change
80
+ len_old = len(re.findall(r'\w+', old_text))
81
+ len_new = len(re.findall(r'\w+', new_text))
82
+ word_change_percent = round(((len_new - len_old) / len_old) * 100, 2)
83
 
84
  summary = f"""
85
  📘 **Summary of Comparison**
86
 
87
+ 📈 **Overall Content Change**: {content_diff:.2f}%
88
+ πŸ” This is based on TF-IDF cosine similarity between old and new handouts.
89
+
90
+ πŸ“ **Text Length Difference**: {'+' if word_change_percent >= 0 else ''}{word_change_percent:.2f}%
91
+ Compared by total number of words in both handouts.
92
 
93
  🎯 **Learning Outcome Matches**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}
94
+ ✅ New handout appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with stated outcomes.
 
95
  """
96
 
97
+ return summary.strip(), df, fig, "✅ Comparison completed successfully."
98
+
99
 
100
  iface = gr.Interface(
101
  fn=compare_all,
 
105
  gr.File(label="Learning Outcomes (Text File)", type='binary'),
106
  ],
107
  outputs=[
108
+ gr.Textbox(label="📘 Summary & Insights", lines=20, max_lines=25),
109
+ gr.Dataframe(label="📊 LO-wise Comparison Table"),
110
+ gr.Plot(label="📈 LO Visual Comparison"),
111
+ gr.Textbox(label="ℹ️ Status", lines=1)
112
  ],
113
+ title="📘 Handout Comparator + LO Analyzer",
114
+ description="Upload OLD and NEW handouts in PDF format along with a TXT file of Learning Outcomes. The app compares content changes and evaluates alignment with LOs visually and in table format."
115
  )
116
 
117
  iface.launch()
118
+