Spaces:

Deevyankar
/

Handouts

Sleeping

App Files Files Community

Deevyankar committed on Sep 22, 2025

Commit

3378b7b

verified ·

1 Parent(s): 8751b01

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -34

app.py CHANGED Viewed

@@ -1,4 +1,5 @@
 import gradio as gr
 from PyPDF2 import PdfReader
 from pdf2image import convert_from_bytes
@@ -7,11 +8,13 @@ from PIL import Image
 import io
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 import matplotlib.pyplot as plt
 import pandas as pd
-from sentence_transformers import SentenceTransformer
-model = SentenceTransformer('all-MiniLM-L6-v2')
 def extract_text_from_pdf(pdf_file):
     try:
@@ -44,6 +47,21 @@ def semantic_match(lo_list, content):
     scores = [cosine_similarity([content_vec], [vec])[0][0] for vec in vectors[1:]]
     return scores
 def compare_all(old_pdf, new_pdf, lo_file):
     try:
         los = lo_file.decode("utf-8", errors="ignore").splitlines()
@@ -57,22 +75,12 @@ def compare_all(old_pdf, new_pdf, lo_file):
     if not old_text.strip() or not new_text.strip():
         return "❌ Could not extract text from one or both PDFs.", None, None
-    # Similarity Calculations
-    tfidf_vectorizer = TfidfVectorizer().fit_transform([old_text, new_text])
-    tfidf_score = cosine_similarity([tfidf_vectorizer.toarray()[0]], [tfidf_vectorizer.toarray()[1]])[0][0] * 100
-    embed_old = model.encode(old_text, convert_to_tensor=True)
-    embed_new = model.encode(new_text, convert_to_tensor=True)
-    semantic_score = float(cosine_similarity([embed_old], [embed_new])[0][0]) * 100
-    # LO Scores
     old_scores = semantic_match(los, old_text)
     new_scores = semantic_match(los, new_text)
-    # Bar Plot
     labels = [f"LO{i+1}" for i in range(len(los))]
     x = range(len(labels))
-    fig, ax = plt.subplots(figsize=(10, 5))
     ax.bar(x, old_scores, width=0.4, label="Old", align='center')
     ax.bar([i + 0.4 for i in x], new_scores, width=0.4, label="New", align='center')
     ax.set_xticks([i + 0.2 for i in x])
@@ -90,25 +98,29 @@ def compare_all(old_pdf, new_pdf, lo_file):
     }
     df = pd.DataFrame(data)
-    # Insight Generation
-    lo_diff = sum(new_scores) - sum(old_scores)
-    if abs(lo_diff) < 0.01:
-        insight = "⚪ No significant change in alignment with learning outcomes."
-    elif lo_diff > 0:
-        insight = "🟢 New content appears more aligned with outcomes."
-    else:
-        insight = "🔴 New content appears less aligned with outcomes."
-    matched_lo = sum(1 for s in new_scores if s >= 0.5)
-    total_lo = len(los)
-    summary = f"📘 Summary of Comparison\n\n"
-    summary += f"🔹 Semantic Similarity (Transformer): {round(semantic_score, 2)}%\n"
-    summary += f"🔹 Structural Similarity (TF-IDF): {round(tfidf_score, 2)}%\n\n"
-    summary += f"🎯 Learning Outcome Matches: {matched_lo} of {total_lo}\n"
-    summary += insight
-    return summary, df, fig
 iface = gr.Interface(
     fn=compare_all,
@@ -118,12 +130,12 @@ iface = gr.Interface(
         gr.File(label="Learning Outcomes (Text File)", type='binary'),
     ],
     outputs=[
-        gr.Textbox(label="Summary & Insights", lines=10),
-        gr.Dataframe(label="LO-wise Comparison Table"),
-        gr.Plot(label="Visual Comparison Chart")
     ],
-    title="📘 Handout Comparator + LO Analysis (Dual Similarity)",
-    description="Upload old/new handouts + Learning Outcomes file (TXT). See content diff, LO alignment, and dual similarity scoring (TF-IDF + Transformers)."
 )
 iface.launch()

 import gradio as gr
 from PyPDF2 import PdfReader
 from pdf2image import convert_from_bytes
 import io
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
+from transformers import pipeline
 import matplotlib.pyplot as plt
 import pandas as pd
+from difflib import SequenceMatcher
+# Load transformer model for semantic similarity
+semantic_pipeline = pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2")
 def extract_text_from_pdf(pdf_file):
     try:
     scores = [cosine_similarity([content_vec], [vec])[0][0] for vec in vectors[1:]]
     return scores
def compute_difference_and_text_change(old_text, new_text):
    """Measure how much the text changed between two document versions.

    Args:
        old_text: Text of the original document.
        new_text: Text of the revised document.

    Returns:
        A ``(difference_percentage, length_change)`` tuple:

        * ``difference_percentage`` -- ``(1 - SequenceMatcher ratio) * 100``,
          rounded to two decimals (0.0 means identical text).
        * ``length_change`` -- relative change in whitespace-separated word
          count, as a percentage of the old word count, rounded to two
          decimals. When the old text contains no words the relative change
          is undefined: it is reported as ``0.0`` if the new text is also
          empty and ``float("inf")`` otherwise.
    """
    similarity = SequenceMatcher(None, old_text, new_text).ratio()
    difference_percentage = round((1 - similarity) * 100, 2)

    len_old = len(old_text.split())
    len_new = len(new_text.split())
    if len_old == 0:
        # Guard the division: an empty (or whitespace-only) old text would
        # otherwise raise ZeroDivisionError.
        length_change = 0.0 if len_new == 0 else float("inf")
    else:
        length_change = round(((len_new - len_old) / len_old) * 100, 2)
    return difference_percentage, length_change
def transformer_similarity(text1, text2):
    """Return the embedding-based similarity of two texts as a percentage.

    Each text is passed through the module-level ``semantic_pipeline``
    (a feature-extraction pipeline), the per-token vectors are mean-pooled
    into one vector per text, and the cosine similarity of the two pooled
    vectors is scaled to 0-100 and rounded to two decimals.

    Args:
        text1: First text to compare.
        text2: Second text to compare.

    Returns:
        Cosine similarity of the two pooled embeddings multiplied by 100,
        rounded to two decimals.
    """

    def _embed(text):
        # truncation=True keeps long documents within the model's maximum
        # sequence length instead of failing on over-long input.
        token_vectors = semantic_pipeline(text, truncation=True)[0]
        # Mean-pool over tokens: a single token's vector (e.g. the first one)
        # is not a representation of the whole text for this model family.
        count = len(token_vectors)
        return [sum(dim) / count for dim in zip(*token_vectors)]

    sim = cosine_similarity([_embed(text1)], [_embed(text2)])[0][0]
    return round(sim * 100, 2)
 def compare_all(old_pdf, new_pdf, lo_file):
     try:
         los = lo_file.decode("utf-8", errors="ignore").splitlines()
     if not old_text.strip() or not new_text.strip():
         return "❌ Could not extract text from one or both PDFs.", None, None
     old_scores = semantic_match(los, old_text)
     new_scores = semantic_match(los, new_text)
     labels = [f"LO{i+1}" for i in range(len(los))]
     x = range(len(labels))
+    fig, ax = plt.subplots()
     ax.bar(x, old_scores, width=0.4, label="Old", align='center')
     ax.bar([i + 0.4 for i in x], new_scores, width=0.4, label="New", align='center')
     ax.set_xticks([i + 0.2 for i in x])
     }
     df = pd.DataFrame(data)
+    # Calculate metrics
+    tfidf_similarity = round(cosine_similarity(
+        [TfidfVectorizer().fit_transform([old_text, new_text]).toarray()[0]],
+        [TfidfVectorizer().fit_transform([old_text, new_text]).toarray()[1]]
+    )[0][0] * 100, 2)
+    diff_pct, length_delta = compute_difference_and_text_change(old_text, new_text)
+    transformer_sim = transformer_similarity(old_text, new_text)
+    summary = f"""
+📘 **Summary of Comparison**
+📈 **TF-IDF Similarity**: {tfidf_similarity}%
+🤖 **Transformer Similarity**: {transformer_sim}%
+🔄 **Textual Change** (Diff-based): {diff_pct}%
+📏 **Text Length Change**: {length_delta}% (words)
+🎯 **Learning Outcome Matches**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}
+🧠 **Insight**: New content appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with the learning outcomes.
+💬 **Tip**: Diff > 30% or word increase > 20% generally reflects real updates.
+"""
+    return summary.strip(), df, fig
 iface = gr.Interface(
     fn=compare_all,
         gr.File(label="Learning Outcomes (Text File)", type='binary'),
     ],
     outputs=[
+        gr.Textbox(label="📊 Summary Report", lines=12),
+        gr.Dataframe(label="📋 LO-wise Comparison Table"),
+        gr.Plot(label="📈 LO Match Chart")
     ],
+    title="📘 Handout Comparator + LO Analyzer (with AI)",
+    description="Compare two handouts and learning outcomes. View similarity via TF-IDF and Transformers. Bar chart and table included."
 )
 iface.launch()