Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,109 +1,102 @@
|
|
| 1 |
-
|
| 2 |
import gradio as gr
|
| 3 |
from PyPDF2 import PdfReader
|
| 4 |
-
from
|
|
|
|
|
|
|
| 5 |
import matplotlib.pyplot as plt
|
| 6 |
import pandas as pd
|
| 7 |
import numpy as np
|
|
|
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
def extract_text_pdf(file_obj):
|
| 12 |
try:
|
| 13 |
-
|
| 14 |
text = ""
|
| 15 |
-
for page in
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
text += extracted + "\n"
|
| 19 |
-
return text if text.strip() else None
|
| 20 |
except:
|
| 21 |
return None
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
for lo in lo_list:
|
| 31 |
-
score = util.cos_sim(model.encode(lo, convert_to_tensor=True),
|
| 32 |
-
model.encode(text, convert_to_tensor=True))[0][0].item()
|
| 33 |
-
scores.append(round(score * 100, 2))
|
| 34 |
-
return scores
|
| 35 |
|
|
|
|
| 36 |
def compare_all(old_pdf, new_pdf, lo_file):
|
| 37 |
-
old_text =
|
| 38 |
-
new_text =
|
| 39 |
-
|
| 40 |
if not old_text or not new_text:
|
| 41 |
return "β Could not extract text from one or both PDFs.", None, None
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
elif change_percent < 40:
|
| 51 |
-
summary += "π Moderate update."
|
| 52 |
-
else:
|
| 53 |
-
summary += "π Significant changes detected."
|
| 54 |
-
|
| 55 |
-
# LO comparison
|
| 56 |
-
if hasattr(lo_file, 'read'):
|
| 57 |
-
lo_text = lo_file.read().decode("utf-8", errors="ignore")
|
| 58 |
-
else:
|
| 59 |
-
lo_text = lo_file # already a string
|
| 60 |
-
|
| 61 |
-
los = [line.strip() for line in lo_text.splitlines() if line.strip()]
|
| 62 |
-
if not los:
|
| 63 |
-
return summary + "\nβ οΈ No valid Learning Outcomes found.", None, None
|
| 64 |
-
|
| 65 |
-
old_scores = compare_with_los(old_text, los)
|
| 66 |
-
new_scores = compare_with_los(new_text, los)
|
| 67 |
-
score_diff = [round(new - old, 2) for old, new in zip(old_scores, new_scores)]
|
| 68 |
|
|
|
|
| 69 |
df = pd.DataFrame({
|
| 70 |
"Learning Outcome": los,
|
| 71 |
-
"Old Match (%)": old_scores,
|
| 72 |
-
"New Match (%)": new_scores,
|
| 73 |
-
"Change (%)":
|
| 74 |
})
|
| 75 |
-
table_html = df.to_html(index=False)
|
| 76 |
|
| 77 |
-
#
|
| 78 |
-
fig, ax = plt.subplots(figsize=(10,
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
ax.bar(
|
| 82 |
-
ax.bar(
|
| 83 |
-
ax.set_xlabel('Learning Outcomes')
|
| 84 |
ax.set_ylabel('Match Score (%)')
|
| 85 |
-
ax.set_title('LO-wise Semantic Match')
|
| 86 |
-
ax.set_xticks(
|
| 87 |
-
ax.set_xticklabels(
|
| 88 |
ax.legend()
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
-
return
|
| 92 |
|
|
|
|
| 93 |
iface = gr.Interface(
|
| 94 |
fn=compare_all,
|
| 95 |
inputs=[
|
| 96 |
-
gr.File(label="Old Handout
|
| 97 |
-
gr.File(label="New Handout
|
| 98 |
-
gr.File(label="Learning Outcomes (
|
| 99 |
],
|
| 100 |
outputs=[
|
| 101 |
-
gr.Textbox(label="Summary"),
|
| 102 |
-
gr.
|
| 103 |
-
gr.
|
| 104 |
],
|
| 105 |
-
title="π
|
| 106 |
-
description="
|
| 107 |
)
|
| 108 |
|
| 109 |
-
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
from PyPDF2 import PdfReader
|
| 3 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 4 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 5 |
+
from transformers import pipeline
|
| 6 |
import matplotlib.pyplot as plt
|
| 7 |
import pandas as pd
|
| 8 |
import numpy as np
|
| 9 |
+
import io
|
| 10 |
|
| 11 |
+
# Helper to extract text from PDF
def extract_text_from_pdf(file):
    """Extract the concatenated text of every page of a PDF.

    Parameters
    ----------
    file : file-like object or path accepted by ``PyPDF2.PdfReader``.

    Returns
    -------
    str or None
        The stripped text of all pages, or ``None`` when the PDF
        cannot be read at all.
    """
    try:
        reader = PdfReader(file)
        text = ""
        for page in reader.pages:
            # extract_text() may return None (e.g. image-only pages).
            text += page.extract_text() or ""
        return text.strip()
    except Exception:
        # Was a bare `except:`, which also swallows KeyboardInterrupt
        # and SystemExit; Exception is wide enough for any PyPDF2
        # parsing failure while letting interpreter exits propagate.
        return None
|
| 21 |
|
| 22 |
+
# Semantic match for each LO
def semantic_match(lo_list, content):
    """Score how well each learning outcome matches the document text.

    Fits a TF-IDF vectorizer over the document plus every outcome, then
    returns the cosine similarity between the document vector and each
    outcome vector, in the same order as ``lo_list``.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([content] + lo_list)
    dense = tfidf_matrix.toarray()
    doc_vec, outcome_vecs = dense[0], dense[1:]
    sims = cosine_similarity([doc_vec], outcome_vecs)[0]
    return sims.tolist()
|
| 30 |
|
| 31 |
+
# Summarization pipeline, built once at import time so compare_all()
# can reuse it on every request (distilBART, a distilled news
# summarization model).
# NOTE(review): this downloads the model on first run — confirm the
# Space has it cached, otherwise startup will be slow.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
+
# Generate summary insight and visualization
def compare_all(old_pdf, new_pdf, lo_file):
    """Compare two syllabus PDFs against a list of learning outcomes.

    Parameters
    ----------
    old_pdf, new_pdf : file objects / paths accepted by ``PdfReader``.
    lo_file : uploaded learning-outcomes file; may be a binary
        file-like object or a plain string depending on the gradio
        File component configuration — both are handled.

    Returns
    -------
    tuple
        ``(summary_text, df, fig)`` on success, or
        ``(error_message, None, None)`` when extraction fails or no
        valid learning outcomes are found.
    """
    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)

    if not old_text or not new_text:
        return "β Could not extract text from one or both PDFs.", None, None

    # gradio may deliver the upload as a binary file object or as a
    # string, depending on the File component's `type`; handle both.
    if hasattr(lo_file, "read"):
        lo_text = lo_file.read().decode("utf-8", errors="ignore")
    else:
        lo_text = lo_file
    los = [lo.strip() for lo in lo_text.splitlines() if lo.strip()]
    if not los:
        # Guard: semantic_match on an empty list would crash downstream.
        return "No valid Learning Outcomes found in the uploaded file.", None, None

    # Semantic matching scores (0..1 per LO). Deltas are kept unrounded
    # here and rounded once at percent scale below — rounding to 2 dp on
    # the 0..1 scale first would quantize "Change (%)" to whole percents.
    old_scores = semantic_match(los, old_text)
    new_scores = semantic_match(los, new_text)
    changes = [new - old for old, new in zip(old_scores, new_scores)]

    # LO-wise comparison table
    df = pd.DataFrame({
        "Learning Outcome": los,
        "Old Match (%)": [round(score * 100, 2) for score in old_scores],
        "New Match (%)": [round(score * 100, 2) for score in new_scores],
        "Change (%)": [round(change * 100, 2) for change in changes],
    })

    # Grouped bar chart: old vs new match score per LO
    fig, ax = plt.subplots(figsize=(10, 5))
    x = np.arange(len(los))
    width = 0.35
    ax.bar(x - width / 2, df["Old Match (%)"], width, label='Old')
    ax.bar(x + width / 2, df["New Match (%)"], width, label='New')
    ax.set_ylabel('Match Score (%)')
    ax.set_title('LO-wise Semantic Match: Old vs New')
    ax.set_xticks(x)
    ax.set_xticklabels(los, rotation=45, ha='right')
    ax.legend()
    plt.tight_layout()

    # Whole-document similarity (TF-IDF cosine), plus an abstractive
    # summary of the new handout truncated to 2000 chars to stay within
    # the summarization model's input limit.
    content_vectorizer = TfidfVectorizer().fit_transform([old_text, new_text])
    content_sim = cosine_similarity(content_vectorizer)[0, 1]
    summary = summarizer(new_text[:2000], max_length=100, min_length=30, do_sample=False)[0]['summary_text']

    summary_text = (
        f"π Content Change Score: {round((1 - content_sim) * 100, 2)}%\n"
        f"π― Learning Outcomes Analyzed: {len(los)}\n"
        f"π’ Summary of New Content: {summary}"
    )

    return summary_text, df, fig
|
| 83 |
|
| 84 |
+
# Interface
# Gradio wiring: three file uploads in, (text, table, plot) out — the
# output component order matches compare_all's return tuple.
iface = gr.Interface(
    fn=compare_all,
    inputs=[
        gr.File(label="Old Handout PDF"),
        gr.File(label="New Handout PDF"),
        # NOTE(review): type='file' is deprecated/removed in newer
        # gradio releases (use 'filepath' or 'binary') — confirm the
        # pinned gradio version before changing, since compare_all
        # reads this upload with .read().
        gr.File(label="Learning Outcomes (Text File)", type='file'),
    ],
    outputs=[
        gr.Textbox(label="Summary & Insights"),
        gr.Dataframe(label="LO-wise Comparison Table"),
        gr.Plot(label="Visual Comparison Chart")
    ],
    title="π Syllabus Comparator with Learning Outcome Evaluation",
    description="Upload two syllabus handouts (old and new) and a file containing learning outcomes. Get LO-wise comparison, visual chart, and overall content insight."
)
|
| 100 |
|
| 101 |
+
# Launch the Gradio app only when run as a script (not when imported).
if __name__ == "__main__":
    iface.launch()
|