"""Gradio app comparing an old and a new handout PDF against learning outcomes.

The app extracts text from both PDFs, scores every learning outcome (LO)
against each handout with TF-IDF cosine similarity, and reports overall
document similarity using both TF-IDF and mean-pooled transformer embeddings.
"""
import io

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline

# Only the first MAX_EMBED_CHARS characters of each document are embedded by
# transformer_similarity; the rest of the text does not influence that score.
MAX_EMBED_CHARS = 512
# An LO counts as "matched" when its score against the new handout reaches this.
LO_MATCH_THRESHOLD = 0.5
# Width of each bar in the grouped old/new bar chart.
BAR_WIDTH = 0.4

# Loaded once at import time so every request reuses the same model.
semantic_pipeline = pipeline(
    "feature-extraction",
    model="sentence-transformers/all-MiniLM-L6-v2",
)


def extract_text_from_pdf(pdf_file):
    """Return the text of a PDF supplied as raw bytes.

    Args:
        pdf_file: The PDF content as bytes (as delivered by
            ``gr.File(type='binary')``); ``None`` or empty input is accepted.

    Returns:
        The extracted text of all pages joined by newlines and stripped, or
        ``""`` when nothing was uploaded or the PDF cannot be parsed.
    """
    if not pdf_file:
        return ""
    try:
        reader = PdfReader(io.BytesIO(pdf_file))
        page_texts = (page.extract_text() for page in reader.pages)
        # Join with a newline so the last word of one page does not fuse with
        # the first word of the next (which would distort TF-IDF tokens).
        return "\n".join(text for text in page_texts if text).strip()
    except Exception as e:  # best-effort boundary: any parse failure -> ""
        print("Error reading PDF:", e)
        return ""


def tfidf_similarity(text1, text2):
    """Return the TF-IDF cosine similarity of two texts.

    Returns ``0.0`` when neither text yields any TF-IDF vocabulary (for
    example when both contain only stop words or single characters).
    """
    try:
        matrix = TfidfVectorizer().fit_transform([text1, text2])
    except ValueError:
        # scikit-learn raises ValueError on an empty vocabulary.
        return 0.0
    return cosine_similarity(matrix[0:1], matrix[1:2])[0][0]


def _mean_pool(token_embeddings):
    """Average per-token embedding vectors into one vector."""
    return [sum(dim) / len(dim) for dim in zip(*token_embeddings)]


def transformer_similarity(text1, text2):
    """Return the cosine similarity of mean-pooled transformer embeddings.

    Only the first ``MAX_EMBED_CHARS`` characters of each text are embedded.
    """
    pooled1 = _mean_pool(semantic_pipeline(text1[:MAX_EMBED_CHARS])[0])
    pooled2 = _mean_pool(semantic_pipeline(text2[:MAX_EMBED_CHARS])[0])
    return cosine_similarity([pooled1], [pooled2])[0][0]


def semantic_match(lo_list, content):
    """Score each learning outcome against ``content`` with TF-IDF cosine.

    Args:
        lo_list: Learning-outcome strings.
        content: The document text to match the outcomes against.

    Returns:
        A list with one similarity score per entry of ``lo_list`` (empty when
        ``lo_list`` is empty; all zeros when no vocabulary can be built).
    """
    if not lo_list:
        return []
    try:
        matrix = TfidfVectorizer().fit_transform([content] + lo_list)
    except ValueError:
        # Empty vocabulary: nothing can match, so every score is zero.
        return [0.0] * len(lo_list)
    # Row 0 is the document; rows 1.. are the outcomes.
    return list(cosine_similarity(matrix[0:1], matrix[1:])[0])


def _parse_learning_outcomes(lo_file):
    """Return the non-blank, stripped lines of the LO file, or None if unreadable."""
    if not isinstance(lo_file, (bytes, bytearray)):
        return None
    text = bytes(lo_file).decode("utf-8", errors="ignore")
    return [line.strip() for line in text.splitlines() if line.strip()]


def _error_result(message):
    """Build a result tuple with one slot per Interface output component."""
    return message, None, None, None


def _build_chart(labels, old_scores, new_scores):
    """Draw the grouped old/new bar chart and return the matplotlib figure."""
    positions = range(len(labels))
    fig, ax = plt.subplots()
    ax.bar(positions, old_scores, width=BAR_WIDTH, label="Old", align="center")
    ax.bar(
        [p + BAR_WIDTH for p in positions],
        new_scores,
        width=BAR_WIDTH,
        label="New",
        align="center",
    )
    ax.set_xticks([p + BAR_WIDTH / 2 for p in positions])
    ax.set_xticklabels(labels, rotation=45)
    ax.set_ylabel("Semantic Match Score")
    ax.set_title("Learning Outcomes Comparison")
    ax.legend()
    # Deregister the figure from pyplot so figures do not pile up across
    # requests; the Figure object itself remains renderable by the caller.
    plt.close(fig)
    return fig


def _build_report(tfidf_sim, transformer_sim, text_growth, old_scores, new_scores):
    """Assemble the Markdown summary and method explanation."""
    matched = sum(1 for s in new_scores if s >= LO_MATCH_THRESHOLD)
    direction = "more" if text_growth >= 0 else "less"
    alignment = "more" if sum(new_scores) > sum(old_scores) else "less"
    summary = [
        "📘 **Summary of Comparison**",
        f"📈 **TF-IDF Content Similarity**: {round(tfidf_sim * 100, 2)}%",
        f"🤖 **Transformer-based Similarity**: {round(transformer_sim * 100, 2)}%",
        f"📝 **Text Growth**: {round(abs(text_growth), 2)}% {direction} content in new handout",
        f"🎯 **LOs Matched (New ≥ 0.5)**: {matched} of {len(new_scores)}",
        f"📊 **Insight**: New content appears {alignment} aligned with outcomes.",
    ]
    bullets = [
        "- **TF-IDF Similarity** checks how often important words appear in both documents. It gives a quick idea of textual overlap.",
        "- **Transformer Similarity** uses AI to understand meaning beyond words. It compares the 'sense' of the documents like a human would.",
    ]
    # Blank lines separate Markdown paragraphs; without them everything
    # renders as one run of text and the rule/bullets are not recognised.
    return (
        "\n\n".join(summary)
        + "\n\n---\n\n"
        + "📚 **Explanation of Methods**:\n\n"
        + "\n".join(bullets)
    )


def compare_all(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs against a learning-outcomes text file.

    Args:
        old_pdf: Bytes of the old handout PDF.
        new_pdf: Bytes of the new handout PDF.
        lo_file: Bytes of a UTF-8 text file with one learning outcome per line.

    Returns:
        A 4-tuple matching the Interface outputs: (Markdown report,
        per-LO DataFrame, matplotlib figure, full text of the new handout).
        On failure the first element is an error message and the rest are
        ``None``.
    """
    los = _parse_learning_outcomes(lo_file)
    if los is None:
        return _error_result("❌ Could not read learning outcomes file.")

    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)
    if not old_text or not new_text:
        return _error_result("❌ Could not extract text from one or both PDFs.")

    old_scores = semantic_match(los, old_text)
    new_scores = semantic_match(los, new_text)
    labels = [f"LO{i}" for i, _ in enumerate(los, start=1)]

    fig = _build_chart(labels, old_scores, new_scores)

    df = pd.DataFrame({
        "Learning Outcome": labels,
        "Old Match (%)": [round(s * 100, 2) for s in old_scores],
        "New Match (%)": [round(s * 100, 2) for s in new_scores],
        "Change (%)": [
            round((n - o) * 100, 2) for o, n in zip(old_scores, new_scores)
        ],
    })

    tfidf_sim = tfidf_similarity(old_text, new_text)
    transformer_sim = transformer_similarity(old_text, new_text)
    # old_text is guaranteed non-empty here, so the division is safe.
    text_growth = (len(new_text) - len(old_text)) / len(old_text) * 100

    report = _build_report(
        tfidf_sim, transformer_sim, text_growth, old_scores, new_scores
    )
    return report, df, fig, new_text


iface = gr.Interface(
    fn=compare_all,
    inputs=[
        gr.File(label="Old Handout PDF", type='binary'),
        gr.File(label="New Handout PDF", type='binary'),
        gr.File(label="Learning Outcomes (Text File)", type='binary'),
    ],
    outputs=[
        gr.Markdown(label="📘 Summary & Insights"),
        gr.Dataframe(label="📊 LO-wise Comparison Table"),
        gr.Plot(label="📈 Visual LO Change Chart"),
        gr.Textbox(label="📄 New Handout Preview (Full Text)", lines=20, interactive=False),
    ],
    title="📘 Handout Comparison + LO Semantic Analysis",
    description="Upload two handouts (old and new) and a text file of Learning Outcomes (LOs). This tool compares content using TF-IDF and Transformers, visualizes LO changes, and explains results in simple terms.",
)

# Launch only when run as a script so importing this module has no server
# side effect.
if __name__ == "__main__":
    iface.launch()