"""Gradio app: compare an old vs. new course handout (PDFs) against a list of
learning outcomes (LOs) using TF-IDF cosine similarity, and report per-LO
alignment, overall content change, and word-count drift."""

import io
import re

import gradio as gr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of *pdf_file* (raw PDF bytes).

    Returns an empty string if the PDF cannot be parsed or yields no text
    (e.g. a scanned/image-only document).
    """
    try:
        reader = PdfReader(io.BytesIO(pdf_file))
        full_text = ""
        for page in reader.pages:
            text = page.extract_text()
            # extract_text() may return None/"" for image-only pages.
            if text:
                full_text += text
        return full_text.strip()
    except Exception as e:  # PyPDF2 raises assorted parse errors on bad PDFs.
        print("PDF extraction error:", e)
        return ""


def semantic_match(lo_list, content):
    """Score each learning outcome in *lo_list* against *content*.

    Fits a TF-IDF model over the content plus all non-blank LOs, then returns
    one cosine-similarity score (0..1) per LO, in input order.
    """
    lo_texts = [lo for lo in lo_list if lo.strip()]
    if not lo_texts:
        return []
    # One vectorized similarity call on the sparse matrix instead of a
    # per-LO Python loop over densified rows.
    tfidf_matrix = TfidfVectorizer().fit_transform([content] + lo_texts)
    scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]
    return list(scores)


def compare_all(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs against a learning-outcomes text file.

    Args:
        old_pdf: raw bytes of the old handout PDF.
        new_pdf: raw bytes of the new handout PDF.
        lo_file: raw bytes of a UTF-8 text file, one learning outcome per line.

    Returns:
        (summary_markdown, dataframe, matplotlib_figure, status_message); on
        failure the first element is an error string and the rest are None.
    """
    try:
        los = lo_file.decode("utf-8", errors="ignore").splitlines()
        los = [lo.strip() for lo in los if lo.strip()]
    except Exception:
        return "❌ Could not read learning outcomes file.", None, None, None

    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)
    if not old_text or not new_text:
        return "❌ Could not extract text from one or both PDFs.", None, None, None

    old_scores = semantic_match(los, old_text)
    new_scores = semantic_match(los, new_text)
    labels = [f"LO{i+1}" for i in range(len(los))]
    x = np.arange(len(labels))

    # Grouped bar chart: old vs. new match score per learning outcome.
    fig, ax = plt.subplots()
    ax.bar(x - 0.2, old_scores, width=0.4, label="Old", align='center')
    ax.bar(x + 0.2, new_scores, width=0.4, label="New", align='center')
    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=45)
    ax.set_ylabel("Semantic Match Score")
    ax.set_title("Learning Outcomes Comparison")
    ax.legend()

    # Per-LO comparison table (percentages).
    data = {
        "Learning Outcome": labels,
        "Old Match (%)": [round(s * 100, 2) for s in old_scores],
        "New Match (%)": [round(s * 100, 2) for s in new_scores],
        "Change (%)": [round((new - old) * 100, 2)
                       for new, old in zip(new_scores, old_scores)],
    }
    df = pd.DataFrame(data)

    # Overall content similarity between the two handouts.
    tfidf = TfidfVectorizer().fit_transform([old_text, new_text])
    cosine_sim = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0] * 100
    content_diff = 100 - round(cosine_sim, 2)

    # Word-count drift; guard against a punctuation-only old handout
    # (non-empty text but zero \w+ tokens) to avoid ZeroDivisionError.
    len_old = len(re.findall(r'\w+', old_text))
    len_new = len(re.findall(r'\w+', new_text))
    if len_old:
        word_change_percent = round(((len_new - len_old) / len_old) * 100, 2)
    else:
        word_change_percent = 0.0

    summary = f"""
📘 **Summary of Comparison**

📈 **Overall Content Change**: {content_diff:.2f}%
🔍 This is based on TF-IDF cosine similarity between old and new handouts.

📝 **Text Length Difference**: {'+' if word_change_percent >= 0 else ''}{word_change_percent:.2f}%
Compared by total number of words in both handouts.

đŸŽ¯ **Learning Outcome Matches**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}
✅ New handout appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with stated outcomes.
"""
    return summary.strip(), df, fig, "✅ Comparison completed successfully."


iface = gr.Interface(
    fn=compare_all,
    inputs=[
        gr.File(label="Old Handout PDF", type='binary'),
        gr.File(label="New Handout PDF", type='binary'),
        gr.File(label="Learning Outcomes (Text File)", type='binary'),
    ],
    outputs=[
        gr.Textbox(label="📘 Summary & Insights", lines=20, max_lines=25),
        gr.Dataframe(label="📊 LO-wise Comparison Table"),
        gr.Plot(label="📈 LO Visual Comparison"),
        gr.Textbox(label="â„šī¸ Status", lines=1)
    ],
    title="📘 Handout Comparator + LO Analyzer",
    description="Upload OLD and NEW handouts in PDF format along with a TXT file of Learning Outcomes. The app compares content changes and evaluates alignment with LOs visually and in table format."
)

iface.launch()