# NOTE(review): the lines "Spaces: / Sleeping / Sleeping" were Hugging Face
# Space page-status residue from scraping, not code; preserved here as a comment.
| import gradio as gr | |
| from PyPDF2 import PdfReader | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from sentence_transformers import SentenceTransformer, util | |
| import matplotlib.pyplot as plt | |
| import pandas as pd | |
| import io | |
# Sentence-transformer checkpoint used for every semantic-similarity score
# in this app (document-vs-document and LO-vs-document comparisons).
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
def extract_text_from_pdf(file_bytes):
    """Return the concatenated text of every page of a PDF given as raw bytes.

    Pages with no extractable text contribute an empty string. Best-effort:
    any parsing/extraction failure is printed and an empty string is returned
    instead of raising.
    """
    try:
        document = PdfReader(io.BytesIO(file_bytes))
        page_texts = (page.extract_text() or "" for page in document.pages)
        return " ".join(page_texts).strip()
    except Exception as e:
        print("Error extracting text:", e)
        return ""
def tfidf_similarity(text1, text2):
    """Lexical similarity of two texts: cosine of their TF-IDF vectors.

    Returns a float in [0, 1]; 1 means identical term distributions.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    first_row = tfidf_matrix[0:1]
    second_row = tfidf_matrix[1:2]
    return cosine_similarity(first_row, second_row)[0][0]
def transformer_similarity(text1, text2):
    """Semantic similarity of two texts via sentence-transformer embeddings.

    Encodes both texts with the module-level `model` and returns the cosine
    similarity of the two embeddings as a Python float.
    """
    emb_a, emb_b = model.encode([text1, text2], convert_to_tensor=True)
    return util.pytorch_cos_sim(emb_a, emb_b).item()
def bloom_level(term):
    """Classify a learning-outcome phrase into a Bloom's taxonomy level.

    The phrase is lower-cased and scanned against level keyword lists in
    fixed order from lowest (Remember) to highest (Create); the first level
    with any keyword occurring as a substring wins.

    Args:
        term: Learning-outcome text to classify.

    Returns:
        The capitalized level name, or "Unknown" when no keyword matches.
    """
    lowered = term.lower()
    # Order matters: earlier (lower) levels take precedence on multiple hits.
    taxonomy = (
        ("remember", ("define", "list", "recall", "identify")),
        ("understand", ("explain", "describe", "summarize")),
        ("apply", ("apply", "demonstrate", "use")),
        ("analyze", ("analyze", "compare", "contrast")),
        ("evaluate", ("evaluate", "judge", "critique")),
        ("create", ("create", "design", "formulate")),
    )
    return next(
        (
            level.capitalize()
            for level, keywords in taxonomy
            if any(keyword in lowered for keyword in keywords)
        ),
        "Unknown",
    )
def lo_semantic_scores(los, content):
    """Score each learning outcome against *content*.

    Args:
        los: Iterable of learning-outcome strings.
        content: Document text to compare against.

    Returns:
        List of transformer cosine-similarity scores, one per outcome,
        in the same order as *los*.
    """
    return [transformer_similarity(outcome, content) for outcome in los]
def compare_all(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs and align both against learning outcomes.

    Args:
        old_pdf: Raw bytes of the previous handout PDF.
        new_pdf: Raw bytes of the updated handout PDF.
        lo_file: Learning outcomes, one per line (file-like object or bytes).

    Returns:
        (summary_markdown, comparison_dataframe, bar_chart_figure, preview)
        on success, or an error-message string plus three Nones when an
        input cannot be read.
    """
    # Accept either a file-like object or raw bytes for the LO list.
    try:
        raw = lo_file.read() if hasattr(lo_file, "read") else lo_file
        lo_content = raw.decode("utf-8", errors="ignore")
        los = [line.strip() for line in lo_content.splitlines() if line.strip()]
    except Exception:
        return "β Could not read learning outcomes file.", None, None, None

    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)
    if not (old_text and new_text):
        return "β Could not extract text from one or both PDFs.", None, None, None

    # Whole-document comparisons: lexical (TF-IDF), semantic, and raw length.
    tfidf_sim = tfidf_similarity(old_text, new_text)
    transformer_sim = transformer_similarity(old_text, new_text)
    text_growth = round(((len(new_text) - len(old_text)) / len(old_text)) * 100, 2)

    # Per-outcome semantic match against each document version.
    old_scores = lo_semantic_scores(los, old_text)
    new_scores = lo_semantic_scores(los, new_text)
    labels = [f"LO{i+1}" for i in range(len(los))]

    # Grouped bar chart: old vs new match score for every learning outcome.
    positions = list(range(len(labels)))
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(positions, old_scores, width=0.4, label="Old", align='center')
    ax.bar([p + 0.4 for p in positions], new_scores, width=0.4, label="New", align='center')
    ax.set_xticks([p + 0.2 for p in positions])
    ax.set_xticklabels(labels, rotation=45)
    ax.set_ylabel("Semantic Match Score")
    ax.set_title("Learning Outcomes Comparison")
    ax.legend()

    df = pd.DataFrame({
        "Learning Outcome": labels,
        "LO Text": los,
        "Bloom Level": [bloom_level(lo) for lo in los],
        "Old Match": [round(s*100, 2) for s in old_scores],
        "New Match": [round(s*100, 2) for s in new_scores],
        "Change (%)": [round((n - o)*100, 2) for n, o in zip(new_scores, old_scores)],
    })

    summary = f"""π **Summary of Comparison**
π **TF-IDF Content Change**: {round((1 - tfidf_sim) * 100, 2)}%
π§ **Transformer-based Similarity**: {round(transformer_sim * 100, 2)}%
π **Content Length Change**: {text_growth}% {"π Reduced" if text_growth < 0 else "π Increased"}
π― **LO Matches**: {sum(1 for score in new_scores if score > 0.5)} of {len(los)}
π **Content appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with learning outcomes.**
"""
    return summary, df, fig, new_text[:2000] + "..."
# Gradio UI wiring: three binary file inputs (two handout PDFs and a TXT of
# learning outcomes) mapped onto the four outputs of compare_all.
# Fix: removed the redundant `import gradio as gr` that re-imported a module
# already imported at the top of the file.
iface = gr.Interface(
    fn=compare_all,
    inputs=[
        gr.File(label="Old Handout PDF", type='binary'),
        gr.File(label="New Handout PDF", type='binary'),
        gr.File(label="Learning Outcomes (TXT)", type='binary'),
    ],
    outputs=[
        gr.Markdown(label="π Summary"),
        gr.Dataframe(label="π LO-wise Comparison Table"),
        gr.Plot(label="π LO Match Chart"),
        gr.Textbox(label="π Preview of New Content"),
    ],
    title="π AI Handout Comparator + LO Aligner",
    description="Compare two versions of handouts using both TF-IDF and Transformers. Analyze changes in content, alignment with Learning Outcomes, and Bloomβs taxonomy level."
)
iface.launch()