Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import fitz # PyMuPDF | |
| import docx | |
| import io | |
| from sklearn.feature_extraction.text import TfidfVectorizer | |
| from sentence_transformers import SentenceTransformer, util | |
| import matplotlib.pyplot as plt | |
| import numpy as np | |
# Sentence-embedding model shared by every request; constructed once at import
# time so the checkpoint is not reloaded on each comparison.
_EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model = SentenceTransformer(_EMBEDDING_MODEL_NAME)
def extract_text_from_pdf(pdf_file):
    """Return the text of every page of a PDF, concatenated and stripped.

    Args:
        pdf_file: Raw PDF bytes, handed to PyMuPDF as an in-memory stream.

    Returns:
        The page texts joined in page order with surrounding whitespace
        removed, or "" when nothing was supplied or the document could not
        be opened or read.
    """
    # Nothing uploaded (None) or an empty upload: no point invoking the parser.
    if not pdf_file:
        return ""
    try:
        # The context manager closes the document even when a page fails to
        # extract; the previous explicit close() was skipped on any error.
        with fitz.open(stream=pdf_file, filetype="pdf") as pdf_reader:
            return "".join(page.get_text() for page in pdf_reader).strip()
    except Exception:
        # Deliberate best-effort: the caller treats "" as "could not extract".
        return ""
def extract_text_from_docx(docx_file):
    """Return the non-empty paragraphs of a .docx file as a list of strings.

    Args:
        docx_file: Raw .docx bytes; wrapped in a BytesIO before parsing.

    Returns:
        One stripped string per paragraph that contains non-whitespace text,
        in document order. An empty list is returned when nothing was
        supplied or the document could not be parsed.
    """
    # Nothing uploaded (None) or an empty upload: report "no paragraphs".
    if not docx_file:
        return []
    try:
        doc = docx.Document(io.BytesIO(docx_file))
        stripped = (para.text.strip() for para in doc.paragraphs)
        return [text for text in stripped if text]
    except Exception:
        # Deliberate best-effort, but no longer a bare ``except:`` so that
        # KeyboardInterrupt / SystemExit are not swallowed.
        return []
def tfidf_similarity(text1, text2):
    """Return the TF-IDF similarity of two texts, rounded to two decimals.

    Both texts are vectorised together with a fresh ``TfidfVectorizer`` and
    the score is the dot product of their two rows.
    NOTE(review): with the vectoriser's default L2 normalisation this dot
    product is the cosine similarity - confirm if the defaults are changed.

    Args:
        text1: First document.
        text2: Second document.

    Returns:
        The similarity as a float rounded to two decimals, or 0.0 when either
        text is empty or vectorisation fails (e.g. no usable vocabulary).
    """
    # An empty document shares no terms with anything.
    if not text1 or not text2:
        return 0.0
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
        # ``.toarray()`` replaces the ``.A`` shorthand, which recent SciPy
        # releases removed; the resulting AttributeError used to be swallowed
        # below and turned every comparison into 0.0. ``@`` is unambiguous
        # matrix multiplication for both sparse matrices and sparse arrays.
        similarity = (tfidf_matrix @ tfidf_matrix.T).toarray()[0, 1]
    except Exception:
        return 0.0
    return round(float(similarity), 2)
def semantic_match(lo_list, content):
    """Score how closely each learning outcome matches a body of content.

    Each outcome and the content are embedded with the module-level ``model``
    and compared with ``util.pytorch_cos_sim``.
    NOTE(review): the embedding model may truncate long inputs, in which case
    only the beginning of ``content`` influences the scores - confirm against
    the model's maximum sequence length.

    Args:
        lo_list: Learning-outcome strings.
        content: The text every outcome is compared against.

    Returns:
        One similarity per outcome, rounded to two decimals and in the same
        order as ``lo_list``. An outcome whose scoring fails gets 0.0.
    """
    if not lo_list:
        return []
    # The content embedding does not depend on the outcome, so compute it once
    # instead of re-encoding the whole document on every loop iteration.
    try:
        content_embed = model.encode(content, convert_to_tensor=True)
    except Exception:
        # Without a content embedding no outcome can be scored.
        return [0.0] * len(lo_list)
    scores = []
    for lo in lo_list:
        try:
            lo_embed = model.encode(lo, convert_to_tensor=True)
            sim = util.pytorch_cos_sim(lo_embed, content_embed).item()
            scores.append(round(sim, 2))
        except Exception:
            # Keep one entry per outcome so scores stay aligned with lo_list.
            scores.append(0.0)
    return scores
def _plot_lo_scores(old_scores, new_scores):
    """Build a grouped bar chart of per-outcome scores for both handouts."""
    x = np.arange(len(old_scores))
    width = 0.35
    fig, ax = plt.subplots()
    ax.bar(x - width / 2, old_scores, width, label='Old')
    ax.bar(x + width / 2, new_scores, width, label='New')
    ax.set_ylabel('Match Score (0-1)')
    ax.set_title('LO-wise Match Score: Old vs New')
    ax.set_xticks(x)
    ax.set_xticklabels([f"LO{i + 1}" for i in range(len(old_scores))], rotation=45)
    ax.legend()
    # Lay out this specific figure rather than pyplot's global "current" one.
    fig.tight_layout()
    # Unregister the figure from pyplot so figures do not pile up in a
    # long-running server; the Figure object itself stays usable.
    plt.close(fig)
    return fig


def compare_handouts(old_pdf, new_pdf, lo_file):
    """Compare two PDF handouts against a list of learning outcomes.

    Args:
        old_pdf: Raw bytes of the previous handout (PDF).
        new_pdf: Raw bytes of the revised handout (PDF).
        lo_file: Raw bytes of a .docx file whose non-empty paragraphs are the
            learning outcomes.

    Returns:
        A ``(summary, figure)`` tuple. ``summary`` reports the content change
        (100 * (1 - TF-IDF similarity)), the number of outcomes whose score
        for the new handout is at least the score for the old one, and a
        verdict that depends on whether any outcome scored strictly higher.
        ``figure`` is a bar chart of per-outcome scores. When the PDFs yield
        no text or no outcomes are found, the tuple is an error message and
        ``None``.
    """
    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)
    if not old_text or not new_text:
        return "β Could not extract text from one or both PDFs.", None

    lo_list = extract_text_from_docx(lo_file)
    if not lo_list:
        return "β οΈ No learning outcomes detected.", None

    old_scores = semantic_match(lo_list, old_text)
    new_scores = semantic_match(lo_list, new_text)
    # Despite the name this is a similarity in [0, 1]; the change is 1 - value.
    change_percent = tfidf_similarity(old_text, new_text)

    score_pairs = list(zip(new_scores, old_scores))
    improved_count = sum(n > o for n, o in score_pairs)
    matched_count = sum(n >= o for n, o in score_pairs)

    summary = f"π Content Change: {round((1 - change_percent) * 100, 2)}%\n"
    summary += f"π― Matched LOs: {matched_count} of {len(lo_list)}\n"
    summary += "π’ New content appears more detailed and informative." if improved_count > 0 else "β οΈ No significant improvement detected."

    return summary, _plot_lo_scores(old_scores, new_scores)
# Gradio UI: three uploads, Submit/Clear buttons, and the summary + chart
# outputs produced by compare_handouts.
with gr.Blocks() as demo:
    gr.Markdown("π **Educational Content Comparator**")
    gr.Markdown("Upload 2 handouts and a .docx file of Learning Outcomes to compare changes and alignment.")
    with gr.Row():
        # type="binary" hands the callbacks raw bytes rather than file paths.
        old_pdf = gr.File(label="π Upload Old PDF", file_types=[".pdf"], type="binary")
        new_pdf = gr.File(label="π Upload New PDF", file_types=[".pdf"], type="binary")
        lo_file = gr.File(label="π Upload Learning Outcomes (.docx)", file_types=[".docx"], type="binary")
    with gr.Row():
        btn = gr.Button("Submit")
        clear_btn = gr.Button("Clear")
    output_text = gr.Textbox(label="π Summary", lines=5, interactive=False)
    output_plot = gr.Plot(label="π LO Match Chart")
    btn.click(fn=compare_handouts, inputs=[old_pdf, new_pdf, lo_file], outputs=[output_text, output_plot])
    # Clear resets only the two outputs; uploaded files are left in place.
    clear_btn.click(fn=lambda: ("", None), inputs=[], outputs=[output_text, output_plot])

# Start the server only when run as a script, so importing this module
# (e.g. from tests) does not launch the app.
if __name__ == "__main__":
    demo.launch()