Spaces:
Sleeping
Sleeping
File size: 4,144 Bytes
28e421a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
import gradio as gr
import fitz # PyMuPDF
import docx
import io
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
import numpy as np
from difflib import SequenceMatcher
# Sentence-embedding model used for LO-vs-content cosine similarity.
# Loaded once at module import so every request reuses the same weights.
model = SentenceTransformer('all-MiniLM-L6-v2')
def extract_text_from_pdf(pdf_file):
    """Extract plain text from a PDF supplied as raw bytes.

    Parameters:
        pdf_file: bytes of the PDF (Gradio ``type="binary"`` upload).

    Returns:
        The concatenated text of all pages, stripped, or "" when the PDF
        cannot be opened or read. Failure is deliberately silent:
        compare_handouts() checks the returned length instead of
        handling exceptions.
    """
    try:
        # Context manager guarantees the document is closed even when a
        # page fails to render (the original leaked the handle on error).
        with fitz.open(stream=pdf_file, filetype="pdf") as pdf_reader:
            # join() builds the text in one pass instead of quadratic +=.
            text = "".join(page.get_text() for page in pdf_reader)
        return text.strip()
    except Exception:
        # Best-effort by design: "" signals "no usable content".
        return ""
def normalize_text(text):
    """Lowercase *text*, trim it, and collapse every whitespace run to one space."""
    trimmed = text.strip().lower()
    return re.sub(r'\s+', ' ', trimmed)
def extract_text_from_docx(docx_file):
    """Return the non-empty paragraphs of a .docx file as a list of strings.

    Parameters:
        docx_file: bytes of the document (Gradio ``type="binary"`` upload).

    Returns:
        A list of stripped paragraph texts; [] when the file cannot be
        parsed (best-effort: callers treat [] as "no learning outcomes").
    """
    try:
        doc = docx.Document(io.BytesIO(docx_file))
        return [para.text.strip() for para in doc.paragraphs if para.text.strip()]
    except Exception:
        # Narrowed from a bare `except:`, which also trapped
        # KeyboardInterrupt/SystemExit; Exception covers any parse failure.
        return []
def semantic_match(lo_list, content):
    """Score each learning outcome against *content* by embedding similarity.

    Parameters:
        lo_list: list of learning-outcome strings.
        content: document text to compare each outcome against.

    Returns:
        A list of cosine-similarity floats rounded to 2 decimals, one per
        outcome; 0.0 stands in for any outcome that failed to encode, and
        the whole list is zeros if *content* itself cannot be encoded.
    """
    # Hoist the (possibly long) content embedding out of the loop — the
    # original re-encoded the full document once per learning outcome.
    try:
        content_embed = model.encode(content, convert_to_tensor=True)
    except Exception:
        # Matches the original failure behavior: every outcome scores 0.0.
        return [0.0] * len(lo_list)
    scores = []
    for lo in lo_list:
        try:
            lo_embed = model.encode(lo, convert_to_tensor=True)
            sim = util.pytorch_cos_sim(lo_embed, content_embed).item()
            scores.append(round(sim, 2))
        except Exception:
            scores.append(0.0)
    return scores
def content_change_score(text1, text2):
    """Estimate how much the text changed between two documents, in percent.

    Uses difflib's SequenceMatcher on whitespace/case-normalized text:
    0.0 means identical, 100.0 means completely different — or that the
    comparison itself failed.
    """
    try:
        sim = SequenceMatcher(None, normalize_text(text1), normalize_text(text2)).ratio()
        return round((1 - sim) * 100, 2)
    except Exception:
        # Narrowed from a bare `except:`; treat any failure as "fully
        # changed" so the UI still renders a summary.
        return 100.0
def compare_handouts(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs against a .docx list of learning outcomes.

    Parameters:
        old_pdf: bytes of the old handout PDF.
        new_pdf: bytes of the new handout PDF.
        lo_file: bytes of a .docx whose paragraphs are learning outcomes.

    Returns:
        (summary_text, figure) — the figure is None when validation fails.
    """
    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)
    # Under ~200 chars usually means extraction failed (e.g. scanned PDF).
    if len(old_text.strip()) < 200 or len(new_text.strip()) < 200:
        return "β οΈ Could not extract meaningful content from one or both PDFs.", None
    lo_list = extract_text_from_docx(lo_file)
    if not lo_list:
        return "β οΈ No learning outcomes detected.", None
    old_scores = semantic_match(lo_list, old_text)
    new_scores = semantic_match(lo_list, new_text)
    change_percent = content_change_score(old_text, new_text)
    # Generator expressions: no throwaway lists inside sum().
    improved_count = sum(n > o for n, o in zip(new_scores, old_scores))
    # ">=" counts outcomes that held steady as well as ones that improved.
    matched_los = sum(n >= o for n, o in zip(new_scores, old_scores))
    summary = f"π Content Change Estimate: {change_percent}%\n"
    summary += f"π§ LO Alignment: {matched_los} of {len(lo_list)} learning outcomes matched\n"
    if improved_count > 0:
        summary += "π’ Summary: New handout has improved structure and added clarity."
    else:
        summary += "β οΈ Summary: No significant improvement in LO alignment."
    # Grouped bar chart: one pair of bars per learning outcome.
    x = np.arange(len(lo_list))
    width = 0.35
    fig, ax = plt.subplots()
    ax.bar(x - width/2, old_scores, width, label='Old')
    ax.bar(x + width/2, new_scores, width, label='New')
    ax.set_ylabel('Match Score (0-1)')
    ax.set_title('LO-wise Match Score: Old vs New')
    ax.set_xticks(x)
    ax.set_xticklabels([f"LO{i+1}" for i in range(len(lo_list))], rotation=45)
    ax.legend()
    # Operate on this figure explicitly instead of pyplot's "current"
    # global, and drop it from pyplot's registry afterwards so repeated
    # requests in the long-running Gradio server do not leak figures
    # (the returned object is still renderable by gr.Plot).
    fig.tight_layout()
    plt.close(fig)
    return summary, fig
def _reset_outputs():
    """Clear-button callback: blank the summary and remove the chart."""
    return "", None


# ---- Gradio UI wiring ----
with gr.Blocks() as demo:
    gr.Markdown("π **Educational Content Comparator**")
    gr.Markdown("Upload 2 handouts and a .docx file of Learning Outcomes to compare changes and alignment.")

    with gr.Row():
        old_pdf_in = gr.File(label="π Upload Old PDF", file_types=[".pdf"], type="binary")
        new_pdf_in = gr.File(label="π Upload New PDF", file_types=[".pdf"], type="binary")
    lo_in = gr.File(label="π Upload Learning Outcomes (.docx)", file_types=[".docx"], type="binary")

    with gr.Row():
        submit_btn = gr.Button("Submit")
        reset_btn = gr.Button("Clear")

    summary_box = gr.Textbox(label="π Summary", lines=5, interactive=False)
    chart = gr.Plot(label="π LO Match Chart")

    submit_btn.click(
        fn=compare_handouts,
        inputs=[old_pdf_in, new_pdf_in, lo_in],
        outputs=[summary_box, chart],
    )
    reset_btn.click(fn=_reset_outputs, inputs=[], outputs=[summary_box, chart])

demo.launch()