Spaces:
Sleeping
Sleeping
File size: 3,926 Bytes
89ad98d 65fcd1d 02949b2 ace889b 02949b2 c6ee9d5 02949b2 c6ee9d5 02949b2 c6ee9d5 02949b2 65fcd1d ace889b c6ee9d5 02949b2 ace889b 02949b2 c6ee9d5 65fcd1d 89ad98d c6ee9d5 02949b2 65fcd1d 02949b2 c6ee9d5 65fcd1d c6ee9d5 02949b2 5abbe67 65fcd1d 89ad98d 02949b2 89ad98d 00d9a8a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 |
import gradio as gr
import fitz # PyMuPDF
import docx
import io
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
import numpy as np
# Sentence-embedding model used for semantic LO-vs-content matching;
# loaded once at import time so every request reuses the same weights.
model = SentenceTransformer('all-MiniLM-L6-v2')
def extract_text_from_pdf(pdf_file):
    """Extract plain text from a PDF given as raw bytes.

    Args:
        pdf_file: Raw PDF file contents (bytes, as delivered by
            ``gr.File(type="binary")``).

    Returns:
        str: Concatenated text of all pages, stripped of surrounding
        whitespace; ``""`` if the PDF cannot be opened or parsed.
    """
    try:
        # Context manager guarantees the document is closed even if a
        # page raises mid-iteration (the original leaked it on error).
        with fitz.open(stream=pdf_file, filetype="pdf") as pdf_reader:
            # str.join instead of repeated += (quadratic in page count).
            text = "".join(page.get_text() for page in pdf_reader)
        return text.strip()
    except Exception:
        # Best-effort: callers treat "" as "could not read this PDF".
        return ""
def extract_text_from_docx(docx_file):
    """Extract non-empty paragraph texts from a .docx given as raw bytes.

    Args:
        docx_file: Raw .docx file contents (bytes).

    Returns:
        list[str]: One stripped string per non-blank paragraph (each is
        treated as a learning outcome by callers); ``[]`` if the document
        cannot be parsed.
    """
    try:
        doc = docx.Document(io.BytesIO(docx_file))
        # Comprehension replaces the manual append loop; blank paragraphs
        # are skipped.
        return [para.text.strip() for para in doc.paragraphs if para.text.strip()]
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are no longer swallowed; still best-effort for bad files.
        return []
def tfidf_similarity(text1, text2):
    """Cosine similarity of two texts under a shared TF-IDF vectorization.

    Args:
        text1: First document.
        text2: Second document.

    Returns:
        float: Similarity in [0, 1], rounded to 2 decimals; ``0.0`` when
        vectorization fails (e.g. both texts empty / stop-words only).
    """
    try:
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform([text1, text2])
        # TfidfVectorizer L2-normalizes rows, so the dot product of the two
        # rows IS the cosine similarity. Use `@` + .toarray() instead of the
        # `.A` attribute, which is deprecated/removed for sparse matrices in
        # recent SciPy releases.
        similarity = (tfidf_matrix @ tfidf_matrix.T).toarray()[0, 1]
        return round(float(similarity), 2)
    except Exception:
        # Narrowed from a bare `except:`; callers expect 0.0 on failure.
        return 0.0
def semantic_match(lo_list, content):
    """Score each learning outcome against `content` via embedding cosine similarity.

    Args:
        lo_list: Learning-outcome sentences (list[str]).
        content: Document text to match every LO against.

    Returns:
        list[float]: One score per LO (same order), rounded to 2 decimals;
        ``0.0`` for any LO whose scoring fails.
    """
    try:
        # The document embedding is loop-invariant and is by far the
        # expensive call — encode it once instead of once per LO (the
        # original re-encoded `content` on every iteration).
        content_embed = model.encode(content, convert_to_tensor=True)
    except Exception:
        return [0.0] * len(lo_list)
    scores = []
    for lo in lo_list:
        try:
            lo_embed = model.encode(lo, convert_to_tensor=True)
            sim = util.pytorch_cos_sim(lo_embed, content_embed).item()
            scores.append(round(sim, 2))
        except Exception:
            # Per-LO failures degrade to 0.0 rather than aborting the batch.
            scores.append(0.0)
    return scores
def compare_handouts(old_pdf, new_pdf, lo_file):
    """Compare an old and a new PDF handout against a set of learning outcomes.

    Args:
        old_pdf: Raw bytes of the old handout PDF.
        new_pdf: Raw bytes of the new handout PDF.
        lo_file: Raw bytes of a .docx whose paragraphs are the learning outcomes.

    Returns:
        tuple[str, matplotlib.figure.Figure | None]: A text summary and a
        grouped bar chart of per-LO match scores, or ``(message, None)``
        when extraction fails.
    """
    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)
    if not old_text or not new_text:
        return "β Could not extract text from one or both PDFs.", None
    lo_list = extract_text_from_docx(lo_file)
    if not lo_list:
        return "β οΈ No learning outcomes detected.", None

    old_scores = semantic_match(lo_list, old_text)
    new_scores = semantic_match(lo_list, new_text)
    # tfidf_similarity returns a SIMILARITY in [0, 1]; "content change" is
    # its complement. Renamed from the misleading `change_percent`.
    similarity = tfidf_similarity(old_text, new_text)

    # Generator expressions — no need to materialize intermediate lists.
    improved_count = sum(n > o for n, o in zip(new_scores, old_scores))
    matched_count = sum(n >= o for n, o in zip(new_scores, old_scores))

    summary = f"π Content Change: {round((1 - similarity) * 100, 2)}%\n"
    summary += f"π― Matched LOs: {matched_count} of {len(lo_list)}\n"
    summary += "π’ New content appears more detailed and informative." if improved_count > 0 else "β οΈ No significant improvement detected."

    # Grouped bar chart: per-LO match score, old vs new handout.
    x = np.arange(len(lo_list))
    width = 0.35
    fig, ax = plt.subplots()
    ax.bar(x - width / 2, old_scores, width, label='Old')
    ax.bar(x + width / 2, new_scores, width, label='New')
    ax.set_ylabel('Match Score (0-1)')
    ax.set_title('LO-wise Match Score: Old vs New')
    ax.set_xticks(x)
    ax.set_xticklabels([f"LO{i + 1}" for i in range(len(lo_list))], rotation=45)
    ax.legend()
    fig.tight_layout()
    # Detach the figure from pyplot's global registry so repeated requests
    # don't accumulate open figures (a memory leak in the original); the
    # returned fig object itself remains renderable by gr.Plot.
    plt.close(fig)
    return summary, fig
# --- Gradio UI: two PDF uploads + one LO .docx in; summary text + chart out ---
with gr.Blocks() as demo:
    gr.Markdown("π **Educational Content Comparator**")
    gr.Markdown("Upload 2 handouts and a .docx file of Learning Outcomes to compare changes and alignment.")

    # Input row: the three file pickers, all delivering raw bytes.
    with gr.Row():
        old_handout = gr.File(label="π Upload Old PDF", file_types=[".pdf"], type="binary")
        new_handout = gr.File(label="π Upload New PDF", file_types=[".pdf"], type="binary")
        outcomes_doc = gr.File(label="π Upload Learning Outcomes (.docx)", file_types=[".docx"], type="binary")

    # Action row.
    with gr.Row():
        submit_btn = gr.Button("Submit")
        reset_btn = gr.Button("Clear")

    # Outputs: read-only summary text plus the per-LO score chart.
    summary_box = gr.Textbox(label="π Summary", lines=5, interactive=False)
    score_chart = gr.Plot(label="π LO Match Chart")

    submit_btn.click(
        fn=compare_handouts,
        inputs=[old_handout, new_handout, outcomes_doc],
        outputs=[summary_box, score_chart],
    )
    # Clearing just blanks the outputs; inputs are left as-is.
    reset_btn.click(fn=lambda: ("", None), inputs=[], outputs=[summary_box, score_chart])

demo.launch()