File size: 4,144 Bytes
28e421a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import gradio as gr
import fitz  # PyMuPDF
import docx
import io
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
import numpy as np
from difflib import SequenceMatcher

# Sentence-embedding model used by semantic_match() for LO-to-content
# cosine similarity; loaded once at module import time.
model = SentenceTransformer('all-MiniLM-L6-v2')

def extract_text_from_pdf(pdf_file):
    """Extract plain text from a PDF supplied as raw bytes.

    Parameters:
        pdf_file: PDF file content as bytes (Gradio ``type="binary"`` upload).

    Returns:
        Concatenated text of all pages with surrounding whitespace
        stripped, or ``""`` if the PDF cannot be opened or read.
    """
    try:
        # Context manager guarantees the document is closed even if a
        # page raises mid-iteration — the original only closed on the
        # success path, leaking the handle on error.
        with fitz.open(stream=pdf_file, filetype="pdf") as doc:
            # join() avoids quadratic `+=` string building on large PDFs.
            return "".join(page.get_text() for page in doc).strip()
    except Exception:
        # Deliberate best-effort: callers treat "" as "could not
        # extract" and surface a user-facing warning.
        return ""

def normalize_text(text):
    """Return *text* lowercased, trimmed, and with every run of
    whitespace collapsed to a single space, so comparisons ignore
    case and formatting differences."""
    lowered = text.strip().lower()
    return re.sub(r"\s+", " ", lowered)

def extract_text_from_docx(docx_file):
    """Extract non-empty paragraph texts from a .docx supplied as raw bytes.

    Parameters:
        docx_file: .docx file content as bytes.

    Returns:
        List of stripped, non-empty paragraph strings (used as one
        learning outcome per paragraph), or ``[]`` if parsing fails.
    """
    try:
        doc = docx.Document(io.BytesIO(docx_file))
        return [para.text.strip() for para in doc.paragraphs if para.text.strip()]
    except Exception:
        # `except Exception:` instead of the original bare `except:` so
        # SystemExit/KeyboardInterrupt are no longer swallowed.
        return []

def semantic_match(lo_list, content):
    """Score each learning outcome against *content* by cosine similarity.

    Parameters:
        lo_list: list of learning-outcome strings.
        content: full document text to match against.

    Returns:
        List of floats rounded to 2 decimals, one per LO, in the same
        order as *lo_list*; 0.0 is substituted for any LO whose
        encoding or scoring fails.
    """
    try:
        # Encode the document exactly once — the original re-encoded the
        # full content inside the loop, repeating the most expensive
        # model call once per learning outcome.
        content_embed = model.encode(content, convert_to_tensor=True)
    except Exception:
        return [0.0] * len(lo_list)

    scores = []
    for lo in lo_list:
        try:
            lo_embed = model.encode(lo, convert_to_tensor=True)
            sim = util.pytorch_cos_sim(lo_embed, content_embed).item()
            scores.append(round(sim, 2))
        except Exception:
            # Per-LO failures degrade to 0.0 rather than aborting the
            # whole comparison (preserves original best-effort behavior).
            scores.append(0.0)
    return scores

def content_change_score(text1, text2):
    """Estimate how much *text2* differs from *text1*, as a percentage.

    Both texts are normalized (case and whitespace) before diffing, so
    pure formatting changes score as 0% change.

    Returns:
        Float in [0, 100]: 0.0 means identical after normalization,
        100.0 means entirely different (or the comparison failed).
    """
    try:
        ratio = SequenceMatcher(
            None, normalize_text(text1), normalize_text(text2)
        ).ratio()
        return round((1 - ratio) * 100, 2)
    except Exception:
        # `except Exception:` replaces the original bare `except:`.
        # Any failure is treated as "fully changed" so the UI still renders.
        return 100.0

def compare_handouts(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs against a set of learning outcomes.

    Parameters:
        old_pdf: bytes of the previous handout PDF.
        new_pdf: bytes of the updated handout PDF.
        lo_file: bytes of a .docx with one learning outcome per paragraph.

    Returns:
        ``(summary_text, figure)`` on success, or ``(warning_text, None)``
        when extraction of either PDF or the LO list fails.
    """
    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)

    # Guard against near-empty extractions (e.g. scanned/image-only PDFs).
    if len(old_text.strip()) < 200 or len(new_text.strip()) < 200:
        return "⚠️ Could not extract meaningful content from one or both PDFs.", None

    lo_list = extract_text_from_docx(lo_file)
    if not lo_list:
        return "⚠️ No learning outcomes detected.", None

    old_scores = semantic_match(lo_list, old_text)
    new_scores = semantic_match(lo_list, new_text)

    change_percent = content_change_score(old_text, new_text)
    score_pairs = list(zip(new_scores, old_scores))
    improved_count = sum(new > old for new, old in score_pairs)
    matched_los = sum(new >= old for new, old in score_pairs)

    # Assemble the three-line summary shown in the UI textbox.
    summary_lines = [
        f"📈 Content Change Estimate: {change_percent}%",
        f"🧠 LO Alignment: {matched_los} of {len(lo_list)} learning outcomes matched",
    ]
    if improved_count > 0:
        summary_lines.append("🟢 Summary: New handout has improved structure and added clarity.")
    else:
        summary_lines.append("⚠️ Summary: No significant improvement in LO alignment.")
    summary = "\n".join(summary_lines)

    # Grouped bar chart: per-LO match score, old vs. new side by side.
    positions = np.arange(len(lo_list))
    bar_width = 0.35
    fig, axis = plt.subplots()
    axis.bar(positions - bar_width / 2, old_scores, bar_width, label='Old')
    axis.bar(positions + bar_width / 2, new_scores, bar_width, label='New')
    axis.set_ylabel('Match Score (0-1)')
    axis.set_title('LO-wise Match Score: Old vs New')
    axis.set_xticks(positions)
    axis.set_xticklabels([f"LO{idx + 1}" for idx in range(len(lo_list))], rotation=45)
    axis.legend()
    plt.tight_layout()

    return summary, fig

with gr.Blocks() as demo:
    gr.Markdown("πŸ“˜ **Educational Content Comparator**")
    gr.Markdown("Upload 2 handouts and a .docx file of Learning Outcomes to compare changes and alignment.")

    with gr.Row():
        old_pdf = gr.File(label="πŸ“‚ Upload Old PDF", file_types=[".pdf"], type="binary")
        new_pdf = gr.File(label="πŸ“‚ Upload New PDF", file_types=[".pdf"], type="binary")
        lo_file = gr.File(label="πŸ“‚ Upload Learning Outcomes (.docx)", file_types=[".docx"], type="binary")

    with gr.Row():
        btn = gr.Button("Submit")
        clear_btn = gr.Button("Clear")

    output_text = gr.Textbox(label="πŸ“‹ Summary", lines=5, interactive=False)
    output_plot = gr.Plot(label="πŸ“Š LO Match Chart")

    btn.click(fn=compare_handouts, inputs=[old_pdf, new_pdf, lo_file], outputs=[output_text, output_plot])
    clear_btn.click(fn=lambda: ("", None), inputs=[], outputs=[output_text, output_plot])

demo.launch()