File size: 3,926 Bytes
89ad98d
65fcd1d
02949b2
 
ace889b
02949b2
 
 
 
 
 
c6ee9d5
02949b2
c6ee9d5
02949b2
c6ee9d5
02949b2
 
 
65fcd1d
ace889b
c6ee9d5
 
02949b2
ace889b
02949b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6ee9d5
 
65fcd1d
89ad98d
c6ee9d5
 
02949b2
65fcd1d
02949b2
c6ee9d5
65fcd1d
 
c6ee9d5
02949b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5abbe67
65fcd1d
89ad98d
02949b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89ad98d
00d9a8a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111

import gradio as gr
import fitz  # PyMuPDF
import docx
import io
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
import numpy as np

# Sentence-embedding model, instantiated once at module import and used by
# semantic_match() to encode learning outcomes and handout text.
model = SentenceTransformer('all-MiniLM-L6-v2')

def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page of an in-memory PDF.

    Args:
        pdf_file: Raw PDF content, passed to PyMuPDF as a byte stream.

    Returns:
        The text of all pages concatenated in page order, with leading and
        trailing whitespace stripped. An empty string is returned when the
        document cannot be opened or read.
    """
    try:
        # The context manager closes the document even if extracting a page
        # fails part-way through, so the handle is never leaked.
        with fitz.open(stream=pdf_file, filetype="pdf") as pdf_reader:
            return "".join(page.get_text() for page in pdf_reader).strip()
    except Exception:
        # Deliberate best-effort: callers treat "" as "could not extract".
        return ""

def extract_text_from_docx(docx_file):
    """Return the non-empty paragraphs of an in-memory .docx document.

    Args:
        docx_file: Raw .docx content; it is wrapped in a BytesIO so that
            python-docx can read it like a file.

    Returns:
        A list holding the stripped text of every paragraph that contains
        non-whitespace characters, in document order. An empty list is
        returned when the document cannot be parsed.
    """
    try:
        doc = docx.Document(io.BytesIO(docx_file))
        stripped = (para.text.strip() for para in doc.paragraphs)
        return [text for text in stripped if text]
    except Exception:
        # Best-effort: callers treat [] as "no learning outcomes found".
        # `Exception` (not a bare except) so Ctrl-C / SystemExit still
        # propagate.
        return []

def tfidf_similarity(text1, text2):
    """Score the lexical similarity of two texts using TF-IDF vectors.

    Both texts are vectorised together and the dot product of their two
    TF-IDF rows is taken. With TfidfVectorizer's default L2 row
    normalisation this dot product is the cosine similarity.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The similarity as a float rounded to two decimals, or 0.0 when the
        texts cannot be vectorised (for example, an empty vocabulary).
    """
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
        # Use `@` and `.toarray()` rather than `*` and the `.A` shorthand:
        # `.A` has been dropped from newer SciPy sparse containers, and the
        # resulting AttributeError would be swallowed below, silently
        # turning every comparison into 0.0.
        similarity = (tfidf_matrix @ tfidf_matrix.T).toarray()[0, 1]
        return round(float(similarity), 2)
    except Exception:
        # Best-effort: callers expect a number, never an exception.
        return 0.0

def semantic_match(lo_list, content):
    """Score how closely a text matches each learning outcome.

    Each learning outcome and the content are encoded with the module-level
    sentence-embedding `model`, and the cosine similarity between the two
    embeddings is taken.

    Args:
        lo_list: Learning-outcome strings.
        content: The text to compare every learning outcome against.

    Returns:
        A list with one score per learning outcome, in the same order, each
        rounded to two decimals. A learning outcome whose scoring fails gets
        0.0; if the content itself cannot be encoded every score is 0.0.
    """
    lo_list = list(lo_list)
    if not lo_list:
        return []

    # The content embedding does not depend on the learning outcome, so it
    # is computed once instead of once per loop iteration.
    # NOTE(review): the embedding model presumably truncates inputs longer
    # than its maximum sequence length, so long handouts may only be scored
    # on their beginning - confirm against the model's documentation.
    try:
        content_embed = model.encode(content, convert_to_tensor=True)
    except Exception:
        return [0.0] * len(lo_list)

    scores = []
    for lo in lo_list:
        try:
            lo_embed = model.encode(lo, convert_to_tensor=True)
            sim = util.pytorch_cos_sim(lo_embed, content_embed).item()
            scores.append(round(sim, 2))
        except Exception:
            # Keep the result aligned with lo_list even if one item fails.
            scores.append(0.0)
    return scores

def compare_handouts(old_pdf, new_pdf, lo_file):
    """Compare an old and a new handout against a set of learning outcomes.

    Text is extracted from both PDFs and the learning outcomes are read from
    the .docx file. Each learning outcome is scored against both handouts
    with semantic_match(), and the overall textual change between the two
    handouts is derived from tfidf_similarity().

    Args:
        old_pdf: Raw content of the old handout PDF.
        new_pdf: Raw content of the new handout PDF.
        lo_file: Raw content of the learning-outcomes .docx file.

    Returns:
        A ``(summary, figure)`` tuple. ``summary`` is a human-readable
        report; ``figure`` is a matplotlib bar chart of the per-outcome
        scores for both handouts. When text extraction fails or no learning
        outcomes are found, ``summary`` is an error message and ``figure``
        is ``None``.
    """
    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)
    if not old_text or not new_text:
        return "❌ Could not extract text from one or both PDFs.", None

    lo_list = extract_text_from_docx(lo_file)
    if not lo_list:
        return "⚠️ No learning outcomes detected.", None

    old_scores = semantic_match(lo_list, old_text)
    new_scores = semantic_match(lo_list, new_text)

    # tfidf_similarity() yields a similarity in [0, 1]; the reported
    # "content change" is its complement expressed as a percentage.
    similarity = tfidf_similarity(old_text, new_text)
    score_pairs = list(zip(new_scores, old_scores))
    improved_count = sum(new > old for new, old in score_pairs)
    # "Matched" counts outcomes whose new score is at least the old score.
    matched_count = sum(new >= old for new, old in score_pairs)

    if improved_count > 0:
        verdict = "🟢 New content appears more detailed and informative."
    else:
        verdict = "⚠️ No significant improvement detected."

    summary = (
        f"📈 Content Change: {round((1 - similarity) * 100, 2)}%\n"
        f"🎯 Matched LOs: {matched_count} of {len(lo_list)}\n"
        f"{verdict}"
    )

    # Grouped bar chart: one old/new pair of bars per learning outcome.
    x = np.arange(len(lo_list))
    width = 0.35
    fig, ax = plt.subplots()
    ax.bar(x - width / 2, old_scores, width, label='Old')
    ax.bar(x + width / 2, new_scores, width, label='New')
    ax.set_ylabel('Match Score (0-1)')
    ax.set_title('LO-wise Match Score: Old vs New')
    ax.set_xticks(x)
    ax.set_xticklabels(
        [f"LO{number}" for number in range(1, len(lo_list) + 1)],
        rotation=45,
    )
    ax.legend()
    # Lay out this specific figure rather than pyplot's global "current
    # figure", which another concurrent request could have replaced.
    fig.tight_layout()
    # Drop the figure from pyplot's global registry so a long-running server
    # does not accumulate one open figure per request; the Figure object
    # itself stays valid for the caller to render.
    plt.close(fig)

    return summary, fig

# Gradio UI: three uploads (old PDF, new PDF, learning outcomes), a submit
# button that runs compare_handouts(), and a clear button that resets the
# two outputs.
with gr.Blocks() as demo:
    gr.Markdown("📘 **Educational Content Comparator**")
    gr.Markdown("Upload 2 handouts and a .docx file of Learning Outcomes to compare changes and alignment.")

    with gr.Row():
        # type="binary" hands the uploaded file's raw bytes to the callback.
        old_pdf = gr.File(label="📂 Upload Old PDF", file_types=[".pdf"], type="binary")
        new_pdf = gr.File(label="📂 Upload New PDF", file_types=[".pdf"], type="binary")
        lo_file = gr.File(label="📂 Upload Learning Outcomes (.docx)", file_types=[".docx"], type="binary")

    with gr.Row():
        btn = gr.Button("Submit")
        clear_btn = gr.Button("Clear")

    output_text = gr.Textbox(label="📋 Summary", lines=5, interactive=False)
    output_plot = gr.Plot(label="📊 LO Match Chart")

    btn.click(fn=compare_handouts, inputs=[old_pdf, new_pdf, lo_file], outputs=[output_text, output_plot])
    # Resets the summary text and the chart; the uploaded files are kept.
    clear_btn.click(fn=lambda: ("", None), inputs=[], outputs=[output_text, output_plot])

demo.launch()