File size: 4,103 Bytes
624ba1c
62adbdf
e1fb09a
624ba1c
8751b01
 
62adbdf
bc78525
9af4462
 
ef62d76
62adbdf
624ba1c
62adbdf
624ba1c
 
b0754aa
624ba1c
 
 
9af4462
8751b01
9af4462
8751b01
bc78525
9af4462
8751b01
 
 
 
 
 
 
98a66f5
3378b7b
4c891c6
624ba1c
 
 
 
9af4462
624ba1c
b0754aa
 
624ba1c
9af4462
 
ac4d1e6
8751b01
 
 
624ba1c
9af4462
 
 
3378b7b
9af4462
 
 
624ba1c
 
 
4c891c6
b0754aa
8751b01
 
624ba1c
9af4462
 
 
8751b01
 
 
9af4462
 
 
 
3378b7b
9af4462
 
 
 
3378b7b
 
 
8751b01
9af4462
 
 
 
 
8751b01
3378b7b
9af4462
3378b7b
3c108e3
9af4462
 
3c108e3
 
4c891c6
3c108e3
624ba1c
 
b50e58b
3c108e3
 
9af4462
 
 
 
3c108e3
9af4462
 
3c108e3
 
bc78525
9af4462
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119

import gradio as gr
from PyPDF2 import PdfReader
import io
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re


def extract_text_from_pdf(pdf_file):
    """Return all extractable text from a PDF supplied as raw bytes.

    Pages that yield no text are skipped. On any parse failure the error is
    printed and an empty string is returned (best-effort behaviour, so the
    caller can report a user-facing message instead of crashing).
    """
    try:
        pages = PdfReader(io.BytesIO(pdf_file)).pages
        extracted = [page.extract_text() for page in pages]
        return "".join(chunk for chunk in extracted if chunk).strip()
    except Exception as exc:
        print("PDF extraction error:", exc)
        return ""


def semantic_match(lo_list, content):
    """Score each learning outcome against *content* via TF-IDF cosine similarity.

    Parameters
    ----------
    lo_list : list[str]
        Candidate learning-outcome strings; blank/whitespace-only entries
        are skipped.
    content : str
        Document text to match the outcomes against.

    Returns
    -------
    list[float]
        One similarity score in [0, 1] per non-blank outcome, in input order.
    """
    lo_texts = [lo for lo in lo_list if lo.strip()]
    if not lo_texts:
        return []  # nothing to score; avoids fitting a pointless vectorizer

    # Fit a single TF-IDF space over the content plus all outcomes so the
    # vectors are directly comparable.
    matrix = TfidfVectorizer().fit_transform([content] + lo_texts)

    # One vectorized call on the sparse matrix replaces the original
    # per-outcome Python loop (and the dense toarray() copy it required).
    scores = cosine_similarity(matrix[0:1], matrix[1:])[0]
    return scores.tolist()


def compare_all(old_pdf, new_pdf, lo_file):
    """Compare an old and new handout PDF against a list of learning outcomes.

    Parameters
    ----------
    old_pdf, new_pdf : bytes
        Raw bytes of the two handout PDFs (Gradio `type='binary'` files).
    lo_file : bytes
        Raw bytes of a UTF-8 text file with one learning outcome per line.

    Returns
    -------
    tuple
        (summary markdown str, pandas.DataFrame, matplotlib Figure, status str).
        On failure, (error message, None, None, None).
    """
    try:
        los = lo_file.decode("utf-8", errors="ignore").splitlines()
        los = [lo.strip() for lo in los if lo.strip()]
    except Exception:  # narrow from bare except: don't swallow KeyboardInterrupt etc.
        return "❌ Could not read learning outcomes file.", None, None, None

    if not los:
        # Guard: an empty LO file would otherwise produce an empty chart/table.
        return "❌ Learning outcomes file contains no outcomes.", None, None, None

    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)

    if not old_text or not new_text:
        return "❌ Could not extract text from one or both PDFs.", None, None, None

    old_scores = semantic_match(los, old_text)
    new_scores = semantic_match(los, new_text)

    labels = [f"LO{i+1}" for i in range(len(los))]
    x = np.arange(len(labels))

    # Grouped bar chart: old vs new match score per learning outcome.
    fig, ax = plt.subplots()
    ax.bar(x - 0.2, old_scores, width=0.4, label="Old", align='center')
    ax.bar(x + 0.2, new_scores, width=0.4, label="New", align='center')
    ax.set_xticks(x)
    ax.set_xticklabels(labels, rotation=45)
    ax.set_ylabel("Semantic Match Score")
    ax.set_title("Learning Outcomes Comparison")
    ax.legend()

    # Per-outcome comparison table (scores shown as percentages).
    data = {
        "Learning Outcome": labels,
        "Old Match (%)": [round(s * 100, 2) for s in old_scores],
        "New Match (%)": [round(s * 100, 2) for s in new_scores],
        "Change (%)": [round((new - old) * 100, 2) for new, old in zip(new_scores, old_scores)]
    }
    df = pd.DataFrame(data)

    # Whole-document similarity between the two handouts.
    tfidf = TfidfVectorizer().fit_transform([old_text, new_text])
    cosine_sim = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0] * 100
    content_diff = 100 - round(cosine_sim, 2)

    # Word-count delta; guard against a zero-word old handout (e.g. a PDF
    # whose extracted text contains no \w tokens) to avoid ZeroDivisionError.
    len_old = len(re.findall(r'\w+', old_text))
    len_new = len(re.findall(r'\w+', new_text))
    word_change_percent = (
        round(((len_new - len_old) / len_old) * 100, 2) if len_old else 0.0
    )

    summary = f"""
πŸ“˜ **Summary of Comparison**

πŸ“ˆ **Overall Content Change**: {content_diff:.2f}%
πŸ” This is based on TF-IDF cosine similarity between old and new handouts.

πŸ“ **Text Length Difference**: {'+' if word_change_percent >= 0 else ''}{word_change_percent:.2f}% 
Compared by total number of words in both handouts.

🎯 **Learning Outcome Matches**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}
βœ… New handout appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with stated outcomes.
"""

    return summary.strip(), df, fig, "βœ… Comparison completed successfully."


# Gradio UI wiring: three binary file inputs (old PDF, new PDF, plain-text
# learning-outcomes file) feed compare_all, whose 4-tuple return
# (summary str, DataFrame, matplotlib Figure, status str) maps positionally
# onto the four output components below.
iface = gr.Interface(
    fn=compare_all,
    inputs=[
        gr.File(label="Old Handout PDF", type='binary'),
        gr.File(label="New Handout PDF", type='binary'),
        gr.File(label="Learning Outcomes (Text File)", type='binary'),
    ],
    outputs=[
        gr.Textbox(label="πŸ“˜ Summary & Insights", lines=20, max_lines=25),
        gr.Dataframe(label="πŸ“Š LO-wise Comparison Table"),
        gr.Plot(label="πŸ“ˆ LO Visual Comparison"),
        gr.Textbox(label="ℹ️ Status", lines=1)
    ],
    title="πŸ“˜ Handout Comparator + LO Analyzer",
    description="Upload OLD and NEW handouts in PDF format along with a TXT file of Learning Outcomes. The app compares content changes and evaluates alignment with LOs visually and in table format."
)

# Start the local Gradio server (blocking call).
iface.launch()