File size: 4,781 Bytes
e0b1653
671e233
e0b1653
 
 
 
 
 
 
 
 
671e233
 
e0b1653
671e233
e0b1653
671e233
 
e0b1653
671e233
e0b1653
 
671e233
 
 
e0b1653
 
671e233
 
 
 
 
 
 
 
 
 
 
 
e0b1653
671e233
 
 
 
 
 
 
 
 
 
 
e0b1653
 
 
671e233
 
 
e0b1653
 
 
 
 
671e233
e0b1653
 
671e233
 
 
 
 
 
e0b1653
 
 
671e233
e0b1653
 
 
 
 
 
 
 
 
 
671e233
 
e0b1653
 
671e233
e0b1653
 
 
671e233
e0b1653
671e233
 
 
e0b1653
671e233
 
e0b1653
 
671e233
 
 
e0b1653
 
 
 
 
 
671e233
e0b1653
 
671e233
e0b1653
671e233
 
e0b1653
671e233
 
e0b1653
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125


import gradio as gr
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util
import matplotlib.pyplot as plt
import pandas as pd
import io

# Load transformer model for semantic similarity
# Instantiated once at import time so every request reuses the same weights;
# NOTE(review): first run presumably downloads the model from the HF hub — confirm
# network access is acceptable in the deployment environment.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def extract_text_from_pdf(file_bytes):
    """Return the concatenated text of all pages in a PDF given as raw bytes.

    Any parsing/extraction failure is reported to stdout and an empty
    string is returned instead of raising.
    """
    try:
        pdf = PdfReader(io.BytesIO(file_bytes))
        # extract_text() may yield None for image-only pages; substitute "".
        page_texts = [page.extract_text() or "" for page in pdf.pages]
        return " ".join(page_texts).strip()
    except Exception as exc:
        print("Error extracting text:", exc)
        return ""

def tfidf_similarity(text1, text2):
    """Cosine similarity between the TF-IDF vectors of two documents."""
    tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return similarity[0][0]

def transformer_similarity(text1, text2):
    """Semantic cosine similarity of two texts via the module-level model."""
    first_emb, second_emb = model.encode([text1, text2], convert_to_tensor=True)
    return util.pytorch_cos_sim(first_emb, second_emb).item()

def bloom_level(term):
    """Classify a learning-outcome phrase into a Bloom's-taxonomy level.

    Matching is done on word *prefixes* rather than raw substrings, so
    inflected verbs still match ("defines" -> Remember, "applying" -> Apply)
    but accidental substring hits no longer do (previously "use" matched
    inside "house", "list" inside "enlisted", etc.).

    Args:
        term: Free-text learning-outcome phrase.

    Returns:
        The capitalized level name ("Remember" ... "Create"), or "Unknown"
        when no keyword matches.
    """
    blooms = {
        "remember": ["define", "list", "recall", "identify"],
        "understand": ["explain", "describe", "summarize"],
        "apply": ["apply", "demonstrate", "use"],
        "analyze": ["analyze", "compare", "contrast"],
        "evaluate": ["evaluate", "judge", "critique"],
        "create": ["create", "design", "formulate"]
    }
    # Split into words and strip surrounding punctuation so "Define:" matches.
    tokens = [word.strip(".,;:!?()[]\"'") for word in term.lower().split()]
    # Levels are checked in taxonomy order; the first level with a hit wins,
    # matching the original dict-iteration behavior.
    for level, keywords in blooms.items():
        if any(tok.startswith(kw) for kw in keywords for tok in tokens):
            return level.capitalize()
    return "Unknown"

def lo_semantic_scores(los, content):
    """Score each learning outcome against *content* with the transformer model.

    Args:
        los: Iterable of learning-outcome strings.
        content: Document text to compare each outcome against.

    Returns:
        A list of similarity floats parallel to ``los`` (empty for empty input).
    """
    # Comprehension replaces the original append loop (same order, same values).
    return [transformer_similarity(lo, content) for lo in los]

def compare_all(old_pdf, new_pdf, lo_file):
    """Gradio callback: compare two handout PDFs against a learning-outcome list.

    Args:
        old_pdf: Raw bytes of the old handout PDF (gr.File type='binary').
        new_pdf: Raw bytes of the new handout PDF.
        lo_file: Learning outcomes as raw bytes or a file-like object,
            UTF-8 text with one outcome per non-blank line.

    Returns:
        A 4-tuple matching the interface outputs:
        (markdown summary, pandas DataFrame, matplotlib Figure, preview text).
        On an input error the summary carries the error and the other three
        slots are None.
    """
    try:
        # Accept either a file-like object or raw bytes; decode permissively.
        lo_content = lo_file.read().decode("utf-8", errors="ignore") if hasattr(lo_file, "read") else lo_file.decode("utf-8", errors="ignore")
        los = [line.strip() for line in lo_content.splitlines() if line.strip()]
    except Exception as e:
        return "❌ Could not read learning outcomes file.", None, None, None

    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)

    # extract_text_from_pdf returns "" on failure, so this guards both a parse
    # failure and a genuinely empty document (also protects the division below).
    if not old_text or not new_text:
        return "❌ Could not extract text from one or both PDFs.", None, None, None

    # Whole-document similarity: lexical (TF-IDF) and semantic (transformer).
    tfidf_sim = tfidf_similarity(old_text, new_text)
    transformer_sim = transformer_similarity(old_text, new_text)
    # Relative length change in percent, negative when the new text is shorter.
    text_growth = round(((len(new_text) - len(old_text)) / len(old_text)) * 100, 2)

    # Per-outcome semantic match against each document version.
    old_scores = lo_semantic_scores(los, old_text)
    new_scores = lo_semantic_scores(los, new_text)

    # Grouped bar chart: old vs. new match score per learning outcome.
    labels = [f"LO{i+1}" for i in range(len(los))]
    x = range(len(labels))
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(x, old_scores, width=0.4, label="Old", align='center')
    # Offset the second series by one bar width; ticks sit between the pairs.
    ax.bar([i + 0.4 for i in x], new_scores, width=0.4, label="New", align='center')
    ax.set_xticks([i + 0.2 for i in x])
    ax.set_xticklabels(labels, rotation=45)
    ax.set_ylabel("Semantic Match Score")
    ax.set_title("Learning Outcomes Comparison")
    ax.legend()

    # Tabular view: scores scaled to percentages, plus Bloom classification.
    data = {
        "Learning Outcome": labels,
        "LO Text": los,
        "Bloom Level": [bloom_level(lo) for lo in los],
        "Old Match": [round(s*100, 2) for s in old_scores],
        "New Match": [round(s*100, 2) for s in new_scores],
        "Change (%)": [round((n - o)*100, 2) for n, o in zip(new_scores, old_scores)]
    }
    df = pd.DataFrame(data)

    # Markdown summary; an LO counts as "matched" above a 0.5 similarity.
    summary = f"""πŸ“˜ **Summary of Comparison**

πŸ“ˆ **TF-IDF Content Change**: {round((1 - tfidf_sim) * 100, 2)}%
🧠 **Transformer-based Similarity**: {round(transformer_sim * 100, 2)}%
πŸ“ **Content Length Change**: {text_growth}% {"πŸ“‰ Reduced" if text_growth < 0 else "πŸ“ˆ Increased"}

🎯 **LO Matches**: {sum(1 for score in new_scores if score > 0.5)} of {len(los)}
πŸ“Š **Content appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with learning outcomes.**
"""

    # Preview is truncated to keep the textbox payload small.
    return summary, df, fig, new_text[:2000] + "..."

# Gradio UI wiring. (The duplicate `import gradio as gr` that was here is
# unnecessary — gradio is already imported at the top of the file.)
iface = gr.Interface(
    fn=compare_all,
    inputs=[
        gr.File(label="Old Handout PDF", type='binary'),
        gr.File(label="New Handout PDF", type='binary'),
        gr.File(label="Learning Outcomes (TXT)", type='binary')
    ],
    outputs=[
        gr.Markdown(label="πŸ“‹ Summary"),
        gr.Dataframe(label="πŸ“Š LO-wise Comparison Table"),
        gr.Plot(label="πŸ“ˆ LO Match Chart"),
        gr.Textbox(label="πŸ“ Preview of New Content")
    ],
    title="πŸ“˜ AI Handout Comparator + LO Aligner",
    description="Compare two versions of handouts using both TF-IDF and Transformers. Analyze changes in content, alignment with Learning Outcomes, and Bloom’s taxonomy level."
)

# Guard the server launch so importing this module (e.g. for testing) does not
# start the web app as a side effect.
if __name__ == "__main__":
    iface.launch()