Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
from PyPDF2 import PdfReader
|
| 4 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
@@ -8,71 +9,70 @@ import matplotlib.pyplot as plt
|
|
| 8 |
import pandas as pd
|
| 9 |
import io
|
| 10 |
|
| 11 |
-
# Load
|
| 12 |
-
model = SentenceTransformer('
|
| 13 |
|
| 14 |
-
def extract_text_from_pdf(
|
| 15 |
try:
|
| 16 |
-
reader = PdfReader(io.BytesIO(
|
| 17 |
-
|
| 18 |
-
for page in reader.pages:
|
| 19 |
-
text = page.extract_text()
|
| 20 |
-
if text:
|
| 21 |
-
full_text += text
|
| 22 |
-
return full_text
|
| 23 |
except Exception as e:
|
|
|
|
| 24 |
return ""
|
| 25 |
|
| 26 |
-
def
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
vectors = vectorizer.toarray()
|
| 30 |
-
content_vec = vectors[0]
|
| 31 |
-
scores = [cosine_similarity([content_vec], [vec])[0][0] for vec in vectors[1:]]
|
| 32 |
-
return scores
|
| 33 |
|
| 34 |
def transformer_similarity(text1, text2):
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
"
|
| 42 |
-
"
|
| 43 |
-
"
|
| 44 |
-
"
|
| 45 |
-
"
|
| 46 |
-
"
|
| 47 |
}
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
|
| 56 |
def compare_all(old_pdf, new_pdf, lo_file):
|
| 57 |
try:
|
| 58 |
-
|
| 59 |
-
los = [
|
| 60 |
-
except:
|
| 61 |
return "β Could not read learning outcomes file.", None, None, None
|
| 62 |
|
| 63 |
old_text = extract_text_from_pdf(old_pdf)
|
| 64 |
new_text = extract_text_from_pdf(new_pdf)
|
| 65 |
|
| 66 |
-
if not old_text
|
| 67 |
return "β Could not extract text from one or both PDFs.", None, None, None
|
| 68 |
|
| 69 |
-
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
-
# Bar Plot
|
| 73 |
labels = [f"LO{i+1}" for i in range(len(los))]
|
| 74 |
x = range(len(labels))
|
| 75 |
-
fig, ax = plt.subplots()
|
| 76 |
ax.bar(x, old_scores, width=0.4, label="Old", align='center')
|
| 77 |
ax.bar([i + 0.4 for i in x], new_scores, width=0.4, label="New", align='center')
|
| 78 |
ax.set_xticks([i + 0.2 for i in x])
|
|
@@ -81,63 +81,45 @@ def compare_all(old_pdf, new_pdf, lo_file):
|
|
| 81 |
ax.set_title("Learning Outcomes Comparison")
|
| 82 |
ax.legend()
|
| 83 |
|
| 84 |
-
# Table
|
| 85 |
data = {
|
| 86 |
"Learning Outcome": labels,
|
|
|
|
|
|
|
| 87 |
"Old Match": [round(s*100, 2) for s in old_scores],
|
| 88 |
"New Match": [round(s*100, 2) for s in new_scores],
|
| 89 |
-
"Change (%)": [round((
|
| 90 |
}
|
| 91 |
df = pd.DataFrame(data)
|
| 92 |
|
| 93 |
-
|
| 94 |
-
tfidf_sim = cosine_similarity(
|
| 95 |
-
TfidfVectorizer().fit_transform([old_text, new_text])
|
| 96 |
-
)[0][1]
|
| 97 |
-
tfidf_percent = round(tfidf_sim * 100, 2)
|
| 98 |
-
|
| 99 |
-
# Sentence Transformer Similarity
|
| 100 |
-
try:
|
| 101 |
-
trans_sim = round(transformer_similarity(old_text, new_text) * 100, 2)
|
| 102 |
-
except:
|
| 103 |
-
trans_sim = "N/A"
|
| 104 |
-
|
| 105 |
-
# Text Length Change
|
| 106 |
-
length_change = round(((len(new_text) - len(old_text)) / len(old_text)) * 100, 2)
|
| 107 |
-
|
| 108 |
-
# Bloom's Taxonomy Quality Index
|
| 109 |
-
quality_index = calculate_quality_index(new_text)
|
| 110 |
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
βοΈ **Text Length Change**: {length_change}% ({'more' if length_change > 0 else 'less'} content)
|
| 118 |
-
π **Quality Index (Bloom's Taxonomy)**: {quality_index}%
|
| 119 |
-
|
| 120 |
-
π― **Matched Learning Outcomes**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}
|
| 121 |
-
π **Insight**: New content appears {'better' if sum(new_scores) > sum(old_scores) else 'less'} aligned with outcomes.
|
| 122 |
"""
|
| 123 |
|
| 124 |
-
return summary, df, fig,
|
|
|
|
|
|
|
| 125 |
|
| 126 |
iface = gr.Interface(
|
| 127 |
fn=compare_all,
|
| 128 |
inputs=[
|
| 129 |
gr.File(label="Old Handout PDF", type='binary'),
|
| 130 |
gr.File(label="New Handout PDF", type='binary'),
|
| 131 |
-
gr.File(label="Learning Outcomes (TXT)", type='binary')
|
| 132 |
],
|
| 133 |
outputs=[
|
| 134 |
-
gr.Markdown(label="
|
| 135 |
gr.Dataframe(label="π LO-wise Comparison Table"),
|
| 136 |
-
gr.Plot(label="
|
| 137 |
-
gr.
|
| 138 |
],
|
| 139 |
-
title="
|
| 140 |
-
description="Compare two handouts and
|
| 141 |
)
|
| 142 |
|
| 143 |
iface.launch()
|
|
|
|
| 1 |
|
| 2 |
+
|
| 3 |
import gradio as gr
|
| 4 |
from PyPDF2 import PdfReader
|
| 5 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
|
|
|
| 9 |
import pandas as pd
|
| 10 |
import io
|
| 11 |
|
| 12 |
+
# Load the SentenceTransformer once at module import; reused by
# transformer_similarity() for every semantic comparison in this app.
# NOTE(review): this downloads the model on first run — requires network
# access at startup; confirm the Space has the model cached.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
|
| 14 |
|
| 15 |
+
def extract_text_from_pdf(file_bytes):
    """Extract all page text from a PDF supplied as raw bytes.

    Returns the page texts joined by single spaces and stripped, or ""
    when the PDF cannot be opened or parsed (best-effort behaviour).
    """
    try:
        document = PdfReader(io.BytesIO(file_bytes))
        page_texts = []
        for page in document.pages:
            # extract_text() may return None for image-only pages.
            page_texts.append(page.extract_text() or "")
        return " ".join(page_texts).strip()
    except Exception as err:
        # Best-effort: report the problem and hand back an empty string
        # so the caller can show a user-facing error instead of crashing.
        print("Error extracting text:", err)
        return ""
|
| 22 |
|
| 23 |
+
def tfidf_similarity(text1, text2):
    """Return the cosine similarity between the TF-IDF vectors of two texts."""
    # Fit a fresh vectorizer on exactly these two documents; row 0 is
    # text1's vector, row 1 is text2's.
    tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    first_row = tfidf_matrix[0:1]
    second_row = tfidf_matrix[1:2]
    return cosine_similarity(first_row, second_row)[0][0]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
def transformer_similarity(text1, text2):
    """Return semantic cosine similarity of two texts as a plain float.

    Uses the module-level SentenceTransformer `model` to embed both texts
    in one batch, then compares the two embeddings.
    """
    first_emb, second_emb = model.encode([text1, text2], convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(first_emb, second_emb)
    return similarity.item()
|
| 30 |
+
|
| 31 |
+
def bloom_level(term):
    """Classify a learning-outcome phrase into a Bloom's-taxonomy level.

    Scans the phrase for characteristic action verbs and returns the
    first matching level name ("Remember", "Understand", "Apply",
    "Analyze", "Evaluate", "Create"), or "Unknown" when none is found.

    Matching is prefix-based per word: inflected forms still match
    ("defines" -> "define", "using" -> "use"), but mid-word substrings no
    longer cause false positives — the original raw `word in term` check
    classified e.g. "because" as Apply, since "because" contains "use".
    """
    tokens = term.lower().split()
    blooms = {
        "remember": ["define", "list", "recall", "identify"],
        "understand": ["explain", "describe", "summarize"],
        "apply": ["apply", "demonstrate", "use"],
        "analyze": ["analyze", "compare", "contrast"],
        "evaluate": ["evaluate", "judge", "critique"],
        "create": ["create", "design", "formulate"]
    }
    # dict preserves insertion order, so lower taxonomy levels still win
    # ties exactly as in the original implementation.
    for level, keywords in blooms.items():
        if any(tok.startswith(kw) for tok in tokens for kw in keywords):
            return level.capitalize()
    return "Unknown"
|
| 45 |
+
|
| 46 |
+
def lo_semantic_scores(los, content):
    """Score every learning outcome against `content`.

    Returns one transformer-similarity float per outcome, in the same
    order as `los`.
    """
    return [transformer_similarity(outcome, content) for outcome in los]
|
| 52 |
|
| 53 |
def compare_all(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs against a list of learning outcomes.

    Parameters (presumably raw bytes from gr.File(type='binary') — TODO
    confirm against the Interface definition):
        old_pdf, new_pdf: PDF file contents.
        lo_file: text file contents; one learning outcome per line.

    Returns a 4-tuple (summary_markdown, dataframe, figure, preview_text),
    or (error_message, None, None, None) on failure.
    """
    try:
        # Accept either a file-like object or raw bytes for the LO file.
        lo_content = lo_file.read().decode("utf-8", errors="ignore") if hasattr(lo_file, "read") else lo_file.decode("utf-8", errors="ignore")
        # One learning outcome per non-blank line.
        los = [line.strip() for line in lo_content.splitlines() if line.strip()]
    except Exception as e:
        # NOTE(review): broad except and `e` is unused — consider logging it.
        return "β Could not read learning outcomes file.", None, None, None

    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)

    # extract_text_from_pdf returns "" on failure, so this guard also
    # ensures len(old_text) > 0 before the division below.
    if not old_text or not new_text:
        return "β Could not extract text from one or both PDFs.", None, None, None

    # Whole-document similarity metrics (lexical and semantic).
    tfidf_sim = tfidf_similarity(old_text, new_text)
    transformer_sim = transformer_similarity(old_text, new_text)
    # Percent change in character count relative to the old handout.
    text_growth = round(((len(new_text) - len(old_text)) / len(old_text)) * 100, 2)

    # Per-outcome semantic alignment for each version.
    old_scores = lo_semantic_scores(los, old_text)
    new_scores = lo_semantic_scores(los, new_text)

    # Grouped bar chart: old vs. new alignment per learning outcome.
    labels = [f"LO{i+1}" for i in range(len(los))]
    x = range(len(labels))
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(x, old_scores, width=0.4, label="Old", align='center')
    ax.bar([i + 0.4 for i in x], new_scores, width=0.4, label="New", align='center')
    ax.set_xticks([i + 0.2 for i in x])
    # NOTE(review): two axis-setup lines (file lines 79-80, likely tick
    # labels / y-label) are hidden by the diff hunk boundary — confirm
    # against the full file before relying on this reconstruction.
    ax.set_title("Learning Outcomes Comparison")
    ax.legend()

    # Tabular breakdown shown in the gr.Dataframe output.
    data = {
        "Learning Outcome": labels,
        "LO Text": los,
        "Bloom Level": [bloom_level(lo) for lo in los],
        "Old Match": [round(s*100, 2) for s in old_scores],
        "New Match": [round(s*100, 2) for s in new_scores],
        # Scores are 0-1 similarities, so *100 converts to percent points.
        "Change (%)": [round((n - o)*100, 2) for n, o in zip(new_scores, old_scores)]
    }
    df = pd.DataFrame(data)

    # Markdown summary for the gr.Markdown output. The leading "π"/"β"
    # characters appear to be mojibake of emoji — preserved as-is.
    summary = f"""π **Summary of Comparison**

π **TF-IDF Content Change**: {round((1 - tfidf_sim) * 100, 2)}%
π§ **Transformer-based Similarity**: {round(transformer_sim * 100, 2)}%
π **Content Length Change**: {text_growth}% {"π Reduced" if text_growth < 0 else "π Increased"}

π― **LO Matches**: {sum(1 for score in new_scores if score > 0.5)} of {len(los)}
π **Content appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with learning outcomes.**
"""

    # Preview is truncated to the first 2000 characters of the new handout.
    return summary, df, fig, new_text[:2000] + "..."
|
| 105 |
+
|
| 106 |
+
# Gradio UI wiring: three file inputs -> summary, table, chart, preview.
# (The redundant mid-file `import gradio as gr` was removed — gradio is
# already imported at the top of the file.)
iface = gr.Interface(
    fn=compare_all,
    inputs=[
        gr.File(label="Old Handout PDF", type='binary'),
        gr.File(label="New Handout PDF", type='binary'),
        gr.File(label="Learning Outcomes (TXT)", type='binary')
    ],
    outputs=[
        gr.Markdown(label="π Summary"),
        gr.Dataframe(label="π LO-wise Comparison Table"),
        gr.Plot(label="π LO Match Chart"),
        gr.Textbox(label="π Preview of New Content")
    ],
    title="π AI Handout Comparator + LO Aligner",
    description="Compare two versions of handouts using both TF-IDF and Transformers. Analyze changes in content, alignment with Learning Outcomes, and Bloomβs taxonomy level."
)

iface.launch()
|