# app.py — AI Handout Comparator + LO Aligner (Gradio Space)
import io
import re

import gradio as gr
import matplotlib.pyplot as plt
import pandas as pd
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Load transformer model for semantic similarity.
# Small paraphrase model fetched from the Hugging Face hub on first run;
# loaded once at import time and shared by all requests.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
def extract_text_from_pdf(file_bytes):
    """Return the concatenated text of all pages of a PDF given as raw bytes.

    Pages with no extractable text contribute an empty string. On any parse
    or extraction failure the error is printed and "" is returned.
    """
    try:
        pages = PdfReader(io.BytesIO(file_bytes)).pages
        chunks = [page.extract_text() or "" for page in pages]
        return " ".join(chunks).strip()
    except Exception as e:
        print("Error extracting text:", e)
        return ""
def tfidf_similarity(text1, text2):
    """Lexical cosine similarity between the TF-IDF vectors of two texts."""
    tfidf = TfidfVectorizer().fit_transform([text1, text2])
    first_row, second_row = tfidf[0:1], tfidf[1:2]
    return cosine_similarity(first_row, second_row)[0][0]
def transformer_similarity(text1, text2):
    """Semantic cosine similarity between two texts via sentence embeddings."""
    vecs = model.encode([text1, text2], convert_to_tensor=True)
    return util.pytorch_cos_sim(vecs[0], vecs[1]).item()
def bloom_level(term):
    """Classify a learning-outcome statement into a Bloom's-taxonomy level.

    The statement is scanned for characteristic verbs of each level, checked
    from lowest ("Remember") to highest ("Create"); the first level with a
    matching verb wins. Verbs are matched as word *prefixes* ("defines"
    matches "define") rather than raw substrings, so e.g. "because" no
    longer spuriously matches the "use" keyword.

    Args:
        term: Free-text learning-outcome statement.

    Returns:
        The capitalized level name, or "Unknown" when no verb matches.
    """
    # Tokenize into lowercase alphabetic words; punctuation is ignored.
    words = re.findall(r"[a-z]+", term.lower())
    blooms = {
        "remember": ["define", "list", "recall", "identify"],
        "understand": ["explain", "describe", "summarize"],
        "apply": ["apply", "demonstrate", "use"],
        "analyze": ["analyze", "compare", "contrast"],
        "evaluate": ["evaluate", "judge", "critique"],
        "create": ["create", "design", "formulate"],
    }
    for level, keywords in blooms.items():
        # Prefix match keeps inflected forms ("analyzing", "uses") working.
        if any(word.startswith(kw) for kw in keywords for word in words):
            return level.capitalize()
    return "Unknown"
def lo_semantic_scores(los, content):
    """Semantic-similarity score of each learning outcome against *content*.

    Returns one float per entry of *los*, in the same order.
    """
    return [transformer_similarity(outcome, content) for outcome in los]
def compare_all(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs and score both against learning outcomes.

    Args:
        old_pdf: Raw bytes of the old handout PDF (Gradio ``type='binary'``).
        new_pdf: Raw bytes of the new handout PDF.
        lo_file: Learning outcomes as raw bytes or a file-like object,
            UTF-8 text, one outcome per line.

    Returns:
        Tuple of (markdown summary, per-LO DataFrame, matplotlib figure,
        preview of the new text) — or an error message followed by three
        ``None`` placeholders when an input cannot be read.
    """
    # Learning outcomes: one non-blank line each. Gradio may hand us either
    # raw bytes or a file-like object, so accept both.
    try:
        lo_content = lo_file.read().decode("utf-8", errors="ignore") if hasattr(lo_file, "read") else lo_file.decode("utf-8", errors="ignore")
        los = [line.strip() for line in lo_content.splitlines() if line.strip()]
    except Exception:
        return "β Could not read learning outcomes file.", None, None, None

    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)
    if not old_text or not new_text:
        return "β Could not extract text from one or both PDFs.", None, None, None

    # Whole-document comparison: lexical (TF-IDF) and semantic (transformer).
    tfidf_sim = tfidf_similarity(old_text, new_text)
    transformer_sim = transformer_similarity(old_text, new_text)
    # len(old_text) > 0 is guaranteed by the emptiness check above.
    text_growth = round(((len(new_text) - len(old_text)) / len(old_text)) * 100, 2)

    # Per-outcome semantic alignment against each version.
    old_scores = lo_semantic_scores(los, old_text)
    new_scores = lo_semantic_scores(los, new_text)

    # Grouped bar chart: old vs. new match score per learning outcome.
    labels = [f"LO{i+1}" for i in range(len(los))]
    x = range(len(labels))
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.bar(x, old_scores, width=0.4, label="Old", align='center')
    ax.bar([i + 0.4 for i in x], new_scores, width=0.4, label="New", align='center')
    ax.set_xticks([i + 0.2 for i in x])
    ax.set_xticklabels(labels, rotation=45)
    ax.set_ylabel("Semantic Match Score")
    ax.set_title("Learning Outcomes Comparison")
    ax.legend()
    # Detach the figure from pyplot's global registry so repeated requests in
    # this long-running app don't leak figures; Gradio can still render the
    # returned figure object.
    plt.close(fig)

    data = {
        "Learning Outcome": labels,
        "LO Text": los,
        "Bloom Level": [bloom_level(lo) for lo in los],
        "Old Match": [round(s*100, 2) for s in old_scores],
        "New Match": [round(s*100, 2) for s in new_scores],
        "Change (%)": [round((n - o)*100, 2) for n, o in zip(new_scores, old_scores)]
    }
    df = pd.DataFrame(data)

    summary = f"""π **Summary of Comparison**
π **TF-IDF Content Change**: {round((1 - tfidf_sim) * 100, 2)}%
π§ **Transformer-based Similarity**: {round(transformer_sim * 100, 2)}%
π **Content Length Change**: {text_growth}% {"π Reduced" if text_growth < 0 else "π Increased"}
π― **LO Matches**: {sum(1 for score in new_scores if score > 0.5)} of {len(los)}
π **Content appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with learning outcomes.**
"""
    return summary, df, fig, new_text[:2000] + "..."
# Gradio UI: two PDF uploads plus a TXT of learning outcomes in;
# summary, per-LO table, chart, and text preview out.
# (The redundant second `import gradio as gr` was removed — the module is
# already imported at the top of the file.)
iface = gr.Interface(
    fn=compare_all,
    inputs=[
        gr.File(label="Old Handout PDF", type='binary'),
        gr.File(label="New Handout PDF", type='binary'),
        gr.File(label="Learning Outcomes (TXT)", type='binary')
    ],
    outputs=[
        gr.Markdown(label="π Summary"),
        gr.Dataframe(label="π LO-wise Comparison Table"),
        gr.Plot(label="π LO Match Chart"),
        gr.Textbox(label="π Preview of New Content")
    ],
    title="π AI Handout Comparator + LO Aligner",
    description="Compare two versions of handouts using both TF-IDF and Transformers. Analyze changes in content, alignment with Learning Outcomes, and Bloomβs taxonomy level."
)

iface.launch()