Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,109 +1,102 @@
|
|
| 1 |
-
|
| 2 |
import gradio as gr
|
| 3 |
from PyPDF2 import PdfReader
|
| 4 |
-
from
|
|
|
|
|
|
|
| 5 |
import matplotlib.pyplot as plt
|
| 6 |
import pandas as pd
|
| 7 |
import numpy as np
|
|
|
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
def extract_text_pdf(file_obj):
|
| 12 |
try:
|
| 13 |
-
|
| 14 |
text = ""
|
| 15 |
-
for page in
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
text += extracted + "\n"
|
| 19 |
-
return text if text.strip() else None
|
| 20 |
except:
|
| 21 |
return None
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
for lo in lo_list:
|
| 31 |
-
score = util.cos_sim(model.encode(lo, convert_to_tensor=True),
|
| 32 |
-
model.encode(text, convert_to_tensor=True))[0][0].item()
|
| 33 |
-
scores.append(round(score * 100, 2))
|
| 34 |
-
return scores
|
| 35 |
|
|
|
|
| 36 |
def compare_all(old_pdf, new_pdf, lo_file):
|
| 37 |
-
old_text =
|
| 38 |
-
new_text =
|
| 39 |
-
|
| 40 |
if not old_text or not new_text:
|
| 41 |
return "β Could not extract text from one or both PDFs.", None, None
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
elif change_percent < 40:
|
| 51 |
-
summary += "π Moderate update."
|
| 52 |
-
else:
|
| 53 |
-
summary += "π Significant changes detected."
|
| 54 |
-
|
| 55 |
-
# LO comparison
|
| 56 |
-
if hasattr(lo_file, 'read'):
|
| 57 |
-
lo_text = lo_file.read().decode("utf-8", errors="ignore")
|
| 58 |
-
else:
|
| 59 |
-
lo_text = lo_file # already a string
|
| 60 |
-
|
| 61 |
-
los = [line.strip() for line in lo_text.splitlines() if line.strip()]
|
| 62 |
-
if not los:
|
| 63 |
-
return summary + "\nβ οΈ No valid Learning Outcomes found.", None, None
|
| 64 |
-
|
| 65 |
-
old_scores = compare_with_los(old_text, los)
|
| 66 |
-
new_scores = compare_with_los(new_text, los)
|
| 67 |
-
score_diff = [round(new - old, 2) for old, new in zip(old_scores, new_scores)]
|
| 68 |
|
|
|
|
| 69 |
df = pd.DataFrame({
|
| 70 |
"Learning Outcome": los,
|
| 71 |
-
"Old Match (%)": old_scores,
|
| 72 |
-
"New Match (%)": new_scores,
|
| 73 |
-
"Change (%)":
|
| 74 |
})
|
| 75 |
-
table_html = df.to_html(index=False)
|
| 76 |
|
| 77 |
-
#
|
| 78 |
-
fig, ax = plt.subplots(figsize=(10,
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
ax.bar(
|
| 82 |
-
ax.bar(
|
| 83 |
-
ax.set_xlabel('Learning Outcomes')
|
| 84 |
ax.set_ylabel('Match Score (%)')
|
| 85 |
-
ax.set_title('LO-wise Semantic Match')
|
| 86 |
-
ax.set_xticks(
|
| 87 |
-
ax.set_xticklabels(
|
| 88 |
ax.legend()
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
-
return
|
| 92 |
|
|
|
|
| 93 |
iface = gr.Interface(
|
| 94 |
fn=compare_all,
|
| 95 |
inputs=[
|
| 96 |
-
gr.File(label="Old Handout
|
| 97 |
-
gr.File(label="New Handout
|
| 98 |
-
gr.File(label="Learning Outcomes (
|
| 99 |
],
|
| 100 |
outputs=[
|
| 101 |
-
gr.Textbox(label="Summary"),
|
| 102 |
-
gr.
|
| 103 |
-
gr.
|
| 104 |
],
|
| 105 |
-
title="π
|
| 106 |
-
description="
|
| 107 |
)
|
| 108 |
|
| 109 |
-
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
from PyPDF2 import PdfReader
|
| 3 |
+
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 4 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
| 5 |
+
from transformers import pipeline
|
| 6 |
import matplotlib.pyplot as plt
|
| 7 |
import pandas as pd
|
| 8 |
import numpy as np
|
| 9 |
+
import io
|
| 10 |
|
| 11 |
+
# Helper to extract text from PDF
def extract_text_from_pdf(file):
    """Extract the concatenated text of every page of a PDF.

    Parameters
    ----------
    file : file-like object or path accepted by ``PyPDF2.PdfReader``.

    Returns
    -------
    str or None
        The stripped text of all pages, or ``None`` when the PDF
        cannot be read at all.
    """
    try:
        reader = PdfReader(file)
        text = ""
        for page in reader.pages:
            # extract_text() may return None (e.g. image-only pages).
            text += page.extract_text() or ""
        return text.strip()
    except Exception:
        # Was a bare `except:`, which also swallows KeyboardInterrupt
        # and SystemExit; Exception is wide enough for any PyPDF2
        # parsing failure while letting interpreter exits propagate.
        return None
|
| 21 |
|
| 22 |
+
# Semantic match for each LO
def semantic_match(lo_list, content):
    """Score how well each learning outcome matches the document text.

    Fits a TF-IDF vectorizer over the document plus every outcome, then
    returns the cosine similarity between the document vector and each
    outcome vector, in the same order as ``lo_list``.
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([content] + lo_list)
    dense = tfidf_matrix.toarray()
    doc_vec, outcome_vecs = dense[0], dense[1:]
    sims = cosine_similarity([doc_vec], outcome_vecs)[0]
    return sims.tolist()
|
| 30 |
|
| 31 |
+
# Summarization pipeline, built once at import time so compare_all()
# can reuse it on every request (distilBART, a distilled news
# summarization model).
# NOTE(review): this downloads the model on first run — confirm the
# Space has it cached, otherwise startup will be slow.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
+
# Generate summary insight and visualization
def compare_all(old_pdf, new_pdf, lo_file):
    """Compare two syllabus PDFs against a list of learning outcomes.

    Parameters
    ----------
    old_pdf, new_pdf : file objects / paths accepted by ``PdfReader``.
    lo_file : uploaded learning-outcomes file; may be a binary
        file-like object or a plain string depending on the gradio
        File component configuration — both are handled.

    Returns
    -------
    tuple
        ``(summary_text, df, fig)`` on success, or
        ``(error_message, None, None)`` when extraction fails or no
        valid learning outcomes are found.
    """
    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)

    if not old_text or not new_text:
        return "β Could not extract text from one or both PDFs.", None, None

    # gradio may deliver the upload as a binary file object or as a
    # string, depending on the File component's `type`; handle both.
    if hasattr(lo_file, "read"):
        lo_text = lo_file.read().decode("utf-8", errors="ignore")
    else:
        lo_text = lo_file
    los = [lo.strip() for lo in lo_text.splitlines() if lo.strip()]
    if not los:
        # Guard: semantic_match on an empty list would crash downstream.
        return "No valid Learning Outcomes found in the uploaded file.", None, None

    # Semantic matching scores (0..1 per LO). Deltas are kept unrounded
    # here and rounded once at percent scale below — rounding to 2 dp on
    # the 0..1 scale first would quantize "Change (%)" to whole percents.
    old_scores = semantic_match(los, old_text)
    new_scores = semantic_match(los, new_text)
    changes = [new - old for old, new in zip(old_scores, new_scores)]

    # LO-wise comparison table
    df = pd.DataFrame({
        "Learning Outcome": los,
        "Old Match (%)": [round(score * 100, 2) for score in old_scores],
        "New Match (%)": [round(score * 100, 2) for score in new_scores],
        "Change (%)": [round(change * 100, 2) for change in changes],
    })

    # Grouped bar chart: old vs new match score per LO
    fig, ax = plt.subplots(figsize=(10, 5))
    x = np.arange(len(los))
    width = 0.35
    ax.bar(x - width / 2, df["Old Match (%)"], width, label='Old')
    ax.bar(x + width / 2, df["New Match (%)"], width, label='New')
    ax.set_ylabel('Match Score (%)')
    ax.set_title('LO-wise Semantic Match: Old vs New')
    ax.set_xticks(x)
    ax.set_xticklabels(los, rotation=45, ha='right')
    ax.legend()
    plt.tight_layout()

    # Whole-document similarity (TF-IDF cosine), plus an abstractive
    # summary of the new handout truncated to 2000 chars to stay within
    # the summarization model's input limit.
    content_vectorizer = TfidfVectorizer().fit_transform([old_text, new_text])
    content_sim = cosine_similarity(content_vectorizer)[0, 1]
    summary = summarizer(new_text[:2000], max_length=100, min_length=30, do_sample=False)[0]['summary_text']

    summary_text = (
        f"π Content Change Score: {round((1 - content_sim) * 100, 2)}%\n"
        f"π― Learning Outcomes Analyzed: {len(los)}\n"
        f"π’ Summary of New Content: {summary}"
    )

    return summary_text, df, fig
|
| 83 |
|
| 84 |
+
# Interface
# Gradio wiring: three file uploads in, (text, table, plot) out — the
# output component order matches compare_all's return tuple.
iface = gr.Interface(
    fn=compare_all,
    inputs=[
        gr.File(label="Old Handout PDF"),
        gr.File(label="New Handout PDF"),
        # NOTE(review): type='file' is deprecated/removed in newer
        # gradio releases (use 'filepath' or 'binary') — confirm the
        # pinned gradio version before changing, since compare_all
        # reads this upload with .read().
        gr.File(label="Learning Outcomes (Text File)", type='file'),
    ],
    outputs=[
        gr.Textbox(label="Summary & Insights"),
        gr.Dataframe(label="LO-wise Comparison Table"),
        gr.Plot(label="Visual Comparison Chart")
    ],
    title="π Syllabus Comparator with Learning Outcome Evaluation",
    description="Upload two syllabus handouts (old and new) and a file containing learning outcomes. Get LO-wise comparison, visual chart, and overall content insight."
)
|
| 100 |
|
| 101 |
+
# Launch the Gradio app only when run as a script (not when imported).
if __name__ == "__main__":
    iface.launch()
|