Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,20 +1,14 @@
|
|
| 1 |
|
| 2 |
-
|
| 3 |
import gradio as gr
|
| 4 |
from PyPDF2 import PdfReader
|
| 5 |
-
from pdf2image import convert_from_bytes
|
| 6 |
-
import pytesseract
|
| 7 |
-
from PIL import Image
|
| 8 |
import io
|
| 9 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 10 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 11 |
-
from transformers import pipeline
|
| 12 |
import matplotlib.pyplot as plt
|
| 13 |
import pandas as pd
|
| 14 |
-
|
|
|
|
| 15 |
|
| 16 |
-
# Load transformer model for semantic similarity
|
| 17 |
-
semantic_pipeline = pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2")
|
| 18 |
|
| 19 |
def extract_text_from_pdf(pdf_file):
|
| 20 |
try:
|
|
@@ -24,21 +18,12 @@ def extract_text_from_pdf(pdf_file):
|
|
| 24 |
text = page.extract_text()
|
| 25 |
if text:
|
| 26 |
full_text += text
|
| 27 |
-
|
| 28 |
-
return full_text
|
| 29 |
-
except Exception as e:
|
| 30 |
-
print("Text extraction failed:", e)
|
| 31 |
-
|
| 32 |
-
try:
|
| 33 |
-
images = convert_from_bytes(pdf_file)
|
| 34 |
-
text = ""
|
| 35 |
-
for img in images:
|
| 36 |
-
text += pytesseract.image_to_string(img)
|
| 37 |
-
return text
|
| 38 |
except Exception as e:
|
| 39 |
-
print("
|
| 40 |
return ""
|
| 41 |
|
|
|
|
| 42 |
def semantic_match(lo_list, content):
|
| 43 |
lo_texts = [lo for lo in lo_list if lo.strip()]
|
| 44 |
vectorizer = TfidfVectorizer().fit_transform([content] + lo_texts)
|
|
@@ -47,43 +32,31 @@ def semantic_match(lo_list, content):
|
|
| 47 |
scores = [cosine_similarity([content_vec], [vec])[0][0] for vec in vectors[1:]]
|
| 48 |
return scores
|
| 49 |
|
| 50 |
-
def compute_difference_and_text_change(old_text, new_text):
|
| 51 |
-
similarity = SequenceMatcher(None, old_text, new_text).ratio()
|
| 52 |
-
difference_percentage = round((1 - similarity) * 100, 2)
|
| 53 |
-
|
| 54 |
-
len_old = len(old_text.split())
|
| 55 |
-
len_new = len(new_text.split())
|
| 56 |
-
length_change = round(((len_new - len_old) / len_old) * 100, 2)
|
| 57 |
-
return difference_percentage, length_change
|
| 58 |
-
|
| 59 |
-
def transformer_similarity(text1, text2):
|
| 60 |
-
emb1 = semantic_pipeline(text1)[0][0]
|
| 61 |
-
emb2 = semantic_pipeline(text2)[0][0]
|
| 62 |
-
sim = cosine_similarity([emb1], [emb2])[0][0]
|
| 63 |
-
return round(sim * 100, 2)
|
| 64 |
|
| 65 |
def compare_all(old_pdf, new_pdf, lo_file):
|
| 66 |
try:
|
| 67 |
los = lo_file.decode("utf-8", errors="ignore").splitlines()
|
| 68 |
los = [lo.strip() for lo in los if lo.strip()]
|
| 69 |
except:
|
| 70 |
-
return "❌ Could not read learning outcomes file.", None, None
|
| 71 |
|
| 72 |
old_text = extract_text_from_pdf(old_pdf)
|
| 73 |
new_text = extract_text_from_pdf(new_pdf)
|
| 74 |
|
| 75 |
-
if not old_text
|
| 76 |
-
return "❌ Could not extract text from one or both PDFs.", None, None
|
| 77 |
|
| 78 |
old_scores = semantic_match(los, old_text)
|
| 79 |
new_scores = semantic_match(los, new_text)
|
| 80 |
|
| 81 |
labels = [f"LO{i+1}" for i in range(len(los))]
|
| 82 |
-
x =
|
|
|
|
|
|
|
| 83 |
fig, ax = plt.subplots()
|
| 84 |
-
ax.bar(x, old_scores, width=0.4, label="Old", align='center')
|
| 85 |
-
ax.bar(
|
| 86 |
-
ax.set_xticks(
|
| 87 |
ax.set_xticklabels(labels, rotation=45)
|
| 88 |
ax.set_ylabel("Semantic Match Score")
|
| 89 |
ax.set_title("Learning Outcomes Comparison")
|
|
@@ -92,35 +65,37 @@ def compare_all(old_pdf, new_pdf, lo_file):
|
|
| 92 |
# Table
|
| 93 |
data = {
|
| 94 |
"Learning Outcome": labels,
|
| 95 |
-
"Old Match": [round(s*100, 2) for s in old_scores],
|
| 96 |
-
"New Match": [round(s*100, 2) for s in new_scores],
|
| 97 |
-
"Change (%)": [round((new - old)*100, 2) for new, old in zip(new_scores, old_scores)]
|
| 98 |
}
|
| 99 |
df = pd.DataFrame(data)
|
| 100 |
|
| 101 |
-
#
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
)[0][0] * 100, 2)
|
| 106 |
|
| 107 |
-
|
| 108 |
-
|
|
|
|
|
|
|
| 109 |
|
| 110 |
summary = f"""
|
| 111 |
π **Summary of Comparison**
|
| 112 |
|
| 113 |
-
π **
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
|
|
|
| 117 |
|
| 118 |
🎯 **Learning Outcome Matches**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}
|
| 119 |
-
|
| 120 |
-
💬 **Tip**: Diff > 30% or word increase > 20% generally reflects real updates.
|
| 121 |
"""
|
| 122 |
|
| 123 |
-
return summary.strip(), df, fig
|
|
|
|
| 124 |
|
| 125 |
iface = gr.Interface(
|
| 126 |
fn=compare_all,
|
|
@@ -130,12 +105,14 @@ iface = gr.Interface(
|
|
| 130 |
gr.File(label="Learning Outcomes (Text File)", type='binary'),
|
| 131 |
],
|
| 132 |
outputs=[
|
| 133 |
-
gr.Textbox(label="
|
| 134 |
-
gr.Dataframe(label="
|
| 135 |
-
gr.Plot(label="π LO
|
|
|
|
| 136 |
],
|
| 137 |
-
title="π Handout Comparator + LO Analyzer
|
| 138 |
-
description="
|
| 139 |
)
|
| 140 |
|
| 141 |
iface.launch()
|
|
|
|
|
|
| 1 |
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
from PyPDF2 import PdfReader
|
|
|
|
|
|
|
|
|
|
| 4 |
import io
|
| 5 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 6 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
| 7 |
import matplotlib.pyplot as plt
|
| 8 |
import pandas as pd
|
| 9 |
+
import numpy as np
|
| 10 |
+
import re
|
| 11 |
|
|
|
|
|
|
|
| 12 |
|
| 13 |
def extract_text_from_pdf(pdf_file):
|
| 14 |
try:
|
|
|
|
| 18 |
text = page.extract_text()
|
| 19 |
if text:
|
| 20 |
full_text += text
|
| 21 |
+
return full_text.strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
except Exception as e:
|
| 23 |
+
print("PDF extraction error:", e)
|
| 24 |
return ""
|
| 25 |
|
| 26 |
+
|
| 27 |
def semantic_match(lo_list, content):
|
| 28 |
lo_texts = [lo for lo in lo_list if lo.strip()]
|
| 29 |
vectorizer = TfidfVectorizer().fit_transform([content] + lo_texts)
|
|
|
|
| 32 |
scores = [cosine_similarity([content_vec], [vec])[0][0] for vec in vectors[1:]]
|
| 33 |
return scores
|
| 34 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
def compare_all(old_pdf, new_pdf, lo_file):
|
| 37 |
try:
|
| 38 |
los = lo_file.decode("utf-8", errors="ignore").splitlines()
|
| 39 |
los = [lo.strip() for lo in los if lo.strip()]
|
| 40 |
except:
|
| 41 |
+
return "❌ Could not read learning outcomes file.", None, None, None
|
| 42 |
|
| 43 |
old_text = extract_text_from_pdf(old_pdf)
|
| 44 |
new_text = extract_text_from_pdf(new_pdf)
|
| 45 |
|
| 46 |
+
if not old_text or not new_text:
|
| 47 |
+
return "❌ Could not extract text from one or both PDFs.", None, None, None
|
| 48 |
|
| 49 |
old_scores = semantic_match(los, old_text)
|
| 50 |
new_scores = semantic_match(los, new_text)
|
| 51 |
|
| 52 |
labels = [f"LO{i+1}" for i in range(len(los))]
|
| 53 |
+
x = np.arange(len(labels))
|
| 54 |
+
|
| 55 |
+
# Plot
|
| 56 |
fig, ax = plt.subplots()
|
| 57 |
+
ax.bar(x - 0.2, old_scores, width=0.4, label="Old", align='center')
|
| 58 |
+
ax.bar(x + 0.2, new_scores, width=0.4, label="New", align='center')
|
| 59 |
+
ax.set_xticks(x)
|
| 60 |
ax.set_xticklabels(labels, rotation=45)
|
| 61 |
ax.set_ylabel("Semantic Match Score")
|
| 62 |
ax.set_title("Learning Outcomes Comparison")
|
|
|
|
| 65 |
# Table
|
| 66 |
data = {
|
| 67 |
"Learning Outcome": labels,
|
| 68 |
+
"Old Match (%)": [round(s * 100, 2) for s in old_scores],
|
| 69 |
+
"New Match (%)": [round(s * 100, 2) for s in new_scores],
|
| 70 |
+
"Change (%)": [round((new - old) * 100, 2) for new, old in zip(new_scores, old_scores)]
|
| 71 |
}
|
| 72 |
df = pd.DataFrame(data)
|
| 73 |
|
| 74 |
+
# Content similarity
|
| 75 |
+
tfidf = TfidfVectorizer().fit_transform([old_text, new_text])
|
| 76 |
+
cosine_sim = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0] * 100
|
| 77 |
+
content_diff = 100 - round(cosine_sim, 2)
|
|
|
|
| 78 |
|
| 79 |
+
# Text size change
|
| 80 |
+
len_old = len(re.findall(r'\w+', old_text))
|
| 81 |
+
len_new = len(re.findall(r'\w+', new_text))
|
| 82 |
+
word_change_percent = round(((len_new - len_old) / len_old) * 100, 2)
|
| 83 |
|
| 84 |
summary = f"""
|
| 85 |
π **Summary of Comparison**
|
| 86 |
|
| 87 |
+
π **Overall Content Change**: {content_diff:.2f}%
|
| 88 |
+
π This is based on TF-IDF cosine similarity between old and new handouts.
|
| 89 |
+
|
| 90 |
+
π **Text Length Difference**: {'+' if word_change_percent >= 0 else ''}{word_change_percent:.2f}%
|
| 91 |
+
Compared by total number of words in both handouts.
|
| 92 |
|
| 93 |
🎯 **Learning Outcome Matches**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}
|
| 94 |
+
✅ New handout appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with stated outcomes.
|
|
|
|
| 95 |
"""
|
| 96 |
|
| 97 |
+
return summary.strip(), df, fig, "✅ Comparison completed successfully."
|
| 98 |
+
|
| 99 |
|
| 100 |
iface = gr.Interface(
|
| 101 |
fn=compare_all,
|
|
|
|
| 105 |
gr.File(label="Learning Outcomes (Text File)", type='binary'),
|
| 106 |
],
|
| 107 |
outputs=[
|
| 108 |
+
gr.Textbox(label="π Summary & Insights", lines=20, max_lines=25),
|
| 109 |
+
gr.Dataframe(label="π LO-wise Comparison Table"),
|
| 110 |
+
gr.Plot(label="π LO Visual Comparison"),
|
| 111 |
+
gr.Textbox(label="ℹ️ Status", lines=1)
|
| 112 |
],
|
| 113 |
+
title="π Handout Comparator + LO Analyzer",
|
| 114 |
+
description="Upload OLD and NEW handouts in PDF format along with a TXT file of Learning Outcomes. The app compares content changes and evaluates alignment with LOs visually and in table format."
|
| 115 |
)
|
| 116 |
|
| 117 |
iface.launch()
|
| 118 |
+
|