Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,16 +1,13 @@
|
|
| 1 |
|
| 2 |
import gradio as gr
|
| 3 |
from PyPDF2 import PdfReader
|
| 4 |
-
from pdf2image import convert_from_bytes
|
| 5 |
-
import pytesseract
|
| 6 |
-
from PIL import Image
|
| 7 |
import io
|
| 8 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 9 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
| 10 |
import matplotlib.pyplot as plt
|
|
|
|
| 11 |
from sentence_transformers import SentenceTransformer, util
|
| 12 |
|
| 13 |
-
|
|
|
|
| 14 |
|
| 15 |
def extract_text_from_pdf(pdf_file):
|
| 16 |
try:
|
|
@@ -20,35 +17,22 @@ def extract_text_from_pdf(pdf_file):
|
|
| 20 |
text = page.extract_text()
|
| 21 |
if text:
|
| 22 |
full_text += text
|
| 23 |
-
|
| 24 |
-
return full_text
|
| 25 |
except Exception as e:
|
| 26 |
print("Text extraction failed:", e)
|
| 27 |
-
|
| 28 |
-
try:
|
| 29 |
-
images = convert_from_bytes(pdf_file)
|
| 30 |
-
text = ""
|
| 31 |
-
for img in images:
|
| 32 |
-
text += pytesseract.image_to_string(img)
|
| 33 |
-
return text
|
| 34 |
-
except Exception as e:
|
| 35 |
-
print("OCR failed:", e)
|
| 36 |
return ""
|
| 37 |
|
| 38 |
-
def
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
cos_sim = cosine_similarity(vectors[0], vectors[1])[0][0]
|
| 50 |
-
change_percentage = round((1 - cos_sim) * 100, 2)
|
| 51 |
-
return change_percentage
|
| 52 |
|
| 53 |
def compare_all(old_pdf, new_pdf, lo_file):
|
| 54 |
try:
|
|
@@ -63,8 +47,8 @@ def compare_all(old_pdf, new_pdf, lo_file):
|
|
| 63 |
if not old_text.strip() or not new_text.strip():
|
| 64 |
return "β Could not extract text from one or both PDFs.", None, None
|
| 65 |
|
| 66 |
-
old_scores =
|
| 67 |
-
new_scores =
|
| 68 |
|
| 69 |
labels = [f"LO{i+1}" for i in range(len(los))]
|
| 70 |
x = range(len(labels))
|
|
@@ -77,35 +61,22 @@ def compare_all(old_pdf, new_pdf, lo_file):
|
|
| 77 |
ax.set_title("Learning Outcomes Comparison")
|
| 78 |
ax.legend()
|
| 79 |
|
| 80 |
-
|
| 81 |
"Learning Outcome": labels,
|
| 82 |
"Old Match": [round(s*100, 2) for s in old_scores],
|
| 83 |
"New Match": [round(s*100, 2) for s in new_scores],
|
| 84 |
-
"Change (%)": [round((
|
| 85 |
-
}
|
| 86 |
-
df = pd.DataFrame(data)
|
| 87 |
|
| 88 |
-
|
| 89 |
-
|
| 90 |
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
summary = f"""π **Summary of Comparison**
|
| 99 |
-
|
| 100 |
-
π **Overall Content Change**: {change_percentage}%
|
| 101 |
-
π This percentage shows how much the content has changed from old to new handouts using TF-IDF vector similarity (cosine distance).
|
| 102 |
-
|
| 103 |
-
π― **Learning Outcome Matches**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}
|
| 104 |
-
π§ The system checks how well the new content aligns with each LO (threshold: 0.5 semantic match).
|
| 105 |
-
|
| 106 |
-
π’ **Insight**: New content appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with the learning outcomes.
|
| 107 |
-
|
| 108 |
-
π‘ **Note**: Content change > 40% usually indicates significant modifications (new examples, topics, etc.). Review the table and chart below for LO-specific changes."""
|
| 109 |
|
| 110 |
return summary, df, fig
|
| 111 |
|
|
@@ -117,12 +88,13 @@ iface = gr.Interface(
|
|
| 117 |
gr.File(label="Learning Outcomes (Text File)", type='binary'),
|
| 118 |
],
|
| 119 |
outputs=[
|
| 120 |
-
gr.Textbox(label="Summary & Insights", lines=
|
| 121 |
gr.Dataframe(label="LO-wise Comparison Table"),
|
| 122 |
gr.Plot(label="Visual Comparison Chart")
|
| 123 |
],
|
| 124 |
-
title="π Handout Comparator + LO Analysis",
|
| 125 |
-
description="Upload old/new handouts + Learning Outcomes
|
| 126 |
)
|
| 127 |
|
| 128 |
-
iface.launch()
|
|
|
|
|
|
| 1 |
|
| 2 |
import gradio as gr
|
| 3 |
from PyPDF2 import PdfReader
|
|
|
|
|
|
|
|
|
|
| 4 |
import io
|
|
|
|
|
|
|
| 5 |
import matplotlib.pyplot as plt
|
| 6 |
+
import pandas as pd
|
| 7 |
from sentence_transformers import SentenceTransformer, util
|
| 8 |
|
| 9 |
+
# Load sentence transformer model
|
| 10 |
+
model = SentenceTransformer("all-MiniLM-L6-v2")
|
| 11 |
|
| 12 |
def extract_text_from_pdf(pdf_file):
|
| 13 |
try:
|
|
|
|
| 17 |
text = page.extract_text()
|
| 18 |
if text:
|
| 19 |
full_text += text
|
| 20 |
+
return full_text
|
|
|
|
| 21 |
except Exception as e:
|
| 22 |
print("Text extraction failed:", e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
return ""
|
| 24 |
|
| 25 |
+
def semantic_similarity_score(text1, text2):
    """Return the semantic cosine similarity between two texts.

    Encodes both texts with the module-level SentenceTransformer ``model``
    and compares the resulting embeddings, yielding a float (roughly in
    [-1, 1], typically [0, 1] for natural-language inputs).
    """
    emb = model.encode([text1, text2], convert_to_tensor=True)
    # util.pytorch_cos_sim returns a 1x1 tensor; .item() unwraps the float.
    return util.pytorch_cos_sim(emb[0], emb[1]).item()
|
| 29 |
+
|
| 30 |
+
def semantic_match_los(lo_list, content):
    """Score how well *content* matches each learning outcome.

    Parameters
    ----------
    lo_list : iterable of str
        Learning-outcome statements to score against the content.
    content : str
        The document text to compare each LO with.

    Returns
    -------
    list of float
        One semantic-similarity score per LO, in the same order as
        ``lo_list``. Empty input yields an empty list.
    """
    # Comprehension replaces the manual append loop (same order, same values).
    return [semantic_similarity_score(lo, content) for lo in lo_list]
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
def compare_all(old_pdf, new_pdf, lo_file):
|
| 38 |
try:
|
|
|
|
| 47 |
if not old_text.strip() or not new_text.strip():
|
| 48 |
return "β Could not extract text from one or both PDFs.", None, None
|
| 49 |
|
| 50 |
+
old_scores = semantic_match_los(los, old_text)
|
| 51 |
+
new_scores = semantic_match_los(los, new_text)
|
| 52 |
|
| 53 |
labels = [f"LO{i+1}" for i in range(len(los))]
|
| 54 |
x = range(len(labels))
|
|
|
|
| 61 |
ax.set_title("Learning Outcomes Comparison")
|
| 62 |
ax.legend()
|
| 63 |
|
| 64 |
+
df = pd.DataFrame({
|
| 65 |
"Learning Outcome": labels,
|
| 66 |
"Old Match": [round(s*100, 2) for s in old_scores],
|
| 67 |
"New Match": [round(s*100, 2) for s in new_scores],
|
| 68 |
+
"Change (%)": [round((n - o)*100, 2) for o, n in zip(old_scores, new_scores)]
|
| 69 |
+
})
|
|
|
|
| 70 |
|
| 71 |
+
content_sim = semantic_similarity_score(old_text, new_text)
|
| 72 |
+
content_change_pct = round((1 - content_sim) * 100, 2)
|
| 73 |
|
| 74 |
+
summary = f"π **Summary of Comparison**\n\n"
|
| 75 |
+
summary += f"π **Overall Content Change**: {content_change_pct}%\n"
|
| 76 |
+
summary += f"π This percentage shows how much the content has changed using a semantic model (SentenceTransformer).\n\n"
|
| 77 |
+
summary += f"π― **Learning Outcome Matches**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}\n"
|
| 78 |
+
summary += f"π§ The system checks how well the new content aligns with each LO (threshold: 0.5 semantic match).\n\n"
|
| 79 |
+
summary += f"π’ **Insight**: New content appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with the learning outcomes.\n"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
return summary, df, fig
|
| 82 |
|
|
|
|
| 88 |
gr.File(label="Learning Outcomes (Text File)", type='binary'),
|
| 89 |
],
|
| 90 |
outputs=[
|
| 91 |
+
gr.Textbox(label="Summary & Insights", lines=10, max_lines=30),
|
| 92 |
gr.Dataframe(label="LO-wise Comparison Table"),
|
| 93 |
gr.Plot(label="Visual Comparison Chart")
|
| 94 |
],
|
| 95 |
+
title="π Handout Comparator + LO Analysis (Semantic Transformer)",
|
| 96 |
+
description="Upload old/new handouts + Learning Outcomes (TXT). Compares content and outcome match using a transformer model (MiniLM)."
|
| 97 |
)
|
| 98 |
|
| 99 |
+
iface.launch()
|
| 100 |
+
|