Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,131 +1,92 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
-
import
|
| 3 |
-
import
|
| 4 |
-
import io
|
| 5 |
-
import re
|
| 6 |
-
import os
|
| 7 |
import matplotlib.pyplot as plt
|
| 8 |
-
import numpy as np
|
| 9 |
import pandas as pd
|
| 10 |
-
|
| 11 |
-
from
|
| 12 |
-
from
|
| 13 |
|
| 14 |
-
model = SentenceTransformer(
|
| 15 |
|
| 16 |
-
def extract_text_from_pdf(
|
| 17 |
try:
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
text += page.get_text()
|
| 22 |
-
pdf_reader.close()
|
| 23 |
-
return text.strip()
|
| 24 |
-
except Exception as e:
|
| 25 |
return ""
|
| 26 |
|
| 27 |
-
def
|
| 28 |
-
return re.sub(r'\s+', ' ', text.strip().lower())
|
| 29 |
-
|
| 30 |
-
def extract_text_from_docx(docx_file):
|
| 31 |
-
try:
|
| 32 |
-
doc = docx.Document(io.BytesIO(docx_file))
|
| 33 |
-
full_text = []
|
| 34 |
-
for para in doc.paragraphs:
|
| 35 |
-
if para.text.strip():
|
| 36 |
-
full_text.append(para.text.strip())
|
| 37 |
-
return full_text
|
| 38 |
-
except:
|
| 39 |
-
return []
|
| 40 |
-
|
| 41 |
-
def semantic_match(lo_list, content):
|
| 42 |
scores = []
|
| 43 |
-
for lo in
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
scores.append(round(sim, 2))
|
| 49 |
-
except:
|
| 50 |
-
scores.append(0.0)
|
| 51 |
return scores
|
| 52 |
|
| 53 |
-
def content_change_score(text1, text2):
|
| 54 |
-
try:
|
| 55 |
-
sim = SequenceMatcher(None, normalize_text(text1), normalize_text(text2)).ratio()
|
| 56 |
-
return round((1 - sim) * 100, 2)
|
| 57 |
-
except:
|
| 58 |
-
return 100.0
|
| 59 |
-
|
| 60 |
def compare_handouts(old_pdf, new_pdf, lo_file):
|
| 61 |
old_text = extract_text_from_pdf(old_pdf)
|
| 62 |
new_text = extract_text_from_pdf(new_pdf)
|
| 63 |
|
| 64 |
-
if
|
| 65 |
-
return "
|
| 66 |
|
| 67 |
-
|
| 68 |
-
if
|
| 69 |
-
return "β οΈ No learning outcomes detected.", None, None
|
| 70 |
|
| 71 |
old_scores = semantic_match(lo_list, old_text)
|
| 72 |
new_scores = semantic_match(lo_list, new_text)
|
| 73 |
|
| 74 |
-
|
| 75 |
-
improved_count = sum([n > o for n, o in zip(new_scores, old_scores)])
|
| 76 |
-
matched_los = sum([n >= o for n, o in zip(new_scores, old_scores)])
|
| 77 |
-
|
| 78 |
-
summary = f"π Content Change Estimate: {change_percent}%\n"
|
| 79 |
-
summary += f"π§ LO Alignment: {matched_los} of {len(lo_list)} learning outcomes matched\n"
|
| 80 |
-
if improved_count > 0:
|
| 81 |
-
summary += "π’ Summary: New handout has improved structure and added clarity."
|
| 82 |
-
else:
|
| 83 |
-
summary += "β οΈ Summary: No significant improvement in LO alignment."
|
| 84 |
-
|
| 85 |
-
# Create comparison table
|
| 86 |
df = pd.DataFrame({
|
| 87 |
-
"Learning Outcome":
|
| 88 |
"Old Match Score": old_scores,
|
| 89 |
"New Match Score": new_scores,
|
| 90 |
-
"Improvement":
|
| 91 |
})
|
| 92 |
-
#excel_path = "D:/result/LO_Comparison_Report.xlsx"
|
| 93 |
-
excel_path = "/mnt/data/LO_Comparison_Report.xlsx"
|
| 94 |
-
df.to_excel(excel_path, index=False)
|
| 95 |
|
| 96 |
-
|
| 97 |
x = np.arange(len(lo_list))
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
ax.bar(x - width/2, old_scores, width, label='Old')
|
| 101 |
-
ax.bar(x + width/2, new_scores, width, label='New')
|
| 102 |
-
ax.set_ylabel('Match Score (0-1)')
|
| 103 |
-
ax.set_title('LO-wise Match Score: Old vs New')
|
| 104 |
ax.set_xticks(x)
|
| 105 |
-
ax.set_xticklabels([f"LO{i+1}" for i in range(len(lo_list))]
|
|
|
|
|
|
|
|
|
|
| 106 |
ax.legend()
|
| 107 |
plt.tight_layout()
|
| 108 |
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
|
|
|
| 114 |
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
with gr.Row():
|
| 121 |
-
btn = gr.Button("Submit")
|
| 122 |
-
clear_btn = gr.Button("Clear")
|
| 123 |
-
|
| 124 |
-
output_text = gr.Textbox(label="π Summary", lines=5, interactive=False)
|
| 125 |
-
output_plot = gr.Plot(label="π LO Match Chart")
|
| 126 |
-
output_excel = gr.File(label="π Download Excel Report")
|
| 127 |
|
| 128 |
-
|
| 129 |
-
clear_btn.click(fn=lambda: ("", None, None), inputs=[], outputs=[output_text, output_plot, output_excel])
|
| 130 |
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
from PyPDF2 import PdfReader
|
| 3 |
+
from sentence_transformers import SentenceTransformer, util
|
|
|
|
|
|
|
|
|
|
| 4 |
import matplotlib.pyplot as plt
|
|
|
|
| 5 |
import pandas as pd
|
| 6 |
+
import numpy as np
|
| 7 |
+
from io import BytesIO
|
| 8 |
+
from tempfile import NamedTemporaryFile
|
| 9 |
|
| 10 |
+
# Sentence-embedding model, loaded once at import time and shared by
# semantic_match() and compare_handouts() for every similarity score.
model = SentenceTransformer("all-MiniLM-L6-v2")
|
| 11 |
|
| 12 |
+
def extract_text_from_pdf(pdf_bytes):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        The concatenated page text. A page whose ``extract_text()`` yields a
        falsy value contributes an empty string. If anything goes wrong while
        opening or reading the document, an empty string is returned instead
        of raising.
    """
    try:
        document = PdfReader(BytesIO(pdf_bytes))
        page_texts = []
        for page in document.pages:
            extracted = page.extract_text()
            page_texts.append(extracted if extracted else "")
        return "\n".join(page_texts)
    except Exception:
        # Best-effort: callers treat "" as "could not extract text".
        return ""
|
| 18 |
|
| 19 |
+
def semantic_match(lo_texts, content):
    """Score how closely each learning outcome matches a body of text.

    Args:
        lo_texts: Iterable of learning-outcome strings.
        content: The document text every outcome is compared against.

    Returns:
        A list of cosine-similarity floats, one per entry of ``lo_texts`` and
        in the same order. Empty when ``lo_texts`` is empty.
    """
    lo_texts = list(lo_texts)
    if not lo_texts:
        # Nothing to score, so skip encoding the document altogether.
        return []

    # The document embedding does not depend on the outcome being scored, so
    # compute it once instead of re-encoding it on every iteration.
    content_emb = model.encode(content, convert_to_tensor=True)

    return [
        util.pytorch_cos_sim(
            model.encode(lo, convert_to_tensor=True), content_emb
        ).item()
        for lo in lo_texts
    ]
|
| 27 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
def compare_handouts(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs against a list of learning outcomes.

    Args:
        old_pdf: Raw bytes of the old handout PDF.
        new_pdf: Raw bytes of the new handout PDF.
        lo_file: Learning outcomes, one per line, given either as UTF-8
            ``bytes`` or as a file-like object whose ``read()`` returns
            ``bytes`` or ``str``.

    Returns:
        A ``(summary, figure, excel_path)`` tuple: the text summary, a
        matplotlib figure with old/new scores per outcome, and the path of an
        Excel report written to a temporary file. When text cannot be
        extracted from a PDF, or no learning outcomes are supplied, returns
        ``(message, None, None)``.
    """
    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)

    if not old_text.strip() or not new_text.strip():
        return "β Could not extract text from one or both PDFs.", None, None

    lo_list = _read_learning_outcomes(lo_file)
    if not lo_list:
        # Without outcomes there is nothing to score, chart or report.
        return "No learning outcomes detected.", None, None

    old_scores = semantic_match(lo_list, old_text)
    new_scores = semantic_match(lo_list, new_text)

    improvement = np.array(new_scores) - np.array(old_scores)

    df = pd.DataFrame({
        "Learning Outcome": lo_list,
        "Old Match Score": old_scores,
        "New Match Score": new_scores,
        "Improvement": improvement,
    })

    # Grouped bar chart: old and new score side by side for each outcome.
    fig, ax = plt.subplots(figsize=(10, 4))
    x = np.arange(len(lo_list))
    ax.bar(x - 0.2, old_scores, width=0.4, label="Old")
    ax.bar(x + 0.2, new_scores, width=0.4, label="New")
    ax.set_xticks(x)
    ax.set_xticklabels([f"LO{i+1}" for i in range(len(lo_list))])
    ax.set_ylim(0, 1)
    ax.set_ylabel("Semantic Similarity")
    ax.set_title("LO Match Comparison")
    ax.legend()
    plt.tight_layout()

    # Content change = 100 * (1 - cosine similarity of the two documents).
    doc_similarity = util.pytorch_cos_sim(
        model.encode(old_text, convert_to_tensor=True),
        model.encode(new_text, convert_to_tensor=True),
    ).item()
    content_change = (1 - doc_similarity) * 100

    # An outcome counts as matched when its NEW score reaches 0.6.
    matched = sum(1 for score in new_scores if score >= 0.6)

    summary = f"π Content Change Estimate: {content_change:.2f}%\n"
    summary += f"π§ LO Alignment: {matched} of {len(lo_list)} learning outcomes matched\n"
    # NOTE(review): this verdict is appended unconditionally, even when no
    # score improved -- confirm that is intended.
    summary += "π’ Summary: New handout has improved structure and added clarity."

    # Reserve a temp path, then close the handle before pandas writes to it.
    # delete=False keeps the file on disk so it can be served afterwards.
    with NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        excel_path = temp_file.name
    df.to_excel(excel_path, index=False)

    return summary, fig, excel_path


def _read_learning_outcomes(lo_file):
    """Return the stripped, non-blank lines of the learning-outcomes upload."""
    if lo_file is None:
        return []

    # Raw bytes are used as-is; anything else is treated as a file-like object.
    if isinstance(lo_file, (bytes, bytearray)):
        raw = lo_file
    else:
        raw = lo_file.read()

    if isinstance(raw, (bytes, bytearray)):
        # utf-8-sig decodes plain UTF-8 and also drops a leading BOM, which
        # would otherwise end up inside the first learning outcome.
        raw = raw.decode("utf-8-sig")

    return [line.strip() for line in raw.splitlines() if line.strip()]
|
|
|
|
| 74 |
|
| 75 |
+
def _build_interface():
    """Assemble the Gradio UI that feeds three uploads to compare_handouts."""
    # Order matches compare_handouts(old_pdf, new_pdf, lo_file).
    upload_fields = [
        gr.File(label="π Upload OLD Handout PDF", type="binary"),
        gr.File(label="π Upload NEW Handout PDF", type="binary"),
        gr.File(label="π Upload Learning Outcomes (TXT)", type="file"),
    ]
    # Order matches the (summary, figure, excel_path) tuple it returns.
    result_fields = [
        gr.Textbox(label="π Summary & Insights"),
        gr.Plot(label="π Learning Outcome Match Chart"),
        gr.File(label="π₯ Download Excel Report"),
    ]
    return gr.Interface(
        fn=compare_handouts,
        inputs=upload_fields,
        outputs=result_fields,
        title="π Handout Comparator with LO Analysis",
        description="Upload old & new handouts + a list of learning outcomes to get content change %, alignment, and a downloadable report.",
    )


iface = _build_interface()

if __name__ == "__main__":
    iface.launch()
|