Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,12 +1,17 @@
|
|
| 1 |
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 4 |
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
| 5 |
import fitz # PyMuPDF
|
| 6 |
import docx
|
| 7 |
import matplotlib.pyplot as plt
|
| 8 |
import pandas as pd
|
| 9 |
|
|
|
|
|
|
|
|
|
|
| 10 |
def extract_text_from_pdf(file):
|
| 11 |
text = ""
|
| 12 |
try:
|
|
@@ -22,52 +27,45 @@ def extract_text_from_docx(file):
|
|
| 22 |
return "\n".join([para.text for para in doc.paragraphs])
|
| 23 |
|
| 24 |
def semantic_match(lo_texts, content):
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
similarities = cosine_similarity([content_vector], lo_vectors)[0]
|
| 30 |
return similarities.tolist()
|
| 31 |
|
| 32 |
def compare_handouts(old_pdf, new_pdf, lo_file):
|
| 33 |
-
# Extract text from handouts
|
| 34 |
old_text = extract_text_from_pdf(old_pdf)
|
| 35 |
new_text = extract_text_from_pdf(new_pdf)
|
| 36 |
if not old_text.strip() or not new_text.strip():
|
| 37 |
-
return "Could not extract text from one or both PDFs.", None, None
|
| 38 |
|
| 39 |
-
# Extract Learning Outcomes
|
| 40 |
lo_text = extract_text_from_docx(lo_file)
|
| 41 |
lo_list = [line.strip() for line in lo_text.split("\n") if line.strip()]
|
| 42 |
if not lo_list:
|
| 43 |
-
return "No learning outcomes detected.", None, None
|
| 44 |
|
| 45 |
-
# Match scores
|
| 46 |
old_scores = semantic_match(lo_list, old_text)
|
| 47 |
new_scores = semantic_match(lo_list, new_text)
|
| 48 |
|
| 49 |
-
# Calculate overall change
|
| 50 |
avg_old = sum(old_scores) / len(old_scores)
|
| 51 |
avg_new = sum(new_scores) / len(new_scores)
|
| 52 |
change = round(((avg_new - avg_old) / avg_old) * 100, 2) if avg_old != 0 else 100.0
|
| 53 |
|
| 54 |
-
# Summary
|
| 55 |
matched = sum([1 for o, n in zip(old_scores, new_scores) if n >= o])
|
| 56 |
summary = f"π Content Change: {change:.2f}%\nπ― Matched LOs: {matched} of {len(lo_list)}"
|
| 57 |
if change > 10:
|
| 58 |
summary += "\n🟢 New content appears more detailed and informative."
|
| 59 |
elif change < -10:
|
| 60 |
-
summary += "\nπ΄
|
| 61 |
else:
|
| 62 |
-
summary += "\nπ‘
|
| 63 |
|
| 64 |
-
# LO-wise chart and table
|
| 65 |
los = [f"LO{i+1}" for i in range(len(lo_list))]
|
| 66 |
percentage_change = [round(((n - o) / o) * 100, 2) if o else 100.0 for o, n in zip(old_scores, new_scores)]
|
| 67 |
df = pd.DataFrame({
|
| 68 |
"Learning Outcome": los,
|
| 69 |
-
"Old Score": old_scores,
|
| 70 |
-
"New Score": new_scores,
|
| 71 |
"% Change": percentage_change
|
| 72 |
})
|
| 73 |
|
|
@@ -83,7 +81,7 @@ def compare_handouts(old_pdf, new_pdf, lo_file):
|
|
| 83 |
plt.savefig(table_path, bbox_inches='tight', dpi=300)
|
| 84 |
plt.close()
|
| 85 |
|
| 86 |
-
#
|
| 87 |
fig, ax = plt.subplots(figsize=(10, 4))
|
| 88 |
bar_width = 0.35
|
| 89 |
index = range(len(los))
|
|
@@ -91,10 +89,10 @@ def compare_handouts(old_pdf, new_pdf, lo_file):
|
|
| 91 |
ax.bar([i + bar_width for i in index], new_scores, bar_width, label='New', alpha=0.7)
|
| 92 |
ax.set_xticks([i + bar_width / 2 for i in index])
|
| 93 |
ax.set_xticklabels(los)
|
| 94 |
-
ax.set_ylabel('Match
|
| 95 |
-
ax.set_title('
|
| 96 |
ax.legend()
|
| 97 |
-
chart_path = "/mnt/data/
|
| 98 |
plt.tight_layout()
|
| 99 |
plt.savefig(chart_path)
|
| 100 |
plt.close()
|
|
@@ -103,15 +101,15 @@ def compare_handouts(old_pdf, new_pdf, lo_file):
|
|
| 103 |
|
| 104 |
# Gradio interface
|
| 105 |
with gr.Blocks() as demo:
|
| 106 |
-
gr.Markdown("
|
| 107 |
with gr.Row():
|
| 108 |
-
old_pdf_input = gr.File(label="
|
| 109 |
-
new_pdf_input = gr.File(label="
|
| 110 |
-
lo_input = gr.File(label="
|
| 111 |
-
submit_btn = gr.Button("π Analyze
|
| 112 |
summary_output = gr.Textbox(label="Summary")
|
| 113 |
-
lo_table_output = gr.Image(label="π LO
|
| 114 |
-
lo_chart_output = gr.Image(label="π LO
|
| 115 |
|
| 116 |
submit_btn.click(fn=compare_handouts, inputs=[old_pdf_input, new_pdf_input, lo_input],
|
| 117 |
outputs=[summary_output, lo_table_output, lo_chart_output])
|
|
|
|
| 1 |
|
| 2 |
+
|
| 3 |
import gradio as gr
|
| 4 |
from sklearn.feature_extraction.text import TfidfVectorizer
|
| 5 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 6 |
+
from sentence_transformers import SentenceTransformer, util
|
| 7 |
import fitz # PyMuPDF
|
| 8 |
import docx
|
| 9 |
import matplotlib.pyplot as plt
|
| 10 |
import pandas as pd
|
| 11 |
|
| 12 |
+
# Load transformer model
|
| 13 |
+
model = SentenceTransformer("all-MiniLM-L6-v2")
|
| 14 |
+
|
| 15 |
def extract_text_from_pdf(file):
|
| 16 |
text = ""
|
| 17 |
try:
|
|
|
|
| 27 |
return "\n".join([para.text for para in doc.paragraphs])
|
| 28 |
|
| 29 |
def semantic_match(lo_texts, content):
|
| 30 |
+
embeddings = model.encode([content] + lo_texts, convert_to_tensor=True)
|
| 31 |
+
content_embedding = embeddings[0]
|
| 32 |
+
lo_embeddings = embeddings[1:]
|
| 33 |
+
similarities = util.pytorch_cos_sim(content_embedding, lo_embeddings)[0]
|
|
|
|
| 34 |
return similarities.tolist()
|
| 35 |
|
| 36 |
def compare_handouts(old_pdf, new_pdf, lo_file):
|
|
|
|
| 37 |
old_text = extract_text_from_pdf(old_pdf)
|
| 38 |
new_text = extract_text_from_pdf(new_pdf)
|
| 39 |
if not old_text.strip() or not new_text.strip():
|
| 40 |
+
return "β Could not extract text from one or both PDFs.", None, None
|
| 41 |
|
|
|
|
| 42 |
lo_text = extract_text_from_docx(lo_file)
|
| 43 |
lo_list = [line.strip() for line in lo_text.split("\n") if line.strip()]
|
| 44 |
if not lo_list:
|
| 45 |
+
return "⚠️ No learning outcomes detected in uploaded DOCX file.", None, None
|
| 46 |
|
|
|
|
| 47 |
old_scores = semantic_match(lo_list, old_text)
|
| 48 |
new_scores = semantic_match(lo_list, new_text)
|
| 49 |
|
|
|
|
| 50 |
avg_old = sum(old_scores) / len(old_scores)
|
| 51 |
avg_new = sum(new_scores) / len(new_scores)
|
| 52 |
change = round(((avg_new - avg_old) / avg_old) * 100, 2) if avg_old != 0 else 100.0
|
| 53 |
|
|
|
|
| 54 |
matched = sum([1 for o, n in zip(old_scores, new_scores) if n >= o])
|
| 55 |
summary = f"π Content Change: {change:.2f}%\nπ― Matched LOs: {matched} of {len(lo_list)}"
|
| 56 |
if change > 10:
|
| 57 |
summary += "\n🟢 New content appears more detailed and informative."
|
| 58 |
elif change < -10:
|
| 59 |
+
summary += "\n🔴 Content may have been reduced or simplified."
|
| 60 |
else:
|
| 61 |
+
summary += "\n🟡 Only minor updates detected."
|
| 62 |
|
|
|
|
| 63 |
los = [f"LO{i+1}" for i in range(len(lo_list))]
|
| 64 |
percentage_change = [round(((n - o) / o) * 100, 2) if o else 100.0 for o, n in zip(old_scores, new_scores)]
|
| 65 |
df = pd.DataFrame({
|
| 66 |
"Learning Outcome": los,
|
| 67 |
+
"Old Score": [round(s, 3) for s in old_scores],
|
| 68 |
+
"New Score": [round(s, 3) for s in new_scores],
|
| 69 |
"% Change": percentage_change
|
| 70 |
})
|
| 71 |
|
|
|
|
| 81 |
plt.savefig(table_path, bbox_inches='tight', dpi=300)
|
| 82 |
plt.close()
|
| 83 |
|
| 84 |
+
# Bar chart
|
| 85 |
fig, ax = plt.subplots(figsize=(10, 4))
|
| 86 |
bar_width = 0.35
|
| 87 |
index = range(len(los))
|
|
|
|
| 89 |
ax.bar([i + bar_width for i in index], new_scores, bar_width, label='New', alpha=0.7)
|
| 90 |
ax.set_xticks([i + bar_width / 2 for i in index])
|
| 91 |
ax.set_xticklabels(los)
|
| 92 |
+
ax.set_ylabel('Semantic Match (0-1)')
|
| 93 |
+
ax.set_title('Learning Outcome Comparison')
|
| 94 |
ax.legend()
|
| 95 |
+
chart_path = "/mnt/data/lo_score_chart.png"
|
| 96 |
plt.tight_layout()
|
| 97 |
plt.savefig(chart_path)
|
| 98 |
plt.close()
|
|
|
|
| 101 |
|
| 102 |
# Gradio interface
|
| 103 |
with gr.Blocks() as demo:
|
| 104 |
+
gr.Markdown("## π§ Transformer-Based Course Content Comparator")
|
| 105 |
with gr.Row():
|
| 106 |
+
old_pdf_input = gr.File(label="π Old Handout (PDF)", file_types=[".pdf"])
|
| 107 |
+
new_pdf_input = gr.File(label="π New Handout (PDF)", file_types=[".pdf"])
|
| 108 |
+
lo_input = gr.File(label="π Learning Outcomes (DOCX)", file_types=[".docx"])
|
| 109 |
+
submit_btn = gr.Button("π Analyze")
|
| 110 |
summary_output = gr.Textbox(label="Summary")
|
| 111 |
+
lo_table_output = gr.Image(label="π LO Change Table")
|
| 112 |
+
lo_chart_output = gr.Image(label="π LO Match Chart")
|
| 113 |
|
| 114 |
submit_btn.click(fn=compare_handouts, inputs=[old_pdf_input, new_pdf_input, lo_input],
|
| 115 |
outputs=[summary_output, lo_table_output, lo_chart_output])
|