Deevyankar committed on
Commit
28e421a
·
verified ·
1 Parent(s): 9a807cb

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +117 -0
app.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import fitz # PyMuPDF
3
+ import docx
4
+ import io
5
+ import re
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
+ from sentence_transformers import SentenceTransformer, util
8
+ import matplotlib.pyplot as plt
9
+ import numpy as np
10
+ from difflib import SequenceMatcher
11
+
12
# Sentence-embedding model, instantiated once at import time so that
# semantic_match() reuses the same instance on every call.
model = SentenceTransformer('all-MiniLM-L6-v2')
13
+
14
def extract_text_from_pdf(pdf_file) -> str:
    """Return the text of every page of a PDF, concatenated and stripped.

    Args:
        pdf_file: Raw PDF content (bytes), handed to ``fitz.open`` as
            ``stream``. ``None`` or empty input is treated as "no file".

    Returns:
        The page texts joined in page order with surrounding whitespace
        removed, or ``""`` when no data was given or the document could
        not be opened or read. This function is deliberately best-effort:
        callers treat an empty string as "nothing extracted".
    """
    if not pdf_file:
        return ""
    try:
        # The context manager closes the document even if a page fails to
        # render, which the manual close() call did not guarantee.
        with fitz.open(stream=pdf_file, filetype="pdf") as pdf_reader:
            text = "".join(page.get_text() for page in pdf_reader)
    except Exception:
        # Best-effort boundary: any parsing failure means "no text".
        return ""
    return text.strip()
24
+
25
def normalize_text(text: str) -> str:
    """Return *text* lower-cased with whitespace normalised.

    Leading and trailing whitespace is removed and every internal run of
    whitespace (spaces, tabs, newlines) is collapsed to a single space.
    """
    return re.sub(r'\s+', ' ', text.strip().lower())
27
+
28
def extract_text_from_docx(docx_file) -> list:
    """Return the non-empty paragraphs of a .docx file as stripped strings.

    Args:
        docx_file: Raw .docx content (bytes). ``None`` or empty input is
            treated as "no file".

    Returns:
        A list with one stripped string per non-blank paragraph, in
        document order, or ``[]`` when no data was given or the document
        could not be parsed. Best-effort by design: callers treat an empty
        list as "nothing found".
    """
    if not docx_file:
        return []
    try:
        doc = docx.Document(io.BytesIO(docx_file))
        stripped = (para.text.strip() for para in doc.paragraphs)
        return [line for line in stripped if line]
    except Exception:
        # Not a bare ``except:`` so KeyboardInterrupt/SystemExit still
        # propagate; any parsing failure means "no paragraphs".
        return []
38
+
39
def semantic_match(lo_list, content) -> list:
    """Score how closely each learning outcome matches *content*.

    Each entry of *lo_list* and the whole of *content* are embedded with
    the module-level ``model``; the score is their cosine similarity
    rounded to two decimals.

    Args:
        lo_list: Iterable of learning-outcome strings.
        content: The document text to compare against.

    Returns:
        A list of floats, one per learning outcome and in the same order.
        An outcome whose scoring fails gets ``0.0``; if the content itself
        cannot be embedded, every outcome gets ``0.0``.

    NOTE(review): sentence-transformers models truncate inputs longer than
    their maximum sequence length, so a long document is presumably scored
    on its opening portion only - confirm, and consider chunking.
    """
    lo_list = list(lo_list)
    if not lo_list:
        return []

    # The content embedding does not depend on the outcome, so compute it
    # once instead of once per learning outcome.
    try:
        content_embed = model.encode(content, convert_to_tensor=True)
    except Exception:
        return [0.0] * len(lo_list)

    scores = []
    for lo in lo_list:
        try:
            lo_embed = model.encode(lo, convert_to_tensor=True)
            sim = util.pytorch_cos_sim(lo_embed, content_embed).item()
            scores.append(round(sim, 2))
        except Exception:
            # Keep one score per outcome so the result stays aligned
            # with lo_list.
            scores.append(0.0)
    return scores
50
+
51
def content_change_score(text1, text2) -> float:
    """Estimate how much two texts differ, as a percentage.

    Both texts are passed through ``normalize_text`` and compared with
    ``difflib.SequenceMatcher``; the result is ``(1 - ratio) * 100``
    rounded to two decimals, so 0.0 means identical and 100.0 means
    nothing in common.

    Returns:
        The change percentage, or ``100.0`` when either argument is not
        text (for example ``None``).

    NOTE(review): SequenceMatcher is used with its default ``autojunk``
    heuristic, which changes matching on sequences of 200+ items; the
    reported percentage on long documents may be affected - confirm this
    is acceptable.
    """
    try:
        sim = SequenceMatcher(
            None, normalize_text(text1), normalize_text(text2)
        ).ratio()
    except (AttributeError, TypeError):
        # Non-string input cannot be normalised or compared; report it as
        # a complete change, as before.
        return 100.0
    return round((1 - sim) * 100, 2)
57
+
58
# Minimum number of extracted characters for a PDF to count as readable.
_MIN_EXTRACTED_CHARS = 200


def _plot_lo_scores(old_scores, new_scores):
    """Build a grouped bar chart of per-outcome scores (old vs new)."""
    x = np.arange(len(old_scores))
    width = 0.35
    fig, ax = plt.subplots()
    ax.bar(x - width / 2, old_scores, width, label='Old')
    ax.bar(x + width / 2, new_scores, width, label='New')
    ax.set_ylabel('Match Score (0-1)')
    ax.set_title('LO-wise Match Score: Old vs New')
    ax.set_xticks(x)
    ax.set_xticklabels(
        [f"LO{i}" for i in range(1, len(old_scores) + 1)], rotation=45
    )
    ax.legend()
    # Lay out this figure explicitly rather than pyplot's global "current
    # figure", which is shared state between concurrent requests.
    fig.tight_layout()
    # Unregister the figure from pyplot so figures do not accumulate over
    # the lifetime of the server; the Figure object itself stays usable.
    plt.close(fig)
    return fig


def compare_handouts(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs against a list of learning outcomes.

    Args:
        old_pdf: Raw bytes of the old handout PDF.
        new_pdf: Raw bytes of the new handout PDF.
        lo_file: Raw bytes of a .docx file; each non-empty paragraph is
            treated as one learning outcome.

    Returns:
        A ``(summary, figure)`` tuple. ``summary`` is the text report and
        ``figure`` a matplotlib bar chart of per-outcome scores. When a
        PDF yields fewer than 200 characters of text, or no learning
        outcomes are found, ``summary`` is a warning and ``figure`` is
        ``None``.
    """
    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)

    if (len(old_text.strip()) < _MIN_EXTRACTED_CHARS
            or len(new_text.strip()) < _MIN_EXTRACTED_CHARS):
        return "⚠️ Could not extract meaningful content from one or both PDFs.", None

    lo_list = extract_text_from_docx(lo_file)
    if not lo_list:
        return "⚠️ No learning outcomes detected.", None

    old_scores = semantic_match(lo_list, old_text)
    new_scores = semantic_match(lo_list, new_text)

    change_percent = content_change_score(old_text, new_text)
    # "Improved" means strictly higher in the new handout; "matched" also
    # counts outcomes whose score did not drop.
    improved_count = sum(n > o for n, o in zip(new_scores, old_scores))
    matched_los = sum(n >= o for n, o in zip(new_scores, old_scores))

    summary = f"πŸ“ˆ Content Change Estimate: {change_percent}%\n"
    summary += f"🧠 LO Alignment: {matched_los} of {len(lo_list)} learning outcomes matched\n"
    if improved_count > 0:
        summary += "🟒 Summary: New handout has improved structure and added clarity."
    else:
        summary += "⚠️ Summary: No significant improvement in LO alignment."

    return summary, _plot_lo_scores(old_scores, new_scores)
97
+
98
# Gradio UI: three file uploads (delivered to the handler as raw bytes),
# a submit and a clear button, and a text + plot output.
with gr.Blocks() as demo:
    gr.Markdown("πŸ“˜ **Educational Content Comparator**")
    gr.Markdown("Upload 2 handouts and a .docx file of Learning Outcomes to compare changes and alignment.")

    with gr.Row():
        old_pdf = gr.File(label="πŸ“‚ Upload Old PDF", file_types=[".pdf"], type="binary")
        new_pdf = gr.File(label="πŸ“‚ Upload New PDF", file_types=[".pdf"], type="binary")
        lo_file = gr.File(label="πŸ“‚ Upload Learning Outcomes (.docx)", file_types=[".docx"], type="binary")

    with gr.Row():
        btn = gr.Button("Submit")
        clear_btn = gr.Button("Clear")

    output_text = gr.Textbox(label="πŸ“‹ Summary", lines=5, interactive=False)
    output_plot = gr.Plot(label="πŸ“Š LO Match Chart")

    # Event handlers are registered inside the Blocks context.
    btn.click(fn=compare_handouts, inputs=[old_pdf, new_pdf, lo_file], outputs=[output_text, output_plot])
    # Clear resets the two outputs only; the uploaded files are left as-is.
    clear_btn.click(fn=lambda: ("", None), inputs=[], outputs=[output_text, output_plot])

demo.launch()