Deevyankar committed on
Commit
65fcd1d
·
verified ·
1 Parent(s): c6ee9d5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -81
app.py CHANGED
@@ -1,110 +1,98 @@
1
 
2
-
3
- import gradio as gr
4
- from sentence_transformers import SentenceTransformer, util
5
- import fitz # PyMuPDF
6
- import docx
7
  import matplotlib.pyplot as plt
8
- import io
9
- import base64
10
-
11
- model = SentenceTransformer("all-MiniLM-L6-v2")
 
 
12
 
13
- def extract_text_from_pdf(pdf_file):
 
14
  try:
15
- doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
16
  text = ""
17
- for page in doc:
18
- text += page.get_text()
19
- return text
20
- except Exception as e:
21
  return ""
22
 
23
- def extract_text_from_docx(file_obj):
24
- try:
25
- doc = docx.Document(file_obj)
26
- return "\n".join([para.text for para in doc.paragraphs if para.text.strip()])
27
- except Exception as e:
28
- return ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
 
30
- def match_learning_outcomes(lo_list, text):
31
- text_blocks = [para for para in text.split("\n") if len(para.strip()) > 20]
32
- text_embs = model.encode(text_blocks, convert_to_tensor=True)
33
- lo_embs = model.encode(lo_list, convert_to_tensor=True)
34
-
35
- lo_scores = []
36
- for i, lo in enumerate(lo_list):
37
- sims = util.cos_sim(lo_embs[i], text_embs)
38
- max_score = float(sims.max())
39
- lo_scores.append(max_score)
40
- return lo_scores
41
-
42
- def generate_similarity_chart(lo_list, old_scores, new_scores):
43
- fig, ax = plt.subplots(figsize=(10, 5))
44
- x = range(len(lo_list))
45
- ax.bar(x, old_scores, width=0.4, label="Old", align="center")
46
- ax.bar([i + 0.4 for i in x], new_scores, width=0.4, label="New", align="center")
47
- ax.set_xticks([i + 0.2 for i in x])
48
- ax.set_xticklabels([f"LO{i+1}" for i in x], rotation=45)
49
  ax.set_ylabel("Match Score (0-1)")
50
- ax.set_title("LO-wise Semantic Match Score: Old vs New")
51
- ax.legend()
52
-
53
- buf = io.BytesIO()
54
  plt.tight_layout()
55
- plt.savefig(buf, format="png")
56
- buf.seek(0)
57
- encoded = base64.b64encode(buf.read()).decode("utf-8")
58
- plt.close(fig)
59
- return f"data:image/png;base64,{encoded}"
60
 
61
- def compare_handouts(old_pdf, new_pdf, lo_file):
 
62
  old_text = extract_text_from_pdf(old_pdf)
63
  new_text = extract_text_from_pdf(new_pdf)
 
64
  if not old_text or not new_text:
65
  return "❌ Could not extract text from one or both PDFs.", None
66
 
67
- lo_text = extract_text_from_docx(lo_file)
68
- lo_list = [line.strip() for line in lo_text.split("\n") if line.strip()]
69
- if not lo_list:
70
- return "❌ No learning outcomes detected.", None
71
-
72
- # LO Matching
73
- old_scores = match_learning_outcomes(lo_list, old_text)
74
- new_scores = match_learning_outcomes(lo_list, new_text)
75
 
76
- matched_old = sum(score > 0.6 for score in old_scores)
77
- matched_new = sum(score > 0.6 for score in new_scores)
78
-
79
- # Change % based on character count
80
- change_percent = abs(len(new_text) - len(old_text)) / max(len(old_text), 1) * 100
81
 
82
- summary = f"📈 **Content Change:** {change_percent:.2f}%\n"
83
- summary += f"🎯 **LOs Matched (Old vs New):** {matched_old} vs {matched_new} of {len(lo_list)}\n"
84
 
85
- if matched_new > matched_old:
86
- summary += "🟢 New handout covers learning outcomes better.\n"
87
- elif matched_new < matched_old:
88
- summary += "🔴 New handout covers fewer outcomes.\n"
89
- else:
90
- summary += "🟡 No major change in LO coverage.\n"
91
 
92
- chart = generate_similarity_chart(lo_list, old_scores, new_scores)
93
- return summary, chart
 
94
 
95
- iface = gr.Interface(
 
96
  fn=compare_handouts,
97
  inputs=[
98
- gr.File(label="Old Version PDF"),
99
- gr.File(label="New Version PDF"),
100
- gr.File(label="Learning Outcomes (.docx)", file_types=[".docx"]),
101
  ],
102
  outputs=[
103
- gr.Markdown(label="Summary"),
104
- gr.Image(label="LO Match Chart")
105
  ],
106
  title="📘 Educational Content Comparator",
107
- description="Compare two handouts and evaluate changes + Learning Outcome coverage using semantic similarity.",
108
  )
109
 
110
- iface.launch()
 
1
 
2
+ import difflib
 
 
 
 
3
  import matplotlib.pyplot as plt
4
+ import pandas as pd
5
+ from PyPDF2 import PdfReader
6
+ from docx import Document
7
+ import gradio as gr
8
+ from sklearn.feature_extraction.text import TfidfVectorizer
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
 
11
+ # --- Extract Text ---
12
+ def extract_text_from_pdf(pdf_bytes):
13
  try:
14
+ reader = PdfReader(pdf_bytes)
15
  text = ""
16
+ for page in reader.pages:
17
+ text += page.extract_text() or ""
18
+ return text.strip()
19
+ except:
20
  return ""
21
 
22
+ def extract_text_from_docx(docx_file):
23
+ doc = Document(docx_file)
24
+ return "\n".join([para.text.strip() for para in doc.paragraphs if para.text.strip()])
25
+
26
+ # --- Change Percentage ---
27
+ def calculate_change_percentage(old_text, new_text):
28
+ seqm = difflib.SequenceMatcher(None, old_text, new_text)
29
+ return (1 - seqm.ratio()) * 100
30
+
31
+ # --- Semantic Matching ---
32
+ def semantic_match(lo_texts, content):
33
+ vectorizer = TfidfVectorizer().fit_transform([content] + lo_texts)
34
+ similarities = cosine_similarity(vectorizer[0:1], vectorizer[1:]).flatten()
35
+ return similarities
36
+
37
+ # --- Summary Generation ---
38
+ def generate_summary(change_pct, matched_los, total_los):
39
+ msg = f"📈 Content Change: {change_pct:.2f}%\n🎯 Matched LOs: {matched_los} of {total_los}\n"
40
+ if change_pct > 20:
41
+ msg += "🟢 Major improvements detected."
42
+ elif change_pct > 5:
43
+ msg += "🔵 Some updates found."
44
+ else:
45
+ msg += "🟡 Very little or no update."
46
+ return msg
47
 
48
+ # --- Bar Chart Plot ---
49
+ def plot_lo_chart(lo_labels, old_scores, new_scores):
50
+ df = pd.DataFrame({'Old': old_scores, 'New': new_scores}, index=lo_labels)
51
+ ax = df.plot(kind='bar', figsize=(10, 5), title="LO-wise Match Score: Old vs New")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  ax.set_ylabel("Match Score (0-1)")
53
+ ax.set_ylim(0, 1)
54
+ plt.xticks(rotation=45, ha='right')
 
 
55
  plt.tight_layout()
56
+ return plt.gcf()
 
 
 
 
57
 
58
+ # --- Main Comparator ---
59
+ def compare_handouts(old_pdf, new_pdf, lo_docx):
60
  old_text = extract_text_from_pdf(old_pdf)
61
  new_text = extract_text_from_pdf(new_pdf)
62
+
63
  if not old_text or not new_text:
64
  return "❌ Could not extract text from one or both PDFs.", None
65
 
66
+ lo_text_raw = extract_text_from_docx(lo_docx)
67
+ lo_list = [lo for lo in lo_text_raw.split('\n') if lo.strip()]
 
 
 
 
 
 
68
 
69
+ if not lo_list:
70
+ return "❌ No learning outcomes detected in uploaded file.", None
 
 
 
71
 
72
+ old_scores = semantic_match(lo_list, old_text)
73
+ new_scores = semantic_match(lo_list, new_text)
74
 
75
+ matched = sum(n >= o for o, n in zip(old_scores, new_scores))
76
+ change_pct = calculate_change_percentage(old_text, new_text)
 
 
 
 
77
 
78
+ summary = generate_summary(change_pct, matched, len(lo_list))
79
+ fig = plot_lo_chart([f"LO{i+1}" for i in range(len(lo_list))], old_scores, new_scores)
80
+ return summary, fig
81
 
82
+ # --- Gradio App ---
83
+ demo = gr.Interface(
84
  fn=compare_handouts,
85
  inputs=[
86
+ gr.File(label="Upload Old PDF", type="binary"),
87
+ gr.File(label="Upload New PDF", type="binary"),
88
+ gr.File(label="Upload Learning Outcomes (.docx)", type="binary"),
89
  ],
90
  outputs=[
91
+ gr.Textbox(label="📋 Summary"),
92
+ gr.Plot(label="📊 LO Match Chart")
93
  ],
94
  title="📘 Educational Content Comparator",
95
+ description="Upload 2 handouts and LO file (.docx). Detect % update, alignment with learning outcomes, and get visual summary."
96
  )
97
 
98
+ demo.launch(share=True)