Deevyankar committed on
Commit
7889bda
·
verified ·
1 Parent(s): 03f8fb4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -54
app.py CHANGED
@@ -4,11 +4,11 @@ import fitz # PyMuPDF
4
  import docx
5
  import io
6
  import re
7
- import pandas as pd
8
- from sklearn.feature_extraction.text import TfidfVectorizer
9
- from sentence_transformers import SentenceTransformer, util
10
  import matplotlib.pyplot as plt
11
  import numpy as np
 
 
12
  from difflib import SequenceMatcher
13
 
14
  model = SentenceTransformer('all-MiniLM-L6-v2')
@@ -21,7 +21,7 @@ def extract_text_from_pdf(pdf_file):
21
  text += page.get_text()
22
  pdf_reader.close()
23
  return text.strip()
24
- except Exception:
25
  return ""
26
 
27
  def normalize_text(text):
@@ -30,17 +30,24 @@ def normalize_text(text):
30
  def extract_text_from_docx(docx_file):
31
  try:
32
  doc = docx.Document(io.BytesIO(docx_file))
33
- return [p.text.strip() for p in doc.paragraphs if p.text.strip()]
 
 
 
 
34
  except:
35
  return []
36
 
37
  def semantic_match(lo_list, content):
38
  scores = []
39
- content_embed = model.encode(content, convert_to_tensor=True)
40
  for lo in lo_list:
41
- lo_embed = model.encode(lo, convert_to_tensor=True)
42
- sim = util.pytorch_cos_sim(lo_embed, content_embed).item()
43
- scores.append(round(sim, 3))
 
 
 
 
44
  return scores
45
 
46
  def content_change_score(text1, text2):
@@ -55,78 +62,58 @@ def compare_handouts(old_pdf, new_pdf, lo_file):
55
  new_text = extract_text_from_pdf(new_pdf)
56
 
57
  if len(old_text.strip()) < 200 or len(new_text.strip()) < 200:
58
- return "❌ Could not extract text from one or both PDFs.", None, None
59
 
60
  lo_list = extract_text_from_docx(lo_file)
61
  if not lo_list:
62
- return "⚠️ No learning outcomes detected.", None, None
63
 
64
  old_scores = semantic_match(lo_list, old_text)
65
  new_scores = semantic_match(lo_list, new_text)
66
- improvement = [round(n - o, 3) for n, o in zip(new_scores, old_scores)]
67
- improved_count = sum([i > 0 for i in improvement])
68
-
69
- # Prepare Excel output
70
- df = pd.DataFrame({
71
- "Learning Outcome": lo_list,
72
- "Old Match Score": old_scores,
73
- "New Match Score": new_scores,
74
- "Improvement": improvement
75
- })
76
-
77
- excel_path = "/mnt/data/LO_Comparison_Report.xlsx"
78
- df.to_excel(excel_path, index=False)
79
-
80
- # Scores
81
- content_diff = content_change_score(old_text, new_text)
82
- lo_improvement_percent = round((sum(improvement) / len(lo_list)) * 100, 2)
83
-
84
- summary = (
85
- f"🧠 Improved LOs: {improved_count} / {len(lo_list)}\n"
86
- f"πŸ“„ Content Change Estimate: {content_diff}%\n"
87
- f"πŸ“Š Avg LO Improvement Score: {lo_improvement_percent}%\n\n"
88
- )
89
 
 
 
 
 
 
 
90
  if improved_count > 0:
91
- summary += "🟒 Summary: New handout better aligns with LOs and has improved clarity."
92
  else:
93
  summary += "⚠️ Summary: No significant improvement in LO alignment."
94
 
95
- # Bar chart
96
  x = np.arange(len(lo_list))
97
  width = 0.35
98
- fig, ax = plt.subplots(figsize=(10, 5))
99
  ax.bar(x - width/2, old_scores, width, label='Old')
100
  ax.bar(x + width/2, new_scores, width, label='New')
101
  ax.set_ylabel('Match Score (0-1)')
102
- ax.set_title('LO-wise Match Score Comparison')
103
  ax.set_xticks(x)
104
  ax.set_xticklabels([f"LO{i+1}" for i in range(len(lo_list))], rotation=45)
105
  ax.legend()
106
  plt.tight_layout()
107
 
108
- return summary, fig, excel_path
109
 
110
  with gr.Blocks() as demo:
111
- gr.Markdown("### πŸ“˜ Educational Handout Comparison Tool")
112
- gr.Markdown("Upload an old and new handout PDF, along with Learning Outcomes (.docx), to compare updates.")
113
 
114
  with gr.Row():
115
- old_pdf = gr.File(label="πŸ“‚ Old Handout", file_types=[".pdf"], type="binary")
116
- new_pdf = gr.File(label="πŸ“‚ New Handout", file_types=[".pdf"], type="binary")
117
- lo_file = gr.File(label="πŸ“‚ Learning Outcomes (.docx)", file_types=[".docx"], type="binary")
118
 
119
  with gr.Row():
120
- btn = gr.Button("πŸ” Compare")
121
- clear_btn = gr.Button("♻️ Clear")
122
 
123
- summary_out = gr.Textbox(label="πŸ“‹ Summary", lines=6, interactive=False)
124
- plot_out = gr.Plot(label="πŸ“Š LO Score Chart")
125
- download_link = gr.File(label="πŸ“₯ Download Excel Report")
126
 
127
- btn.click(fn=compare_handouts, inputs=[old_pdf, new_pdf, lo_file],
128
- outputs=[summary_out, plot_out, download_link])
129
- clear_btn.click(fn=lambda: ("", None, None),
130
- inputs=[], outputs=[summary_out, plot_out, download_link])
131
 
132
- demo.launch()
 
4
  import docx
5
  import io
6
  import re
7
+ import os
 
 
8
  import matplotlib.pyplot as plt
9
  import numpy as np
10
+ from sklearn.feature_extraction.text import TfidfVectorizer
11
+ from sentence_transformers import SentenceTransformer, util
12
  from difflib import SequenceMatcher
13
 
14
  model = SentenceTransformer('all-MiniLM-L6-v2')
 
21
  text += page.get_text()
22
  pdf_reader.close()
23
  return text.strip()
24
+ except Exception as e:
25
  return ""
26
 
27
  def normalize_text(text):
 
30
  def extract_text_from_docx(docx_file):
31
  try:
32
  doc = docx.Document(io.BytesIO(docx_file))
33
+ full_text = []
34
+ for para in doc.paragraphs:
35
+ if para.text.strip():
36
+ full_text.append(para.text.strip())
37
+ return full_text
38
  except:
39
  return []
40
 
41
  def semantic_match(lo_list, content):
42
  scores = []
 
43
  for lo in lo_list:
44
+ try:
45
+ lo_embed = model.encode(lo, convert_to_tensor=True)
46
+ content_embed = model.encode(content, convert_to_tensor=True)
47
+ sim = util.pytorch_cos_sim(lo_embed, content_embed).item()
48
+ scores.append(round(sim, 2))
49
+ except:
50
+ scores.append(0.0)
51
  return scores
52
 
53
  def content_change_score(text1, text2):
 
62
  new_text = extract_text_from_pdf(new_pdf)
63
 
64
  if len(old_text.strip()) < 200 or len(new_text.strip()) < 200:
65
+ return "⚠️ Could not extract meaningful content from one or both PDFs.", None
66
 
67
  lo_list = extract_text_from_docx(lo_file)
68
  if not lo_list:
69
+ return "⚠️ No learning outcomes detected.", None
70
 
71
  old_scores = semantic_match(lo_list, old_text)
72
  new_scores = semantic_match(lo_list, new_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
+ change_percent = content_change_score(old_text, new_text)
75
+ improved_count = sum([n > o for n, o in zip(new_scores, old_scores)])
76
+ matched_los = sum([n >= o for n, o in zip(new_scores, old_scores)])
77
+
78
+ summary = f"πŸ“ˆ Content Change Estimate: {change_percent}%\n"
79
+ summary += f"🧠 LO Alignment: {matched_los} of {len(lo_list)} learning outcomes matched\n"
80
  if improved_count > 0:
81
+ summary += "🟒 Summary: New handout has improved structure and added clarity."
82
  else:
83
  summary += "⚠️ Summary: No significant improvement in LO alignment."
84
 
85
+ # Plot
86
  x = np.arange(len(lo_list))
87
  width = 0.35
88
+ fig, ax = plt.subplots()
89
  ax.bar(x - width/2, old_scores, width, label='Old')
90
  ax.bar(x + width/2, new_scores, width, label='New')
91
  ax.set_ylabel('Match Score (0-1)')
92
+ ax.set_title('LO-wise Match Score: Old vs New')
93
  ax.set_xticks(x)
94
  ax.set_xticklabels([f"LO{i+1}" for i in range(len(lo_list))], rotation=45)
95
  ax.legend()
96
  plt.tight_layout()
97
 
98
+ return summary, fig
99
 
100
  with gr.Blocks() as demo:
101
+ gr.Markdown("πŸ“˜ **Educational Content Comparator**")
102
+ gr.Markdown("Upload 2 handouts and a .docx file of Learning Outcomes to compare changes and alignment.")
103
 
104
  with gr.Row():
105
+ old_pdf = gr.File(label="πŸ“‚ Upload Old PDF", file_types=[".pdf"], type="binary")
106
+ new_pdf = gr.File(label="πŸ“‚ Upload New PDF", file_types=[".pdf"], type="binary")
107
+ lo_file = gr.File(label="πŸ“‚ Upload Learning Outcomes (.docx)", file_types=[".docx"], type="binary")
108
 
109
  with gr.Row():
110
+ btn = gr.Button("Submit")
111
+ clear_btn = gr.Button("Clear")
112
 
113
+ output_text = gr.Textbox(label="πŸ“‹ Summary", lines=5, interactive=False)
114
+ output_plot = gr.Plot(label="πŸ“Š LO Match Chart")
 
115
 
116
+ btn.click(fn=compare_handouts, inputs=[old_pdf, new_pdf, lo_file], outputs=[output_text, output_plot])
117
+ clear_btn.click(fn=lambda: ("", None), inputs=[], outputs=[output_text, output_plot])
 
 
118
 
119
+ demo.launch()