Deevyankar committed on
Commit
4db3b3a
Β·
verified Β·
1 Parent(s): 3423464

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -65
app.py CHANGED
@@ -1,92 +1,112 @@
1
  import gradio as gr
2
- from PyPDF2 import PdfReader
 
 
 
 
3
  from sentence_transformers import SentenceTransformer, util
4
  import matplotlib.pyplot as plt
5
- import pandas as pd
6
  import numpy as np
7
- from io import BytesIO
8
- from tempfile import NamedTemporaryFile
9
 
10
- model = SentenceTransformer("all-MiniLM-L6-v2")
11
 
12
- def extract_text_from_pdf(pdf_bytes):
13
  try:
14
- reader = PdfReader(BytesIO(pdf_bytes))
15
- return "\n".join([page.extract_text() or "" for page in reader.pages])
16
- except Exception:
 
 
 
 
17
  return ""
18
 
19
- def semantic_match(lo_texts, content):
 
 
 
 
 
 
 
 
 
 
20
  scores = []
21
- for lo in lo_texts:
22
- emb1 = model.encode(lo, convert_to_tensor=True)
23
- emb2 = model.encode(content, convert_to_tensor=True)
24
- score = util.pytorch_cos_sim(emb1, emb2).item()
25
- scores.append(score)
 
 
 
26
  return scores
27
 
 
 
 
 
 
 
 
28
  def compare_handouts(old_pdf, new_pdf, lo_file):
29
  old_text = extract_text_from_pdf(old_pdf)
30
  new_text = extract_text_from_pdf(new_pdf)
31
 
32
- if not old_text.strip() or not new_text.strip():
33
- return "❌ Could not extract text from one or both PDFs.", None, None
34
 
35
- lo_doc = lo_file.decode("utf-8")
36
- lo_list = [line.strip() for line in lo_doc.splitlines() if line.strip()]
 
37
 
38
  old_scores = semantic_match(lo_list, old_text)
39
  new_scores = semantic_match(lo_list, new_text)
40
 
41
- improvement = np.array(new_scores) - np.array(old_scores)
42
- df = pd.DataFrame({
43
- "Learning Outcome": lo_list,
44
- "Old Match Score": old_scores,
45
- "New Match Score": new_scores,
46
- "Improvement": improvement
47
- })
 
 
 
48
 
49
- fig, ax = plt.subplots(figsize=(10, 4))
50
  x = np.arange(len(lo_list))
51
- ax.bar(x - 0.2, old_scores, width=0.4, label="Old")
52
- ax.bar(x + 0.2, new_scores, width=0.4, label="New")
 
 
 
 
53
  ax.set_xticks(x)
54
- ax.set_xticklabels([f"LO{i+1}" for i in range(len(lo_list))])
55
- ax.set_ylim(0, 1)
56
- ax.set_ylabel("Semantic Similarity")
57
- ax.set_title("LO Match Comparison")
58
  ax.legend()
59
  plt.tight_layout()
60
 
61
- content_change = (1 - (util.pytorch_cos_sim(model.encode(old_text, convert_to_tensor=True),
62
- model.encode(new_text, convert_to_tensor=True)).item())) * 100
63
- matched = sum(1 for o, n in zip(old_scores, new_scores) if n >= 0.6)
64
- summary = f"πŸ“ˆ Content Change Estimate: {content_change:.2f}%\n"
65
- summary += f"🧠 LO Alignment: {matched} of {len(lo_list)} learning outcomes matched\n"
66
- summary += "🟒 Summary: New handout has improved structure and added clarity."
67
-
68
- # Save to a temporary file
69
- temp_file = NamedTemporaryFile(delete=False, suffix=".xlsx")
70
- excel_path = Path(temp_file.name)
71
- df.to_excel(excel_path, index=False)
72
-
73
- return summary, fig, excel_path
74
-
75
- iface = gr.Interface(
76
- fn=compare_handouts,
77
- inputs=[
78
- gr.File(label="πŸ“„ Upload OLD Handout PDF", type="binary"),
79
- gr.File(label="πŸ“„ Upload NEW Handout PDF", type="binary"),
80
- gr.File(label="πŸ“„ Upload Learning Outcomes (TXT)", type="binary")
81
- ],
82
- outputs=[
83
- gr.Textbox(label="πŸ“‹ Summary & Insights"),
84
- gr.Plot(label="πŸ“Š Learning Outcome Match Chart"),
85
- gr.File(label="πŸ“₯ Download Excel Report")
86
- ],
87
- title="πŸ“š Handout Comparator with LO Analysis",
88
- description="Upload old & new handouts + a list of learning outcomes to get content change %, alignment, and a downloadable report."
89
- )
90
-
91
- if __name__ == "__main__":
92
- iface.launch()
 
1
  import gradio as gr
2
+ import fitz # PyMuPDF
3
+ import docx
4
+ import io
5
+ import re
6
+ from sklearn.feature_extraction.text import TfidfVectorizer
7
  from sentence_transformers import SentenceTransformer, util
8
  import matplotlib.pyplot as plt
 
9
  import numpy as np
10
+ from difflib import SequenceMatcher
 
11
 
12
# Shared sentence-embedding model used for all LO/content similarity
# scoring below; loaded once at module import (download on first run).
model = SentenceTransformer('all-MiniLM-L6-v2')
13
 
14
def extract_text_from_pdf(pdf_file):
    """Extract plain text from a PDF supplied as raw bytes.

    Parameters
    ----------
    pdf_file : bytes
        Raw PDF file contents (Gradio ``type="binary"`` upload).

    Returns
    -------
    str
        Concatenated text of all pages, stripped, or "" when the bytes
        cannot be parsed as a PDF. Callers treat "" as "no text".
    """
    try:
        # ``with`` guarantees the document handle is closed even if
        # get_text() raises part-way through the pages (the original
        # close() call was skipped on such an exception).
        with fitz.open(stream=pdf_file, filetype="pdf") as pdf_doc:
            # join() avoids quadratic ``+=`` string building per page.
            return "".join(page.get_text() for page in pdf_doc).strip()
    except Exception:
        # Best effort: unreadable/corrupt uploads degrade to empty text.
        return ""
24
 
25
def normalize_text(text):
    """Lower-case *text* and collapse every run of whitespace to one space."""
    lowered = text.strip().lower()
    return re.sub(r"\s+", " ", lowered)
27
+
28
def extract_text_from_docx(file_binary):
    """Return the non-empty paragraph texts of a .docx given as raw bytes.

    Returns [] when the bytes cannot be parsed as a docx document; the
    caller treats an empty list as "no learning outcomes found".
    """
    try:
        document = docx.Document(io.BytesIO(file_binary))
        stripped = (paragraph.text.strip() for paragraph in document.paragraphs)
        return [text for text in stripped if text]
    except Exception:
        # Unreadable / non-docx upload: degrade to an empty outcome list.
        return []
34
+
35
def semantic_match(lo_list, content):
    """Score each learning outcome against *content* by cosine similarity.

    Parameters
    ----------
    lo_list : list[str]
        Learning-outcome statements.
    content : str
        Full document text to match against.

    Returns
    -------
    list[float]
        One score per LO, rounded to 2 decimals; 0.0 where scoring failed.
    """
    try:
        # The content embedding does not depend on the LO, so encode the
        # (potentially long) document once instead of once per outcome.
        content_embed = model.encode(content, convert_to_tensor=True)
    except Exception:
        return [0.0] * len(lo_list)
    scores = []
    for lo in lo_list:
        try:
            lo_embed = model.encode(lo, convert_to_tensor=True)
            sim = util.pytorch_cos_sim(lo_embed, content_embed).item()
            scores.append(round(sim, 2))
        except Exception:
            # Narrowed from a bare ``except:`` so Ctrl-C / SystemExit
            # still propagate; a failed LO just scores 0.0.
            scores.append(0.0)
    return scores
46
 
47
def content_change_score(text1, text2):
    """Estimate how different two texts are, as a percentage.

    Uses difflib's SequenceMatcher ratio on normalized text, so 0.0 means
    identical (after case/whitespace normalization) and 100.0 means no
    measurable overlap.

    Returns
    -------
    float
        Percent difference in [0.0, 100.0], rounded to 2 decimals;
        100.0 when comparison fails for any reason.
    """
    try:
        similarity = SequenceMatcher(
            None, normalize_text(text1), normalize_text(text2)
        ).ratio()
    except Exception:
        # Narrowed from a bare ``except:``; on failure assume the
        # documents are entirely different.
        return 100.0
    return round((1 - similarity) * 100, 2)
53
+
54
def compare_handouts(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs against a set of learning outcomes.

    Parameters
    ----------
    old_pdf, new_pdf : bytes
        Raw bytes of the old and new handout PDFs.
    lo_file : bytes
        Raw bytes of a .docx file with one learning outcome per paragraph.

    Returns
    -------
    tuple
        (summary_text, matplotlib figure) on success; (warning_text, None)
        when extraction fails, matching the two Gradio outputs.
    """
    # Cosine-similarity level at or above which an LO counts as covered.
    match_threshold = 0.6

    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)

    # Very short extractions usually mean a scanned / image-only PDF.
    if len(old_text.strip()) < 200 or len(new_text.strip()) < 200:
        return "⚠️ Could not extract meaningful content from one or both PDFs.", None

    lo_list = extract_text_from_docx(lo_file)
    if not lo_list:
        return "⚠️ No learning outcomes detected in .docx file.", None

    old_scores = semantic_match(lo_list, old_text)
    new_scores = semantic_match(lo_list, new_text)

    change_percent = content_change_score(old_text, new_text)
    improved_count = sum(n > o for n, o in zip(new_scores, old_scores))
    # Count LOs the NEW handout actually covers. The previous ``n >= o``
    # reported "matched" for any LO that merely did not get worse (e.g.
    # 0.0 vs 0.0), which contradicts the summary label; an absolute
    # threshold (as in the prior revision of this app) reflects coverage.
    matched_los = sum(n >= match_threshold for n in new_scores)

    summary = f"📈 Content Change Estimate: {change_percent}%\n"
    summary += f"🧠 LO Alignment: {matched_los} of {len(lo_list)} learning outcomes matched\n"
    if improved_count > 0:
        summary += "🟢 Summary: New handout has improved structure and added clarity."
    else:
        summary += "⚠️ Summary: No significant improvement in LO alignment."

    # Grouped bar chart: old vs new score per learning outcome.
    x = np.arange(len(lo_list))
    width = 0.35
    fig, ax = plt.subplots()
    ax.bar(x - width / 2, old_scores, width, label='Old')
    ax.bar(x + width / 2, new_scores, width, label='New')
    ax.set_ylabel('Match Score (0-1)')
    ax.set_title('LO-wise Match Score: Old vs New')
    ax.set_xticks(x)
    ax.set_xticklabels([f"LO{i+1}" for i in range(len(lo_list))], rotation=45)
    ax.legend()
    plt.tight_layout()

    return summary, fig
92
+
93
# --- Gradio UI -----------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("📘 **Educational Content Comparator**")
    gr.Markdown("Upload 2 handouts and a .docx file of Learning Outcomes to compare changes and alignment.")

    with gr.Row():
        old_pdf = gr.File(label="📂 Upload Old PDF", file_types=[".pdf"], type="binary")
        new_pdf = gr.File(label="📂 Upload New PDF", file_types=[".pdf"], type="binary")
        lo_file = gr.File(label="📂 Upload Learning Outcomes (.docx)", file_types=[".docx"], type="binary")

    with gr.Row():
        btn = gr.Button("Submit")
        clear_btn = gr.Button("Clear")

    output_text = gr.Textbox(label="📋 Summary", lines=5, interactive=False)
    output_plot = gr.Plot(label="📊 LO Match Chart")

    btn.click(fn=compare_handouts, inputs=[old_pdf, new_pdf, lo_file], outputs=[output_text, output_plot])
    # Reset both outputs to their empty states.
    clear_btn.click(fn=lambda: ("", None), inputs=[], outputs=[output_text, output_plot])

# Guard the launch (as the previous revision did) so importing this module
# — e.g. in tests or by a hosting runner — does not start the server.
if __name__ == "__main__":
    demo.launch()