Deevyankar committed on
Commit
07a846d
·
verified ·
1 Parent(s): d1a8361

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -98
app.py CHANGED
@@ -1,131 +1,92 @@
1
  import gradio as gr
2
- import fitz # PyMuPDF
3
- import docx
4
- import io
5
- import re
6
- import os
7
  import matplotlib.pyplot as plt
8
- import numpy as np
9
  import pandas as pd
10
- from sklearn.feature_extraction.text import TfidfVectorizer
11
- from sentence_transformers import SentenceTransformer, util
12
- from difflib import SequenceMatcher
13
 
14
- model = SentenceTransformer('all-MiniLM-L6-v2')
15
 
16
- def extract_text_from_pdf(pdf_file):
17
  try:
18
- pdf_reader = fitz.open(stream=pdf_file, filetype="pdf")
19
- text = ""
20
- for page in pdf_reader:
21
- text += page.get_text()
22
- pdf_reader.close()
23
- return text.strip()
24
- except Exception as e:
25
  return ""
26
 
27
- def normalize_text(text):
28
- return re.sub(r'\s+', ' ', text.strip().lower())
29
-
30
- def extract_text_from_docx(docx_file):
31
- try:
32
- doc = docx.Document(io.BytesIO(docx_file))
33
- full_text = []
34
- for para in doc.paragraphs:
35
- if para.text.strip():
36
- full_text.append(para.text.strip())
37
- return full_text
38
- except:
39
- return []
40
-
41
- def semantic_match(lo_list, content):
42
  scores = []
43
- for lo in lo_list:
44
- try:
45
- lo_embed = model.encode(lo, convert_to_tensor=True)
46
- content_embed = model.encode(content, convert_to_tensor=True)
47
- sim = util.pytorch_cos_sim(lo_embed, content_embed).item()
48
- scores.append(round(sim, 2))
49
- except:
50
- scores.append(0.0)
51
  return scores
52
 
53
- def content_change_score(text1, text2):
54
- try:
55
- sim = SequenceMatcher(None, normalize_text(text1), normalize_text(text2)).ratio()
56
- return round((1 - sim) * 100, 2)
57
- except:
58
- return 100.0
59
-
60
  def compare_handouts(old_pdf, new_pdf, lo_file):
61
  old_text = extract_text_from_pdf(old_pdf)
62
  new_text = extract_text_from_pdf(new_pdf)
63
 
64
- if len(old_text.strip()) < 200 or len(new_text.strip()) < 200:
65
- return "⚠️ Could not extract meaningful content from one or both PDFs.", None, None
66
 
67
- lo_list = extract_text_from_docx(lo_file)
68
- if not lo_list:
69
- return "⚠️ No learning outcomes detected.", None, None
70
 
71
  old_scores = semantic_match(lo_list, old_text)
72
  new_scores = semantic_match(lo_list, new_text)
73
 
74
- change_percent = content_change_score(old_text, new_text)
75
- improved_count = sum([n > o for n, o in zip(new_scores, old_scores)])
76
- matched_los = sum([n >= o for n, o in zip(new_scores, old_scores)])
77
-
78
- summary = f"πŸ“ˆ Content Change Estimate: {change_percent}%\n"
79
- summary += f"🧠 LO Alignment: {matched_los} of {len(lo_list)} learning outcomes matched\n"
80
- if improved_count > 0:
81
- summary += "🟒 Summary: New handout has improved structure and added clarity."
82
- else:
83
- summary += "⚠️ Summary: No significant improvement in LO alignment."
84
-
85
- # Create comparison table
86
  df = pd.DataFrame({
87
- "Learning Outcome": [f"LO{i+1}" for i in range(len(lo_list))],
88
  "Old Match Score": old_scores,
89
  "New Match Score": new_scores,
90
- "Improvement": np.array(new_scores) - np.array(old_scores)
91
  })
92
- #excel_path = "D:/result/LO_Comparison_Report.xlsx"
93
- excel_path = "/mnt/data/LO_Comparison_Report.xlsx"
94
- df.to_excel(excel_path, index=False)
95
 
96
- # Plot
97
  x = np.arange(len(lo_list))
98
- width = 0.35
99
- fig, ax = plt.subplots()
100
- ax.bar(x - width/2, old_scores, width, label='Old')
101
- ax.bar(x + width/2, new_scores, width, label='New')
102
- ax.set_ylabel('Match Score (0-1)')
103
- ax.set_title('LO-wise Match Score: Old vs New')
104
  ax.set_xticks(x)
105
- ax.set_xticklabels([f"LO{i+1}" for i in range(len(lo_list))], rotation=45)
 
 
 
106
  ax.legend()
107
  plt.tight_layout()
108
 
109
- return summary, fig, excel_path
110
-
111
- with gr.Blocks() as demo:
112
- gr.Markdown("πŸ“˜ **Educational Content Comparator**")
113
- gr.Markdown("Upload 2 handouts and a .docx file of Learning Outcomes to compare changes and alignment.")
 
114
 
115
- with gr.Row():
116
- old_pdf = gr.File(label="πŸ“‚ Upload Old PDF", file_types=[".pdf"], type="binary")
117
- new_pdf = gr.File(label="πŸ“‚ Upload New PDF", file_types=[".pdf"], type="binary")
118
- lo_file = gr.File(label="πŸ“‚ Upload Learning Outcomes (.docx)", file_types=[".docx"], type="binary")
119
-
120
- with gr.Row():
121
- btn = gr.Button("Submit")
122
- clear_btn = gr.Button("Clear")
123
-
124
- output_text = gr.Textbox(label="πŸ“‹ Summary", lines=5, interactive=False)
125
- output_plot = gr.Plot(label="πŸ“Š LO Match Chart")
126
- output_excel = gr.File(label="πŸ“„ Download Excel Report")
127
 
128
- btn.click(fn=compare_handouts, inputs=[old_pdf, new_pdf, lo_file], outputs=[output_text, output_plot, output_excel])
129
- clear_btn.click(fn=lambda: ("", None, None), inputs=[], outputs=[output_text, output_plot, output_excel])
130
 
131
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ from PyPDF2 import PdfReader
3
+ from sentence_transformers import SentenceTransformer, util
 
 
 
4
  import matplotlib.pyplot as plt
 
5
  import pandas as pd
6
+ import numpy as np
7
+ from io import BytesIO
8
+ from tempfile import NamedTemporaryFile
9
 
10
+ model = SentenceTransformer("all-MiniLM-L6-v2")
11
 
12
def extract_text_from_pdf(pdf_bytes):
    """Extract the text of every page of an in-memory PDF.

    Args:
        pdf_bytes: Raw bytes of the PDF document.

    Returns:
        The per-page texts joined with newlines. A page whose
        ``extract_text()`` result is falsy contributes an empty string.
        Returns ``""`` if anything raises while opening or reading the PDF.
    """
    try:
        document = PdfReader(BytesIO(pdf_bytes))
        page_texts = []
        for page in document.pages:
            page_texts.append(page.extract_text() or "")
        return "\n".join(page_texts)
    except Exception:
        # Best-effort extraction: an empty string signals "nothing extracted".
        return ""
18
 
19
def semantic_match(lo_texts, content):
    """Score how closely ``content`` matches each learning outcome.

    Args:
        lo_texts: Iterable of learning-outcome strings.
        content: The document text to compare every outcome against.

    Returns:
        A list with one cosine-similarity float per entry of ``lo_texts``,
        in the same order. An empty ``lo_texts`` yields ``[]``.
    """
    if not lo_texts:
        # Nothing to score; skip the comparatively expensive content encoding.
        return []

    # The content embedding is identical for every outcome, so compute it once
    # instead of re-encoding the whole document on each iteration.
    content_emb = model.encode(content, convert_to_tensor=True)

    return [
        util.pytorch_cos_sim(
            model.encode(lo, convert_to_tensor=True), content_emb
        ).item()
        for lo in lo_texts
    ]
27
 
 
 
 
 
 
 
 
28
# Minimum similarity of the NEW handout to a learning outcome for that outcome
# to count as "matched" in the summary.
_LO_MATCH_THRESHOLD = 0.6


def _read_learning_outcomes(lo_file):
    """Return the stripped, non-empty lines of the uploaded outcomes file.

    Accepts raw ``bytes``, an object exposing ``read()``, an object exposing a
    ``name`` path attribute, or a plain path string. ``None`` yields ``[]``.
    """
    if lo_file is None:
        return []

    if isinstance(lo_file, (bytes, bytearray)):
        raw = bytes(lo_file)
    elif hasattr(lo_file, "read"):
        raw = lo_file.read()
    else:
        path = getattr(lo_file, "name", lo_file)
        with open(path, "rb") as handle:
            raw = handle.read()

    # "utf-8-sig" drops a leading BOM so it does not end up in the first outcome.
    text = raw.decode("utf-8-sig", errors="replace") if isinstance(raw, bytes) else raw
    return [line.strip() for line in text.splitlines() if line.strip()]


def compare_handouts(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs against a list of learning outcomes.

    Args:
        old_pdf: Raw bytes of the old handout PDF.
        new_pdf: Raw bytes of the new handout PDF.
        lo_file: Uploaded learning-outcomes text, one outcome per line
            (bytes, file-like object, or path).

    Returns:
        A ``(summary, figure, excel_path)`` tuple: the summary text, a
        matplotlib figure with per-outcome old/new scores, and the path of an
        ``.xlsx`` report. On unusable input the tuple is
        ``(error_message, None, None)``.
    """
    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)

    if not old_text.strip() or not new_text.strip():
        return "❌ Could not extract text from one or both PDFs.", None, None

    lo_list = _read_learning_outcomes(lo_file)
    if not lo_list:
        return "❌ No learning outcomes detected.", None, None

    old_scores = semantic_match(lo_list, old_text)
    new_scores = semantic_match(lo_list, new_text)
    improvement = np.array(new_scores) - np.array(old_scores)

    df = pd.DataFrame({
        "Learning Outcome": lo_list,
        "Old Match Score": old_scores,
        "New Match Score": new_scores,
        "Improvement": improvement,
    })

    # Grouped bar chart: one old/new pair of bars per learning outcome.
    fig, ax = plt.subplots(figsize=(10, 4))
    x = np.arange(len(lo_list))
    ax.bar(x - 0.2, old_scores, width=0.4, label="Old")
    ax.bar(x + 0.2, new_scores, width=0.4, label="New")
    ax.set_xticks(x)
    ax.set_xticklabels([f"LO{i}" for i in range(1, len(lo_list) + 1)])
    ax.set_ylim(0, 1)
    ax.set_ylabel("Semantic Similarity")
    ax.set_title("LO Match Comparison")
    ax.legend()
    plt.tight_layout()

    # Content change = 1 - cosine similarity of the two whole-document embeddings.
    old_emb = model.encode(old_text, convert_to_tensor=True)
    new_emb = model.encode(new_text, convert_to_tensor=True)
    content_change = (1 - util.pytorch_cos_sim(old_emb, new_emb).item()) * 100

    matched = sum(score >= _LO_MATCH_THRESHOLD for score in new_scores)
    summary = f"📈 Content Change Estimate: {content_change:.2f}%\n"
    summary += f"🧠 LO Alignment: {matched} of {len(lo_list)} learning outcomes matched\n"
    # Only claim an improvement when at least one outcome actually scores higher.
    if (improvement > 0).any():
        summary += "🟢 Summary: New handout has improved structure and added clarity."
    else:
        summary += "⚠️ Summary: No significant improvement in LO alignment."

    # Reserve a temp path, then close the handle before pandas writes to it
    # (an open handle is leaked otherwise and blocks the write on Windows).
    with NamedTemporaryFile(delete=False, suffix=".xlsx") as temp_file:
        excel_path = temp_file.name
    df.to_excel(excel_path, index=False)

    return summary, fig, excel_path
 
74
 
75
# Gradio UI: two handout PDFs plus a learning-outcomes text file go in;
# a text summary, a comparison chart and a downloadable Excel report come out.
# NOTE(review): type="file" is a Gradio 3.x value; newer Gradio releases appear
# to accept only "filepath" or "binary" — confirm against the pinned version.
iface = gr.Interface(
    fn=compare_handouts,
    inputs=[
        gr.File(label="📄 Upload OLD Handout PDF", type="binary", file_types=[".pdf"]),
        gr.File(label="📄 Upload NEW Handout PDF", type="binary", file_types=[".pdf"]),
        gr.File(label="📄 Upload Learning Outcomes (TXT)", type="file", file_types=[".txt"]),
    ],
    outputs=[
        gr.Textbox(label="📋 Summary & Insights"),
        gr.Plot(label="📊 Learning Outcome Match Chart"),
        gr.File(label="📥 Download Excel Report"),
    ],
    title="📚 Handout Comparator with LO Analysis",
    description=(
        "Upload old & new handouts + a list of learning outcomes to get "
        "content change %, alignment, and a downloadable report."
    ),
)

# Launch only when run as a script, so importing the module has no side effect.
if __name__ == "__main__":
    iface.launch()