Deevyankar committed on
Commit
c6ee9d5
Β·
verified Β·
1 Parent(s): 99d0d6d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +89 -105
app.py CHANGED
@@ -1,126 +1,110 @@
1
 
2
 
3
  import gradio as gr
 
4
  import fitz # PyMuPDF
5
- from docx import Document
6
- import io
7
- import re
8
- import difflib
9
  import matplotlib.pyplot as plt
10
- import numpy as np
11
- from sklearn.feature_extraction.text import TfidfVectorizer
12
- from sklearn.metrics.pairwise import cosine_similarity
13
-
14
- def extract_text_from_pdf(uploaded_file_bytes):
15
- doc = fitz.open(stream=uploaded_file_bytes, filetype="pdf")
16
- text = ""
17
- for page in doc:
18
- page_text = page.get_text()
19
- if page_text.strip():
20
- text += page_text + "\n"
21
- return text.strip()
22
-
23
- def extract_los(file_bytes, filename=""):
24
- ext = filename.lower().split('.')[-1]
25
- if ext == "txt":
26
- return file_bytes.decode("utf-8").splitlines()
27
- elif ext == "docx":
28
- file_stream = io.BytesIO(file_bytes)
29
- doc = Document(file_stream)
30
- return [p.text.strip() for p in doc.paragraphs if p.text.strip()]
31
- return []
32
-
33
- def quality_check(new_text):
34
- words = new_text.split()
35
- if len(words) > 400:
36
- return "🟒 New content appears more detailed and informative."
37
- elif len(words) > 200:
38
- return "🟑 New content is moderately improved."
39
- else:
40
- return "πŸ”΄ New content may need more detail."
41
-
42
- def find_relevant_los(content, los):
43
- if not los:
44
- return [], 0, [], []
45
- vectorizer = TfidfVectorizer().fit_transform([content] + los)
46
- similarities = cosine_similarity(vectorizer[0:1], vectorizer[1:]).flatten()
47
- matched = []
48
- scores_old = [round(np.random.uniform(1, 3), 1) for _ in los]
49
- scores_new = []
50
-
51
- for i, score in enumerate(similarities):
52
- if score > 0.2:
53
- matched.append(f"βœ“ {los[i]} (Match: {score:.2f})")
54
- scores_new.append(round(score * 5, 1)) # normalize to 5
55
-
56
- return matched, len(matched), scores_old, scores_new
57
-
58
- def summarize_added_lines(old_text, new_text):
59
- old_lines = set(old_text.splitlines())
60
- new_lines = set(new_text.splitlines())
61
- added_lines = list(new_lines - old_lines)
62
- summary = []
63
- for line in added_lines:
64
- line_clean = re.sub(r'[^a-zA-Z0-9., ]', '', line).strip()
65
- if len(line_clean.split()) >= 5:
66
- summary.append("- " + line_clean)
67
- return summary, len(added_lines), len(new_lines)
68
-
69
- def create_bar_chart(los, scores_old, scores_new):
70
- index = np.arange(len(los))
71
- bar_width = 0.35
72
  fig, ax = plt.subplots(figsize=(10, 5))
73
- ax.bar(index, scores_old, bar_width, label="Old")
74
- ax.bar(index + bar_width, scores_new, bar_width, label="New")
75
- ax.set_xlabel("Learning Outcomes")
76
- ax.set_ylabel("Match Score (0-5)")
77
- ax.set_title("LO-wise Match Score: Old vs New")
78
- ax.set_xticks(index + bar_width / 2)
79
- ax.set_xticklabels(los, rotation=45, ha="right")
80
  ax.legend()
81
- ax.grid(True, linestyle="--", alpha=0.4)
82
- fig.tight_layout()
83
- return fig
84
-
85
- def compare_handouts(old_pdf_bytes, new_pdf_bytes, lo_file_bytes, lo_filename):
86
- old_text = extract_text_from_pdf(old_pdf_bytes)
87
- new_text = extract_text_from_pdf(new_pdf_bytes)
88
- los = extract_los(lo_file_bytes, lo_filename)
89
 
 
 
 
 
 
 
 
 
 
 
 
90
  if not old_text or not new_text:
91
- return "❗ Error in file(s)", "", "", None
 
 
 
 
 
92
 
93
- added_summary, added_lines, total_lines = summarize_added_lines(old_text, new_text)
94
- percent_change = (added_lines / max(total_lines, 1)) * 100
95
- lo_matches, matched_count, scores_old, scores_new = find_relevant_los(new_text, los)
96
- quality = quality_check(new_text)
97
 
98
- summary_output = "\n".join(added_summary)
99
- lo_output = "\n".join(lo_matches)
100
- stats = (
101
- f"πŸ“ˆ Content Change: {percent_change:.2f}%\n"
102
- f"🎯 Matched LOs: {matched_count} of {len(los)}\n"
103
- f"{quality}"
104
- )
 
 
 
 
 
 
 
 
105
 
106
- chart = create_bar_chart(los, scores_old, scores_new)
107
- return summary_output, lo_output, stats, chart
108
 
109
  iface = gr.Interface(
110
- fn=lambda old_pdf, new_pdf, lo_file: compare_handouts(old_pdf, new_pdf, lo_file, "learning_outcomes.docx"),
111
  inputs=[
112
- gr.File(label="πŸ“€ Old Handout PDF", type="binary"),
113
- gr.File(label="πŸ“₯ New Handout PDF", type="binary"),
114
- gr.File(label="πŸ“š Learning Outcomes (.docx or .txt)", type="binary")
115
  ],
116
  outputs=[
117
- gr.Textbox(label="πŸ†• New Content Summary", lines=10),
118
- gr.Textbox(label="🎯 LO Matches", lines=10),
119
- gr.Textbox(label="πŸ“Š Stats & Quality", lines=5),
120
- gr.Plot(label="πŸ“‰ LO Match Score Chart")
121
  ],
122
- title="πŸ“˜ Handout Comparator (Binary Safe)",
123
- description="Upload old/new handouts + LO file. Detects changes, LO match, and generates update chart."
124
  )
125
 
126
  iface.launch()
 
1
 
2
 
3
  import gradio as gr
4
+ from sentence_transformers import SentenceTransformer, util
5
  import fitz # PyMuPDF
6
+ import docx
 
 
 
7
  import matplotlib.pyplot as plt
8
+ import io
9
+ import base64
10
+
11
# Sentence-embedding model used for semantic LO matching; loaded once at import
# time (the first run downloads the weights from the Hugging Face hub).
model = SentenceTransformer("all-MiniLM-L6-v2")
12
+
13
def extract_text_from_pdf(pdf_file):
    """Extract the plain text of every page of an uploaded PDF.

    Args:
        pdf_file: A file-like object positioned at the start of a PDF
            (e.g. the object Gradio passes for a ``gr.File`` input).

    Returns:
        The concatenated text of all pages (no separator between pages,
        matching ``page.get_text()`` output), or ``""`` when the file
        cannot be opened or parsed. Errors are deliberately swallowed so
        the caller can report a friendly message instead of crashing.
    """
    try:
        doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
        try:
            # "".join avoids quadratic += concatenation on large PDFs.
            return "".join(page.get_text() for page in doc)
        finally:
            doc.close()  # release the underlying MuPDF resources
    except Exception:
        # Best-effort by design: any open/parse failure yields "".
        return ""
22
+
23
def extract_text_from_docx(file_obj):
    """Extract non-empty paragraph text from a ``.docx`` file.

    Args:
        file_obj: A path or file-like object accepted by ``docx.Document``.

    Returns:
        The paragraphs joined with newlines, blank paragraphs dropped,
        or ``""`` on any failure (best-effort, like the PDF extractor).
    """
    try:
        doc = docx.Document(file_obj)
        # Generator feeds join directly; skip whitespace-only paragraphs.
        return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
    except Exception:
        # Swallow parse errors deliberately; the caller reports the
        # empty result to the user.
        return ""
29
+
30
def match_learning_outcomes(lo_list, text):
    """Score how well each learning outcome is covered by *text*.

    Every newline-separated paragraph of *text* longer than 20 characters
    is embedded with the module-level sentence-transformer; an outcome's
    score is its best cosine similarity against any paragraph.

    Args:
        lo_list: Non-empty list of learning-outcome strings.
        text: Document text with paragraphs separated by newlines.

    Returns:
        A list of floats, one per LO in the same order — the maximum
        cosine similarity (typically in [0, 1] for natural text).
    """
    text_blocks = [para for para in text.split("\n") if len(para.strip()) > 20]
    if not text_blocks:
        # Nothing substantial to match against — avoid encoding an empty
        # batch (which would raise) and report zero coverage instead.
        return [0.0] * len(lo_list)

    text_embs = model.encode(text_blocks, convert_to_tensor=True)
    lo_embs = model.encode(lo_list, convert_to_tensor=True)

    # Best paragraph-level similarity for each learning outcome.
    return [float(util.cos_sim(lo_embs[i], text_embs).max())
            for i in range(len(lo_list))]
41
+
42
def generate_similarity_chart(lo_list, old_scores, new_scores):
    """Render a grouped bar chart of old vs. new LO match scores.

    Args:
        lo_list: Learning-outcome strings (used only for the bar count).
        old_scores: Per-LO scores for the old handout.
        new_scores: Per-LO scores for the new handout.

    Returns:
        The chart as a ``data:image/png;base64,...`` URI string.
    """
    fig, ax = plt.subplots(figsize=(10, 5))

    positions = range(len(lo_list))
    shifted = [pos + 0.4 for pos in positions]
    ax.bar(positions, old_scores, width=0.4, label="Old", align="center")
    ax.bar(shifted, new_scores, width=0.4, label="New", align="center")

    # Center the tick between the paired bars; generic LO1..LOn labels.
    ax.set_xticks([pos + 0.2 for pos in positions])
    ax.set_xticklabels([f"LO{pos+1}" for pos in positions], rotation=45)
    ax.set_ylabel("Match Score (0-1)")
    ax.set_title("LO-wise Semantic Match Score: Old vs New")
    ax.legend()

    # Serialize the figure to an in-memory PNG and close it to free memory.
    buf = io.BytesIO()
    plt.tight_layout()
    plt.savefig(buf, format="png")
    buf.seek(0)
    encoded = base64.b64encode(buf.read()).decode("utf-8")
    plt.close(fig)
    return f"data:image/png;base64,{encoded}"
60
+
61
def compare_handouts(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs against a set of learning outcomes.

    Args:
        old_pdf: Uploaded old-version PDF (Gradio file object).
        new_pdf: Uploaded new-version PDF (Gradio file object).
        lo_file: Uploaded ``.docx`` with one learning outcome per line.

    Returns:
        ``(summary_markdown, chart)`` — chart is ``None`` when either PDF
        or the LO file cannot be processed.
    """
    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)
    if not (old_text and new_text):
        return "❌ Could not extract text from one or both PDFs.", None

    lo_lines = extract_text_from_docx(lo_file).split("\n")
    lo_list = [lo.strip() for lo in lo_lines if lo.strip()]
    if not lo_list:
        return "❌ No learning outcomes detected.", None

    # Semantic LO coverage for each version.
    old_scores = match_learning_outcomes(lo_list, old_text)
    new_scores = match_learning_outcomes(lo_list, new_text)

    # An LO counts as "matched" above a 0.6 cosine-similarity threshold.
    matched_old = sum(1 for score in old_scores if score > 0.6)
    matched_new = sum(1 for score in new_scores if score > 0.6)

    # Change % is based on character count (guard against division by zero).
    change_percent = abs(len(new_text) - len(old_text)) / max(len(old_text), 1) * 100

    verdict = (
        "🟒 New handout covers learning outcomes better.\n"
        if matched_new > matched_old
        else "πŸ”΄ New handout covers fewer outcomes.\n"
        if matched_new < matched_old
        else "🟑 No major change in LO coverage.\n"
    )
    summary = (
        f"πŸ“ˆ **Content Change:** {change_percent:.2f}%\n"
        f"🎯 **LOs Matched (Old vs New):** {matched_old} vs {matched_new} of {len(lo_list)}\n"
        + verdict
    )

    chart = generate_similarity_chart(lo_list, old_scores, new_scores)
    return summary, chart
94
 
95
# Gradio UI wiring: three file uploads in, a markdown summary and a chart out.
iface = gr.Interface(
    fn=compare_handouts,
    inputs=[
        gr.File(label="Old Version PDF"),
        gr.File(label="New Version PDF"),
        # Restrict the picker to .docx, the only format extract_text_from_docx reads.
        gr.File(label="Learning Outcomes (.docx)", file_types=[".docx"]),
    ],
    outputs=[
        gr.Markdown(label="Summary"),
        # Receives the base64 PNG data URI produced by generate_similarity_chart.
        gr.Image(label="LO Match Chart")
    ],
    title="πŸ“˜ Educational Content Comparator",
    description="Compare two handouts and evaluate changes + Learning Outcome coverage using semantic similarity.",
)

iface.launch()