Deevyankar committed on
Commit
4bbb77a
·
verified ·
1 Parent(s): ac4e81e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -50
app.py CHANGED
@@ -1,91 +1,133 @@
1
 
 
2
  import gradio as gr
 
3
  import docx
4
  import io
5
- import pdfplumber
6
  from sklearn.feature_extraction.text import TfidfVectorizer
7
- from sklearn.metrics.pairwise import cosine_similarity
8
  import matplotlib.pyplot as plt
9
  import numpy as np
 
10
 
11
def extract_text_from_pdf(file):
    """Return the text of every page of an uploaded PDF.

    Args:
        file: Upload object exposing the PDF's path as ``file.name``,
            or ``None`` when nothing was uploaded.

    Returns:
        The page texts concatenated with no separator, or ``""`` when no
        file was given or the PDF cannot be opened or parsed.
    """
    if file is None:
        # No upload: answer directly instead of relying on the blanket
        # exception handler to absorb the attribute error.
        return ""
    try:
        with pdfplumber.open(file.name) as pdf:
            # extract_text() may yield None for a page; treat that as "".
            return "".join(page.extract_text() or "" for page in pdf.pages)
    except Exception:
        # Best-effort by design: the caller treats "" as "could not extract".
        return ""
20
 
21
def extract_text_from_docx(file):
    """Return the text of a .docx upload, one paragraph per line.

    Args:
        file: Upload object exposing the document's path as ``file.name``,
            or ``None`` when nothing was uploaded.

    Returns:
        All paragraph texts joined with ``"\\n"``, or ``""`` when no file
        was given or the document cannot be read.
    """
    if file is None:
        return ""
    try:
        doc = docx.Document(file.name)
        return "\n".join(para.text for para in doc.paragraphs)
    except Exception:
        # Best-effort by design: an unreadable document yields no text.
        return ""
28
 
29
def calculate_similarity(text1, text2):
    """Return the TF-IDF cosine similarity of two texts as a percentage.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        A float in the range 0-100. ``0.0`` is returned when either text
        is blank or when no vocabulary can be built from the pair.
    """
    if not text1.strip() or not text2.strip():
        # A blank document shares no terms with anything.
        return 0.0
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    except ValueError:
        # Raised when the vectorizer ends up with an empty vocabulary
        # (e.g. the texts contain only tokens it discards).
        return 0.0
    similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
    return float(similarity[0][0]) * 100  # return as percentage
33
 
34
def semantic_match(lo_texts, content):
    """Score how well *content* covers each learning outcome (TF-IDF).

    Args:
        lo_texts: List of learning-outcome strings.
        content: Handout text the outcomes are compared against.

    Returns:
        One cosine-similarity float per entry of ``lo_texts``, in the same
        order. An empty list yields ``[]``; if no vocabulary can be built
        every score is ``0.0``.
    """
    if not lo_texts:
        return []
    try:
        # Row 0 is the handout; rows 1..n are the learning outcomes.
        tfidf_matrix = TfidfVectorizer().fit_transform([content] + list(lo_texts))
    except ValueError:
        # Empty vocabulary: nothing can match.
        return [0.0] * len(lo_texts)
    # One vectorised call replaces a cosine_similarity call per outcome.
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
    return [float(score) for score in similarities[0]]
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  def compare_handouts(old_pdf, new_pdf, lo_file):
44
  old_text = extract_text_from_pdf(old_pdf)
45
  new_text = extract_text_from_pdf(new_pdf)
46
- lo_text = extract_text_from_docx(lo_file)
47
 
48
- if not old_text or not new_text:
49
- return "❌ Could not extract text from one or both PDFs.", None
50
 
51
- similarity = calculate_similarity(old_text, new_text)
 
 
52
 
53
- lo_list = [line.strip() for line in lo_text.split("\n") if line.strip()]
54
  old_scores = semantic_match(lo_list, old_text)
55
  new_scores = semantic_match(lo_list, new_text)
56
 
57
- matched_outcomes = sum(n >= o for o, n in zip(old_scores, new_scores))
 
 
 
 
 
 
 
 
 
 
58
 
59
- # Generate a bar plot for each LO
60
- labels = [f"LO{i+1}" for i in range(len(lo_list))]
61
- x = np.arange(len(labels))
 
 
 
62
  width = 0.35
63
  fig, ax = plt.subplots()
64
  ax.bar(x - width/2, old_scores, width, label='Old')
65
  ax.bar(x + width/2, new_scores, width, label='New')
66
- ax.set_ylabel('Semantic Match Score')
67
- ax.set_title('Learning Outcome Coverage')
68
  ax.set_xticks(x)
69
- ax.set_xticklabels(labels)
70
  ax.legend()
71
  plt.tight_layout()
72
 
73
- summary = f"πŸ“ˆ Content Change Estimate: {similarity:.2f}%\n"
74
- summary += f"🎯 Matched LOs: {matched_outcomes} of {len(lo_list)}\n"
75
- summary += "🟒 Summary: New handout has improved structure and added clarity." if similarity > 50 else "⚠️ Summary: Minimal updates or low improvement detected."
76
-
77
  return summary, fig
78
 
79
  with gr.Blocks() as demo:
80
- gr.Markdown("# πŸ“š Handout Comparator with LO Alignment")
 
 
 
 
 
 
 
81
  with gr.Row():
82
- old_pdf = gr.File(label="πŸ“„ Upload OLD Handout (PDF)")
83
- new_pdf = gr.File(label="πŸ†• Upload NEW Handout (PDF)")
84
- lo_file = gr.File(label="πŸ“˜ Upload Learning Outcomes (DOCX)")
85
- compare_btn = gr.Button("πŸ” Analyze and Compare")
86
- output_text = gr.Textbox(label="πŸ“Š Summary")
87
- output_plot = gr.Plot(label="πŸ“ˆ Learning Outcome Comparison")
88
-
89
- compare_btn.click(fn=compare_handouts, inputs=[old_pdf, new_pdf, lo_file], outputs=[output_text, output_plot])
90
 
91
  demo.launch()
 
1
 
2
+
3
  import gradio as gr
4
+ import fitz # PyMuPDF
5
  import docx
6
  import io
7
+ import re
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
+ from sentence_transformers import SentenceTransformer, util
10
  import matplotlib.pyplot as plt
11
  import numpy as np
12
+ from difflib import SequenceMatcher
13
 
14
+ model = SentenceTransformer('all-MiniLM-L6-v2')
 
 
 
 
 
 
 
 
15
 
16
def extract_text_from_pdf(pdf_file):
    """Return the plain text of a PDF supplied as raw bytes.

    Args:
        pdf_file: The PDF content as a bytes-like object, or ``None`` /
            empty when nothing was uploaded.

    Returns:
        The text of all pages concatenated, with leading and trailing
        whitespace stripped. ``""`` is returned when there is no input or
        the document cannot be opened or parsed.
    """
    if not pdf_file:
        # Nothing uploaded: skip the parser entirely.
        return ""
    try:
        # The context manager closes the document even if a page fails to
        # extract, which the manual close() call did not guarantee.
        with fitz.open(stream=pdf_file, filetype="pdf") as pdf_reader:
            return "".join(page.get_text() for page in pdf_reader).strip()
    except Exception:
        # Best-effort by design: the caller treats "" as "could not extract".
        return ""
26
 
27
def normalize_text(text):
    """Lower-case *text* and collapse each run of whitespace to one space.

    Args:
        text: The string to normalise.

    Returns:
        The text with surrounding whitespace removed, converted to lower
        case, and every internal whitespace run replaced by a single space.
    """
    return re.sub(r"\s+", " ", text.strip().lower())
 
 
29
 
30
def extract_text_from_docx(docx_file):
    """Return the non-empty paragraphs of a .docx supplied as raw bytes.

    Args:
        docx_file: The document content as bytes, or ``None`` / empty when
            nothing was uploaded.

    Returns:
        A list with the stripped text of every paragraph that is not
        blank, in document order. ``[]`` is returned when there is no
        input or the document cannot be read.
    """
    if not docx_file:
        return []
    try:
        doc = docx.Document(io.BytesIO(docx_file))
        stripped = (para.text.strip() for para in doc.paragraphs)
        return [line for line in stripped if line]
    except Exception:
        # Best-effort by design; `Exception` (not a bare except) so that
        # KeyboardInterrupt / SystemExit still propagate.
        return []
40
+
41
def semantic_match(lo_list, content):
    """Score how closely *content* matches each learning outcome.

    Each outcome and the content are encoded with the module-level
    ``model`` and compared by cosine similarity.

    Args:
        lo_list: List of learning-outcome strings.
        content: Handout text to compare every outcome against.

    Returns:
        One float per outcome, rounded to two decimals, in the order of
        ``lo_list``. An outcome whose scoring fails gets ``0.0``; if the
        content itself cannot be encoded every outcome gets ``0.0``.
    """
    if not lo_list:
        return []
    try:
        # Loop-invariant: encode the (potentially long) handout once
        # instead of once per learning outcome.
        content_embed = model.encode(content, convert_to_tensor=True)
    except Exception:
        return [0.0] * len(lo_list)

    scores = []
    for lo in lo_list:
        try:
            lo_embed = model.encode(lo, convert_to_tensor=True)
            sim = util.pytorch_cos_sim(lo_embed, content_embed).item()
            scores.append(round(sim, 2))
        except Exception:
            # Keep the result aligned with lo_list even when one outcome fails.
            scores.append(0.0)
    return scores
52
 
53
def content_change_score(text1, text2):
    """Estimate how much two texts differ, as a percentage.

    Both texts are normalised with ``normalize_text`` and compared as
    sequences of words.

    Args:
        text1: The original text.
        text2: The revised text.

    Returns:
        ``(1 - similarity ratio) * 100`` rounded to two decimals, so 0.0
        means identical and 100.0 means nothing in common. ``100.0`` is
        also returned if the comparison fails (e.g. a non-string input).
    """
    try:
        old_words = normalize_text(text1).split()
        new_words = normalize_text(text2).split()
        # Compare words rather than characters, with autojunk disabled.
        # On sequences of 200+ items difflib's default heuristic treats
        # frequent items as junk; for character data that is almost every
        # letter, which makes the ratio unreliable. Word tokens also keep
        # the exact (autojunk=False) comparison affordable on long texts.
        matcher = SequenceMatcher(None, old_words, new_words, autojunk=False)
        return round((1 - matcher.ratio()) * 100, 2)
    except Exception:
        # Best-effort by design: treat an incomparable pair as fully changed.
        return 100.0
59
+
60
def structure_score(text):
    """Score simple structural cues in *text* on a 0-10 scale.

    Args:
        text: The document text to inspect.

    Returns:
        The sum of two independent 5-point checks:
        5 if the phrase "table of contents" occurs (case-insensitive), and
        5 if more than ten line breaks are followed (after optional
        whitespace) by a bullet marker: ``-``, ``•`` or ``*``.
    """
    toc_score = 5 if "table of contents" in text.lower() else 0
    # \u2022 is the bullet character; it is written as an escape so the
    # pattern cannot be corrupted by a wrong file encoding again.
    bullet_lines = re.findall(r"\n\s*[-\u2022*]", text)
    bullet_score = 5 if len(bullet_lines) > 10 else 0
    return toc_score + bullet_score
64
+
65
def calculate_weighted_score(content_diff, improved_los, total_los, struct_score=10):
    """Combine the three metrics into one weighted percentage.

    Weights: 60% learning-outcome improvement, 30% content change,
    10% structure.

    Args:
        content_diff: Content change as a percentage (0-100).
        improved_los: Number of learning outcomes that improved.
        total_los: Total number of learning outcomes; 0 makes the
            learning-outcome component 0.
        struct_score: Structure score on a 0-10 scale.

    Returns:
        The weighted score rounded to two decimals.
    """
    lo_component = (improved_los / total_los) * 100 if total_los > 0 else 0
    # struct_score is out of 10 while the other components are out of 100;
    # rescale it so its 10% weight is worth 10 points rather than 1.
    struct_component = struct_score * 10
    return round(
        0.6 * lo_component + 0.3 * content_diff + 0.1 * struct_component, 2
    )
68
+
69
  def compare_handouts(old_pdf, new_pdf, lo_file):
70
  old_text = extract_text_from_pdf(old_pdf)
71
  new_text = extract_text_from_pdf(new_pdf)
 
72
 
73
+ if len(old_text.strip()) < 200 or len(new_text.strip()) < 200:
74
+ return "⚠️ Could not extract meaningful content from one or both PDFs.", None
75
 
76
+ lo_list = extract_text_from_docx(lo_file)
77
+ if not lo_list:
78
+ return "⚠️ No learning outcomes detected.", None
79
 
 
80
  old_scores = semantic_match(lo_list, old_text)
81
  new_scores = semantic_match(lo_list, new_text)
82
 
83
+ content_diff = content_change_score(old_text, new_text)
84
+ improved_count = sum([n > o for n, o in zip(new_scores, old_scores)])
85
+ matched_los = sum([n >= o for n, o in zip(new_scores, old_scores)])
86
+ struct_score = structure_score(new_text)
87
+
88
+ weighted_score = calculate_weighted_score(content_diff, improved_count, len(lo_list), struct_score)
89
+
90
+ summary = f"🧠 Improved LOs: {improved_count} / {len(lo_list)}\n"
91
+ summary += f"πŸ“„ Content Change Estimate: {content_diff}%\n"
92
+ summary += f"πŸ—οΈ Structure Score: {struct_score}/10\n"
93
+ summary += f"πŸ”’ Final Weighted Score: {weighted_score}%\n"
94
 
95
+ if improved_count > 0:
96
+ summary += "\n🟒 Summary: New handout better aligns with LOs and has improved clarity."
97
+ else:
98
+ summary += "\n⚠️ Summary: No significant LO improvement detected."
99
+
100
+ x = np.arange(len(lo_list))
101
  width = 0.35
102
  fig, ax = plt.subplots()
103
  ax.bar(x - width/2, old_scores, width, label='Old')
104
  ax.bar(x + width/2, new_scores, width, label='New')
105
+ ax.set_ylabel('Match Score (0-1)')
106
+ ax.set_title('LO-wise Match Score: Old vs New')
107
  ax.set_xticks(x)
108
+ ax.set_xticklabels([f"LO{i+1}" for i in range(len(lo_list))], rotation=45)
109
  ax.legend()
110
  plt.tight_layout()
111
 
 
 
 
 
112
  return summary, fig
113
 
114
  with gr.Blocks() as demo:
115
+ gr.Markdown("πŸ“˜ **Educational Content Comparator - Weighted Analysis**")
116
+ gr.Markdown("Upload old & new handouts + Learning Outcomes (.docx) to evaluate change & alignment.")
117
+
118
+ with gr.Row():
119
+ old_pdf = gr.File(label="πŸ“‚ Upload Old PDF", file_types=[".pdf"], type="binary")
120
+ new_pdf = gr.File(label="πŸ“‚ Upload New PDF", file_types=[".pdf"], type="binary")
121
+ lo_file = gr.File(label="πŸ“‚ Upload Learning Outcomes (.docx)", file_types=[".docx"], type="binary")
122
+
123
  with gr.Row():
124
+ btn = gr.Button("Submit")
125
+ clear_btn = gr.Button("Clear")
126
+
127
+ output_text = gr.Textbox(label="πŸ“‹ Summary", lines=6)
128
+ output_plot = gr.Plot(label="πŸ“Š LO Match Chart")
129
+
130
+ btn.click(fn=compare_handouts, inputs=[old_pdf, new_pdf, lo_file], outputs=[output_text, output_plot])
131
+ clear_btn.click(fn=lambda: ("", None), inputs=[], outputs=[output_text, output_plot])
132
 
133
  demo.launch()