Deevyankar committed on
Commit
ace889b
·
verified ·
1 Parent(s): 89e559a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -22
app.py CHANGED
@@ -7,34 +7,33 @@ from docx import Document
7
  import gradio as gr
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
  from sklearn.metrics.pairwise import cosine_similarity
 
10
 
11
- # --- Extract Text ---
12
- def extract_text_from_pdf(pdf_bytes):
13
  try:
14
- reader = PdfReader(pdf_bytes)
15
  text = ""
16
  for page in reader.pages:
17
  text += page.extract_text() or ""
18
  return text.strip()
19
- except:
20
  return ""
21
 
22
- def extract_text_from_docx(docx_file):
23
- doc = Document(docx_file)
24
- return "\n".join([para.text.strip() for para in doc.paragraphs if para.text.strip()])
 
 
 
25
 
26
- # --- Change Percentage ---
27
  def calculate_change_percentage(old_text, new_text):
28
  seqm = difflib.SequenceMatcher(None, old_text, new_text)
29
  return (1 - seqm.ratio()) * 100
30
 
31
- # --- Semantic Matching ---
32
  def semantic_match(lo_texts, content):
33
- vectorizer = TfidfVectorizer().fit_transform([content] + lo_texts)
34
- similarities = cosine_similarity(vectorizer[0:1], vectorizer[1:]).flatten()
35
- return similarities
36
 
37
- # --- Summary Generation ---
38
  def generate_summary(change_pct, matched_los, total_los):
39
  msg = f"📈 Content Change: {change_pct:.2f}%\n🎯 Matched LOs: {matched_los} of {total_los}\n"
40
  if change_pct > 20:
@@ -45,7 +44,6 @@ def generate_summary(change_pct, matched_los, total_los):
45
  msg += "🟡 Very little or no update."
46
  return msg
47
 
48
- # --- Bar Chart Plot ---
49
  def plot_lo_chart(lo_labels, old_scores, new_scores):
50
  df = pd.DataFrame({'Old': old_scores, 'New': new_scores}, index=lo_labels)
51
  ax = df.plot(kind='bar', figsize=(10, 5), title="LO-wise Match Score: Old vs New")
@@ -55,7 +53,6 @@ def plot_lo_chart(lo_labels, old_scores, new_scores):
55
  plt.tight_layout()
56
  return plt.gcf()
57
 
58
- # --- Main Comparator ---
59
  def compare_handouts(old_pdf, new_pdf, lo_docx):
60
  old_text = extract_text_from_pdf(old_pdf)
61
  new_text = extract_text_from_pdf(new_pdf)
@@ -63,11 +60,11 @@ def compare_handouts(old_pdf, new_pdf, lo_docx):
63
  if not old_text or not new_text:
64
  return "❌ Could not extract text from one or both PDFs.", None
65
 
66
- lo_text_raw = extract_text_from_docx(lo_docx)
67
- lo_list = [lo for lo in lo_text_raw.split('\n') if lo.strip()]
68
 
69
  if not lo_list:
70
- return "❌ No learning outcomes detected in uploaded file.", None
71
 
72
  old_scores = semantic_match(lo_list, old_text)
73
  new_scores = semantic_match(lo_list, new_text)
@@ -79,20 +76,19 @@ def compare_handouts(old_pdf, new_pdf, lo_docx):
79
  fig = plot_lo_chart([f"LO{i+1}" for i in range(len(lo_list))], old_scores, new_scores)
80
  return summary, fig
81
 
82
- # --- Gradio App ---
83
  demo = gr.Interface(
84
  fn=compare_handouts,
85
  inputs=[
86
  gr.File(label="Upload Old PDF", type="binary"),
87
  gr.File(label="Upload New PDF", type="binary"),
88
- gr.File(label="Upload Learning Outcomes (.docx)", type="binary"),
89
  ],
90
  outputs=[
91
  gr.Textbox(label="📋 Summary"),
91
  gr.Plot(label="📊 LO Match Chart")
93
  ],
94
  title="📘 Educational Content Comparator",
95
- description="Upload 2 handouts and LO file (.docx). Detect % update, alignment with learning outcomes, and get visual summary."
96
  )
97
 
98
- demo.launch(share=True)
 
7
  import gradio as gr
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
  from sklearn.metrics.pairwise import cosine_similarity
10
+ import io
11
 
12
+ def extract_text_from_pdf(pdf_binary):
 
13
  try:
14
+ reader = PdfReader(io.BytesIO(pdf_binary))
15
  text = ""
16
  for page in reader.pages:
17
  text += page.extract_text() or ""
18
  return text.strip()
19
+ except Exception as e:
20
  return ""
21
 
22
+ def extract_text_from_docx(docx_binary):
23
+ try:
24
+ doc = Document(io.BytesIO(docx_binary))
25
+ return "\n".join([p.text.strip() for p in doc.paragraphs if p.text.strip()])
26
+ except Exception as e:
27
+ return ""
28
 
 
29
  def calculate_change_percentage(old_text, new_text):
30
  seqm = difflib.SequenceMatcher(None, old_text, new_text)
31
  return (1 - seqm.ratio()) * 100
32
 
 
33
  def semantic_match(lo_texts, content):
34
+ vectorizer = TfidfVectorizer().fit_transform([content] + lo_texts)
35
+ return cosine_similarity(vectorizer[0:1], vectorizer[1:]).flatten()
 
36
 
 
37
  def generate_summary(change_pct, matched_los, total_los):
38
  msg = f"📈 Content Change: {change_pct:.2f}%\n🎯 Matched LOs: {matched_los} of {total_los}\n"
39
  if change_pct > 20:
 
44
  msg += "🟡 Very little or no update."
45
  return msg
46
 
 
47
  def plot_lo_chart(lo_labels, old_scores, new_scores):
48
  df = pd.DataFrame({'Old': old_scores, 'New': new_scores}, index=lo_labels)
49
  ax = df.plot(kind='bar', figsize=(10, 5), title="LO-wise Match Score: Old vs New")
 
53
  plt.tight_layout()
54
  return plt.gcf()
55
 
 
56
  def compare_handouts(old_pdf, new_pdf, lo_docx):
57
  old_text = extract_text_from_pdf(old_pdf)
58
  new_text = extract_text_from_pdf(new_pdf)
 
60
  if not old_text or not new_text:
61
  return "❌ Could not extract text from one or both PDFs.", None
62
 
63
+ lo_raw_text = extract_text_from_docx(lo_docx)
64
+ lo_list = [line for line in lo_raw_text.split("\n") if line.strip()]
65
 
66
  if not lo_list:
67
+ return "❌ No learning outcomes detected.", None
68
 
69
  old_scores = semantic_match(lo_list, old_text)
70
  new_scores = semantic_match(lo_list, new_text)
 
76
  fig = plot_lo_chart([f"LO{i+1}" for i in range(len(lo_list))], old_scores, new_scores)
77
  return summary, fig
78
 
 
79
  demo = gr.Interface(
80
  fn=compare_handouts,
81
  inputs=[
82
  gr.File(label="Upload Old PDF", type="binary"),
83
  gr.File(label="Upload New PDF", type="binary"),
84
+ gr.File(label="Upload Learning Outcomes (.docx)", type="binary")
85
  ],
86
  outputs=[
87
  gr.Textbox(label="📋 Summary"),
88
  gr.Plot(label="📊 LO Match Chart")
89
  ],
90
  title="📘 Educational Content Comparator",
91
+ description="Upload 2 handouts and a .docx file of Learning Outcomes to compare changes and alignment."
92
  )
93
 
94
+ demo.launch(share=True)