Deevyankar committed on
Commit
8e6dd39
·
verified ·
1 Parent(s): bcf0a3a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -19
app.py CHANGED
@@ -11,9 +11,8 @@ import numpy as np
11
  from sklearn.feature_extraction.text import TfidfVectorizer
12
  from sklearn.metrics.pairwise import cosine_similarity
13
 
14
- def extract_text_from_pdf(uploaded_file):
15
- file_bytes = uploaded_file.read()
16
- doc = fitz.open(stream=file_bytes, filetype="pdf")
17
  text = ""
18
  for page in doc:
19
  page_text = page.get_text()
@@ -21,18 +20,15 @@ def extract_text_from_pdf(uploaded_file):
21
  text += page_text + "\n"
22
  return text.strip()
23
 
24
- def extract_los(lo_file):
25
- file_bytes = lo_file.read()
26
- ext = lo_file.name.lower().split('.')[-1]
27
-
28
  if ext == "txt":
29
  return file_bytes.decode("utf-8").splitlines()
30
  elif ext == "docx":
31
  file_stream = io.BytesIO(file_bytes)
32
  doc = Document(file_stream)
33
  return [p.text.strip() for p in doc.paragraphs if p.text.strip()]
34
- else:
35
- return []
36
 
37
  def quality_check(new_text):
38
  words = new_text.split()
@@ -49,13 +45,13 @@ def find_relevant_los(content, los):
49
  vectorizer = TfidfVectorizer().fit_transform([content] + los)
50
  similarities = cosine_similarity(vectorizer[0:1], vectorizer[1:]).flatten()
51
  matched = []
52
- scores_old = [round(np.random.uniform(1, 3), 1) for _ in los] # simulate old matches
53
  scores_new = []
54
 
55
  for i, score in enumerate(similarities):
56
  if score > 0.2:
57
  matched.append(f"✓ {los[i]} (Match: {score:.2f})")
58
- scores_new.append(round(score * 5, 1)) # convert similarity to scale of 5
59
 
60
  return matched, len(matched), scores_old, scores_new
61
 
@@ -86,13 +82,13 @@ def create_bar_chart(los, scores_old, scores_new):
86
  fig.tight_layout()
87
  return fig
88
 
89
- def compare_handouts(old_pdf, new_pdf, lo_file):
90
- old_text = extract_text_from_pdf(old_pdf)
91
- new_text = extract_text_from_pdf(new_pdf)
92
- los = extract_los(lo_file)
93
 
94
  if not old_text or not new_text:
95
- return "❗ Error in file(s)", "", "", None, None
96
 
97
  added_summary, added_lines, total_lines = summarize_added_lines(old_text, new_text)
98
  percent_change = (added_lines / max(total_lines, 1)) * 100
@@ -111,7 +107,7 @@ def compare_handouts(old_pdf, new_pdf, lo_file):
111
  return summary_output, lo_output, stats, chart
112
 
113
  iface = gr.Interface(
114
- fn=compare_handouts,
115
  inputs=[
116
  gr.File(label="📤 Old Handout PDF", type="binary"),
117
  gr.File(label="📥 New Handout PDF", type="binary"),
@@ -123,8 +119,8 @@ iface = gr.Interface(
123
  gr.Textbox(label="📊 Stats & Quality", lines=5),
124
  gr.Plot(label="📉 LO Match Score Chart")
125
  ],
126
- title="📘 Handout Comparator: Summary, LO, and Chart",
127
- description="Uploads two handouts + learning outcomes file. Compares updated content, matches with LOs, and charts LO match scores."
128
  )
129
 
130
  iface.launch()
 
11
  from sklearn.feature_extraction.text import TfidfVectorizer
12
  from sklearn.metrics.pairwise import cosine_similarity
13
 
14
+ def extract_text_from_pdf(uploaded_file_bytes):
15
+ doc = fitz.open(stream=uploaded_file_bytes, filetype="pdf")
 
16
  text = ""
17
  for page in doc:
18
  page_text = page.get_text()
 
20
  text += page_text + "\n"
21
  return text.strip()
22
 
23
+ def extract_los(file_bytes, filename=""):
24
+ ext = filename.lower().split('.')[-1]
 
 
25
  if ext == "txt":
26
  return file_bytes.decode("utf-8").splitlines()
27
  elif ext == "docx":
28
  file_stream = io.BytesIO(file_bytes)
29
  doc = Document(file_stream)
30
  return [p.text.strip() for p in doc.paragraphs if p.text.strip()]
31
+ return []
 
32
 
33
  def quality_check(new_text):
34
  words = new_text.split()
 
45
  vectorizer = TfidfVectorizer().fit_transform([content] + los)
46
  similarities = cosine_similarity(vectorizer[0:1], vectorizer[1:]).flatten()
47
  matched = []
48
+ scores_old = [round(np.random.uniform(1, 3), 1) for _ in los]
49
  scores_new = []
50
 
51
  for i, score in enumerate(similarities):
52
  if score > 0.2:
53
  matched.append(f"✓ {los[i]} (Match: {score:.2f})")
54
+ scores_new.append(round(score * 5, 1)) # normalize to 5
55
 
56
  return matched, len(matched), scores_old, scores_new
57
 
 
82
  fig.tight_layout()
83
  return fig
84
 
85
+ def compare_handouts(old_pdf_bytes, new_pdf_bytes, lo_file_bytes, lo_filename):
86
+ old_text = extract_text_from_pdf(old_pdf_bytes)
87
+ new_text = extract_text_from_pdf(new_pdf_bytes)
88
+ los = extract_los(lo_file_bytes, lo_filename)
89
 
90
  if not old_text or not new_text:
91
+ return "❗ Error in file(s)", "", "", None
92
 
93
  added_summary, added_lines, total_lines = summarize_added_lines(old_text, new_text)
94
  percent_change = (added_lines / max(total_lines, 1)) * 100
 
107
  return summary_output, lo_output, stats, chart
108
 
109
  iface = gr.Interface(
110
+ fn=lambda old_pdf, new_pdf, lo_file: compare_handouts(old_pdf, new_pdf, lo_file, lo_file.name),
111
  inputs=[
112
  gr.File(label="📤 Old Handout PDF", type="binary"),
113
  gr.File(label="📥 New Handout PDF", type="binary"),
 
119
  gr.Textbox(label="📊 Stats & Quality", lines=5),
120
  gr.Plot(label="📉 LO Match Score Chart")
121
  ],
122
+ title="📘 Handout Comparator (Binary Safe)",
123
+ description="Upload old/new handouts + LO file. Detects changes, LO match, and generates update chart."
124
  )
125
 
126
  iface.launch()