Deevyankar committed on
Commit
d0c3e02
·
verified ·
1 Parent(s): 2135a5e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +55 -42
app.py CHANGED
@@ -5,83 +5,96 @@ from sentence_transformers import SentenceTransformer, util
5
  from docx import Document
6
  import io
7
 
8
- # Load the sentence-transformer model
9
  model = SentenceTransformer('all-MiniLM-L6-v2')
10
 
11
- # Extract text from PDF using PyMuPDF
12
  def extract_text_from_pdf(pdf_file):
13
- doc = fitz.open(stream=pdf_file.read(), filetype="pdf")
14
- full_text = ""
15
- for page in doc:
16
- full_text += page.get_text()
17
- return full_text
 
 
 
 
18
 
19
- # Extract Learning Outcomes from .txt or .docx
20
  def extract_los(lo_file):
21
- if lo_file.name.endswith('.txt'):
22
- return lo_file.read().decode('utf-8').splitlines()
23
- elif lo_file.name.endswith('.docx'):
24
- file_bytes = io.BytesIO(lo_file.read())
25
- doc = Document(file_bytes)
26
- return [para.text.strip() for para in doc.paragraphs if para.text.strip()]
27
- else:
 
 
 
 
28
  return []
29
 
30
- # Main app logic
31
  def compare_and_assess(old_pdf, new_pdf, lo_file):
32
  if not old_pdf or not new_pdf or not lo_file:
33
  return "❌ Please upload all three files."
34
 
35
- # Extract text
36
  old_text = extract_text_from_pdf(old_pdf)
37
  new_text = extract_text_from_pdf(new_pdf)
38
 
39
- if len(old_text.strip()) < 50 or len(new_text.strip()) < 50:
40
  return "⚠️ One of the PDFs may be empty or unreadable."
41
 
42
- # Diff analysis
43
  old_lines = old_text.splitlines()
44
  new_lines = new_text.splitlines()
45
- diff = list(difflib.unified_diff(old_lines, new_lines))
46
 
47
- added = [line for line in diff if line.startswith('+') and not line.startswith('+++')]
48
- removed = [line for line in diff if line.startswith('-') and not line.startswith('---')]
 
49
  percent_change = (len(added) + len(removed)) / max(len(old_lines), 1) * 100
50
 
51
- # LO analysis
52
  los = extract_los(lo_file)
53
- if not los:
54
- return "⚠️ No valid Learning Outcomes found in the file."
55
-
56
- new_emb = model.encode(new_text, convert_to_tensor=True)
57
  lo_scores = []
58
- for lo in los:
59
- lo_emb = model.encode(lo, convert_to_tensor=True)
60
- sim = util.cos_sim(new_emb, lo_emb).max().item()
61
- lo_scores.append(f"β€’ {lo[:80]}: {sim*100:.1f}% relevant")
62
 
63
- # Output
64
- summary = f"πŸ“ˆ Content Updated: {percent_change:.2f}%\n"
65
- summary += f"πŸ”Ό Added Lines: {len(added)}\nπŸ”½ Removed Lines: {len(removed)}\n\n"
66
- summary += "🎯 Learning Outcome Coverage:\n" + "\n".join(lo_scores[:10])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
67
 
68
- # Debug logs (can be viewed in Hugging Face Logs tab)
69
- print("βœ… PDFs compared successfully.")
70
- print("LOs evaluated:", len(lo_scores))
71
 
72
- return summary
73
 
74
- # Gradio interface
75
  iface = gr.Interface(
76
  fn=compare_and_assess,
77
  inputs=[
78
  gr.File(label="Upload Old PDF", type="binary"),
79
  gr.File(label="Upload New PDF", type="binary"),
80
- gr.File(label="Upload Learning Outcomes (.txt or .docx)", type="binary")
81
  ],
82
  outputs="text",
83
  title="πŸ“š Course Handout Comparator + LO Evaluator",
84
- description="Compare two PDF handouts (old + new) and a Learning Outcome file. Calculates % updated and checks how well the new content aligns with your course outcomes."
85
  )
86
 
87
  iface.launch()
 
5
  from docx import Document
6
  import io
7
 
8
# Sentence-embedding model, loaded once at import time so every request
# handled by the app reuses the same instance.
model = SentenceTransformer("all-MiniLM-L6-v2")
10
 
11
# PDF text extraction
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page of an uploaded PDF.

    Args:
        pdf_file: The upload, either raw ``bytes``/``bytearray`` (what a
            Gradio ``File`` input declared with ``type="binary"`` hands to
            the callback) or a file-like object exposing ``read()``.

    Returns:
        The text of all pages joined together, or ``""`` when the upload
        cannot be read or parsed. The failure is printed with a
        ``[PDF ERROR]`` prefix so it shows up in the app logs.
    """
    try:
        # Raw bytes have no .read(); calling it unconditionally made every
        # binary upload fall into the except branch and look "unreadable".
        if isinstance(pdf_file, (bytes, bytearray)):
            data = bytes(pdf_file)
        else:
            data = pdf_file.read()

        doc = fitz.open(stream=data, filetype="pdf")
        try:
            return "".join(page.get_text() for page in doc)
        finally:
            # Release the parsed document even if a page fails to extract.
            doc.close()
    except Exception as e:
        # Best-effort boundary for untrusted uploads: report and degrade to
        # an empty string, which the caller treats as "unreadable".
        print(f"[PDF ERROR] {e}")
        return ""
22
 
23
# Extract LO from .txt or .docx
def extract_los(lo_file):
    """Return the learning outcomes listed in an uploaded .txt or .docx file.

    Args:
        lo_file: The upload, either raw ``bytes``/``bytearray`` (what a
            Gradio ``File`` input declared with ``type="binary"`` hands to
            the callback) or a file-like object exposing ``read()`` and,
            optionally, a ``name`` attribute carrying the filename.

    Returns:
        A list of non-empty, whitespace-stripped outcome strings: one per
        line of a text file, or one per paragraph of a Word document.
        Returns ``[]`` for an unsupported extension or when the file cannot
        be read; the failure is printed with an ``[LO ERROR]`` prefix.
    """
    try:
        if isinstance(lo_file, (bytes, bytearray)):
            # Raw bytes carry no filename, so the format is sniffed below.
            data = bytes(lo_file)
            name = ""
        else:
            data = lo_file.read()
            name = str(getattr(lo_file, "name", "") or "").lower()

        # A .docx file is a ZIP container, so the "PK" signature identifies
        # it when no filename is available to go by.
        looks_like_docx = name.endswith(".docx") or (
            not name and data[:4] == b"PK\x03\x04"
        )

        if looks_like_docx:
            doc = Document(io.BytesIO(data))
            lines = [para.text for para in doc.paragraphs]
        elif not name or name.endswith(".txt"):
            # utf-8-sig also accepts plain UTF-8 and drops a leading BOM
            # that would otherwise become part of the first outcome.
            lines = data.decode("utf-8-sig").splitlines()
        else:
            return []

        # Same clean-up for both formats: blank lines are not outcomes.
        return [line.strip() for line in lines if line.strip()]
    except Exception as e:
        print(f"[LO ERROR] {e}")
        return []
37
 
38
# Main function
def _diff_stats(old_lines, new_lines):
    """Return ``(added, removed, percent_changed)`` for two lists of lines."""
    matcher = difflib.SequenceMatcher(None, old_lines, new_lines)
    added = 0
    removed = 0
    # Counting from opcodes instead of filtering unified-diff text avoids
    # mistaking content lines that start with "++" / "--" for the
    # "+++" / "---" file headers.
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag in ("replace", "delete"):
            removed += i2 - i1
        if tag in ("replace", "insert"):
            added += j2 - j1

    # Share of all lines (old + new) that differ. Dividing by the old line
    # count alone let a full rewrite report roughly 200%.
    total = len(old_lines) + len(new_lines)
    percent = (added + removed) / total * 100 if total else 0.0
    return added, removed, percent


def _score_learning_outcomes(text, los, lines_per_chunk=8):
    """Return ``[(outcome, best_similarity)]`` sorted best match first."""
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    # Encode the document in small chunks rather than as one string: the
    # embedding model truncates long inputs, so a single encode() call would
    # only ever look at the opening of the handout.
    # NOTE(review): 8 lines per chunk is a heuristic, not a tuned value.
    chunks = [
        " ".join(lines[start:start + lines_per_chunk])
        for start in range(0, len(lines), lines_per_chunk)
    ]
    chunk_emb = model.encode(chunks, convert_to_tensor=True)
    lo_emb = model.encode(los, convert_to_tensor=True)
    # cos_sim gives a (chunks x outcomes) matrix; keep each outcome's best
    # matching chunk.
    best = util.cos_sim(chunk_emb, lo_emb).max(dim=0).values.tolist()
    return sorted(zip(los, best), key=lambda pair: pair[1], reverse=True)


def compare_and_assess(old_pdf, new_pdf, lo_file):
    """Compare two PDF handouts and rate the new one against learning outcomes.

    Args:
        old_pdf: Upload of the previous handout (PDF).
        new_pdf: Upload of the revised handout (PDF).
        lo_file: Upload listing the learning outcomes (.txt or .docx).

    Returns:
        A text report with the number of added and removed lines, the
        overall percentage of changed lines (0-100), and the ten outcomes
        that best match the new handout with their similarity scores.
        Returns a short warning message instead when an upload is missing
        or a PDF yields fewer than 20 characters of text.
    """
    if not old_pdf or not new_pdf or not lo_file:
        return "❌ Please upload all three files."

    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)

    if len(old_text.strip()) < 20 or len(new_text.strip()) < 20:
        return "⚠️ One of the PDFs may be empty or unreadable."

    # Compare content
    added, removed, percent_change = _diff_stats(
        old_text.splitlines(), new_text.splitlines()
    )

    # LO coverage; blank entries are dropped so they are never scored.
    los = [lo.strip() for lo in extract_los(lo_file) if lo.strip()]
    if los:
        lo_scores = _score_learning_outcomes(new_text, los)
        lo_summary = "\n".join(
            f"• {lo[:90]} — {score * 100:.1f}%" for lo, score in lo_scores[:10]
        )
    else:
        lo_scores = []
        lo_summary = "⚠️ No valid Learning Outcomes found."

    # Final output
    result = (
        "📊 **Comparison Summary**\n"
        f"- 🧾 Added lines: {added}\n"
        f"- 🗑️ Removed lines: {removed}\n"
        f"- 🔄 Overall update: {percent_change:.2f}%\n\n"
        f"📌 **Top Learning Outcome Coverage:**\n{lo_summary}"
    )

    # Debug logs
    print("✅ Comparison done.")
    print(f"LOs analyzed: {len(lo_scores)}")

    return result
86
 
87
# Gradio UI: three file uploads in, one plain-text report out.
iface = gr.Interface(
    fn=compare_and_assess,
    inputs=[
        # type="binary" is what determines the form in which each upload
        # reaches compare_and_assess.
        gr.File(label="Upload Old PDF", type="binary"),
        gr.File(label="Upload New PDF", type="binary"),
        gr.File(label="Upload Learning Outcomes (.txt or .docx)", type="binary"),
    ],
    outputs="text",
    title="📚 Course Handout Comparator + LO Evaluator",
    description="Compare two PDF handouts and check how well the new version matches your Learning Outcomes. Supports .txt and .docx LO files.",
)

iface.launch()