Deevyankar committed on
Commit
bc78525
·
verified ·
1 Parent(s): ef62d76

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -61
app.py CHANGED
@@ -1,16 +1,13 @@
1
 
2
  import gradio as gr
3
  from PyPDF2 import PdfReader
4
- from pdf2image import convert_from_bytes
5
- import pytesseract
6
- from PIL import Image
7
  import io
8
- from sklearn.feature_extraction.text import TfidfVectorizer
9
- from sklearn.metrics.pairwise import cosine_similarity
10
  import matplotlib.pyplot as plt
 
11
  from sentence_transformers import SentenceTransformer, util
12
 
13
- import pandas as pd
 
14
 
15
  def extract_text_from_pdf(pdf_file):
16
  try:
@@ -20,35 +17,22 @@ def extract_text_from_pdf(pdf_file):
20
  text = page.extract_text()
21
  if text:
22
  full_text += text
23
- if full_text.strip():
24
- return full_text
25
  except Exception as e:
26
  print("Text extraction failed:", e)
27
-
28
- try:
29
- images = convert_from_bytes(pdf_file)
30
- text = ""
31
- for img in images:
32
- text += pytesseract.image_to_string(img)
33
- return text
34
- except Exception as e:
35
- print("OCR failed:", e)
36
  return ""
37
 
38
- def semantic_match(lo_list, content):
39
- lo_texts = [lo for lo in lo_list if lo.strip()]
40
- vectorizer = TfidfVectorizer().fit_transform([content] + lo_texts)
41
- vectors = vectorizer.toarray()
42
- content_vec = vectors[0]
43
- scores = [cosine_similarity([content_vec], [vec])[0][0] for vec in vectors[1:]]
44
- return scores
45
-
46
def compute_change_percentage(old_text, new_text):
    """Return how much *new_text* differs from *old_text*, as a percentage.

    Computed as ``(1 - cosine_similarity) * 100`` over TF-IDF vectors,
    rounded to two decimals: 0.0 for identical content, approaching
    100.0 as the two texts diverge completely.
    """
    tfidf = TfidfVectorizer().fit_transform([old_text, new_text])
    similarity = cosine_similarity(tfidf[0], tfidf[1])[0][0]
    return round((1 - similarity) * 100, 2)
52
 
53
  def compare_all(old_pdf, new_pdf, lo_file):
54
  try:
@@ -63,8 +47,8 @@ def compare_all(old_pdf, new_pdf, lo_file):
63
  if not old_text.strip() or not new_text.strip():
64
  return "❌ Could not extract text from one or both PDFs.", None, None
65
 
66
- old_scores = semantic_match(los, old_text)
67
- new_scores = semantic_match(los, new_text)
68
 
69
  labels = [f"LO{i+1}" for i in range(len(los))]
70
  x = range(len(labels))
@@ -77,35 +61,22 @@ def compare_all(old_pdf, new_pdf, lo_file):
77
  ax.set_title("Learning Outcomes Comparison")
78
  ax.legend()
79
 
80
- data = {
81
  "Learning Outcome": labels,
82
  "Old Match": [round(s*100, 2) for s in old_scores],
83
  "New Match": [round(s*100, 2) for s in new_scores],
84
- "Change (%)": [round((new - old)*100, 2) for new, old in zip(new_scores, old_scores)]
85
- }
86
- df = pd.DataFrame(data)
87
 
88
- change_percentage = compute_change_percentage(old_text, new_text)
89
- matched_los = sum(1 for s in new_scores if s >= 0.5)
90
 
91
-
92
- # Overall content change using cosine similarity
93
- vectorizer = TfidfVectorizer()
94
- vectors = vectorizer.fit_transform([old_text, new_text])
95
- cos_sim = cosine_similarity(vectors[0], vectors[1])[0][0]
96
- change_percentage = round((1 - cos_sim) * 100, 2)
97
-
98
- summary = f"""πŸ“˜ **Summary of Comparison**
99
-
100
- πŸ“ˆ **Overall Content Change**: {change_percentage}%
101
- πŸ” This percentage shows how much the content has changed from old to new handouts using TF-IDF vector similarity (cosine distance).
102
-
103
- 🎯 **Learning Outcome Matches**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}
104
- 🧠 The system checks how well the new content aligns with each LO (threshold: 0.5 semantic match).
105
-
106
- 🟒 **Insight**: New content appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with the learning outcomes.
107
-
108
- πŸ’‘ **Note**: Content change > 40% usually indicates significant modifications (new examples, topics, etc.). Review the table and chart below for LO-specific changes."""
109
 
110
  return summary, df, fig
111
 
@@ -117,12 +88,13 @@ iface = gr.Interface(
117
  gr.File(label="Learning Outcomes (Text File)", type='binary'),
118
  ],
119
  outputs=[
120
- gr.Textbox(label="Summary & Insights", lines=15, max_lines=20),
121
  gr.Dataframe(label="LO-wise Comparison Table"),
122
  gr.Plot(label="Visual Comparison Chart")
123
  ],
124
- title="πŸ“˜ Handout Comparator + LO Analysis",
125
- description="Upload old/new handouts + Learning Outcomes file (TXT). See content diff, visual LO match, and semantic change."
126
  )
127
 
128
- iface.launch()
 
 
1
 
2
  import gradio as gr
3
  from PyPDF2 import PdfReader
 
 
 
4
  import io
 
 
5
  import matplotlib.pyplot as plt
6
+ import pandas as pd
7
  from sentence_transformers import SentenceTransformer, util
8
 
9
+ # Load sentence transformer model
10
+ model = SentenceTransformer("all-MiniLM-L6-v2")
11
 
12
  def extract_text_from_pdf(pdf_file):
13
  try:
 
17
  text = page.extract_text()
18
  if text:
19
  full_text += text
20
+ return full_text
 
21
  except Exception as e:
22
  print("Text extraction failed:", e)
 
 
 
 
 
 
 
 
 
23
  return ""
24
 
25
def semantic_similarity_score(text1, text2):
    """Cosine similarity of *text1* and *text2* under the module-level
    SentenceTransformer ``model``; returns a plain float."""
    vecs = model.encode([text1, text2], convert_to_tensor=True)
    return util.pytorch_cos_sim(vecs[0], vecs[1]).item()
29
+
30
def semantic_match_los(lo_list, content):
    """Return one semantic-similarity score (vs. *content*) per learning
    outcome in *lo_list*, preserving input order."""
    return [semantic_similarity_score(lo, content) for lo in lo_list]
 
 
 
36
 
37
  def compare_all(old_pdf, new_pdf, lo_file):
38
  try:
 
47
  if not old_text.strip() or not new_text.strip():
48
  return "❌ Could not extract text from one or both PDFs.", None, None
49
 
50
+ old_scores = semantic_match_los(los, old_text)
51
+ new_scores = semantic_match_los(los, new_text)
52
 
53
  labels = [f"LO{i+1}" for i in range(len(los))]
54
  x = range(len(labels))
 
61
  ax.set_title("Learning Outcomes Comparison")
62
  ax.legend()
63
 
64
+ df = pd.DataFrame({
65
  "Learning Outcome": labels,
66
  "Old Match": [round(s*100, 2) for s in old_scores],
67
  "New Match": [round(s*100, 2) for s in new_scores],
68
+ "Change (%)": [round((n - o)*100, 2) for o, n in zip(old_scores, new_scores)]
69
+ })
 
70
 
71
+ content_sim = semantic_similarity_score(old_text, new_text)
72
+ content_change_pct = round((1 - content_sim) * 100, 2)
73
 
74
+ summary = f"πŸ“˜ **Summary of Comparison**\n\n"
75
+ summary += f"πŸ“ˆ **Overall Content Change**: {content_change_pct}%\n"
76
+ summary += f"πŸ” This percentage shows how much the content has changed using a semantic model (SentenceTransformer).\n\n"
77
+ summary += f"🎯 **Learning Outcome Matches**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}\n"
78
+ summary += f"🧠 The system checks how well the new content aligns with each LO (threshold: 0.5 semantic match).\n\n"
79
+ summary += f"🟒 **Insight**: New content appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with the learning outcomes.\n"
 
 
 
 
 
 
 
 
 
 
 
 
80
 
81
  return summary, df, fig
82
 
 
88
  gr.File(label="Learning Outcomes (Text File)", type='binary'),
89
  ],
90
  outputs=[
91
+ gr.Textbox(label="Summary & Insights", lines=10, max_lines=30),
92
  gr.Dataframe(label="LO-wise Comparison Table"),
93
  gr.Plot(label="Visual Comparison Chart")
94
  ],
95
+ title="πŸ“˜ Handout Comparator + LO Analysis (Semantic Transformer)",
96
+ description="Upload old/new handouts + Learning Outcomes (TXT). Compares content and outcome match using a transformer model (MiniLM)."
97
  )
98
 
99
+ iface.launch()
100
+