Deevyankar committed on
Commit
8751b01
·
verified ·
1 Parent(s): bc78525

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +64 -35
app.py CHANGED
@@ -1,13 +1,17 @@
1
 
2
  import gradio as gr
3
  from PyPDF2 import PdfReader
 
 
 
4
  import io
 
 
5
  import matplotlib.pyplot as plt
6
  import pandas as pd
7
- from sentence_transformers import SentenceTransformer, util
8
 
9
- # Load sentence transformer model
10
- model = SentenceTransformer("all-MiniLM-L6-v2")
11
 
12
  def extract_text_from_pdf(pdf_file):
13
  try:
@@ -17,22 +21,28 @@ def extract_text_from_pdf(pdf_file):
17
  text = page.extract_text()
18
  if text:
19
  full_text += text
20
- return full_text
 
21
  except Exception as e:
22
  print("Text extraction failed:", e)
23
- return ""
24
 
25
- def semantic_similarity_score(text1, text2):
26
- embeddings = model.encode([text1, text2], convert_to_tensor=True)
27
- score = util.pytorch_cos_sim(embeddings[0], embeddings[1]).item()
28
- return score
 
 
 
 
 
29
 
30
- def semantic_match_los(lo_list, content):
31
- lo_scores = []
32
- for lo in lo_list:
33
- score = semantic_similarity_score(lo, content)
34
- lo_scores.append(score)
35
- return lo_scores
 
36
 
37
  def compare_all(old_pdf, new_pdf, lo_file):
38
  try:
@@ -47,12 +57,22 @@ def compare_all(old_pdf, new_pdf, lo_file):
47
  if not old_text.strip() or not new_text.strip():
48
  return "❌ Could not extract text from one or both PDFs.", None, None
49
 
50
- old_scores = semantic_match_los(los, old_text)
51
- new_scores = semantic_match_los(los, new_text)
 
 
 
 
 
52
 
 
 
 
 
 
53
  labels = [f"LO{i+1}" for i in range(len(los))]
54
  x = range(len(labels))
55
- fig, ax = plt.subplots()
56
  ax.bar(x, old_scores, width=0.4, label="Old", align='center')
57
  ax.bar([i + 0.4 for i in x], new_scores, width=0.4, label="New", align='center')
58
  ax.set_xticks([i + 0.2 for i in x])
@@ -61,22 +81,32 @@ def compare_all(old_pdf, new_pdf, lo_file):
61
  ax.set_title("Learning Outcomes Comparison")
62
  ax.legend()
63
 
64
- df = pd.DataFrame({
 
65
  "Learning Outcome": labels,
66
  "Old Match": [round(s*100, 2) for s in old_scores],
67
  "New Match": [round(s*100, 2) for s in new_scores],
68
- "Change (%)": [round((n - o)*100, 2) for o, n in zip(old_scores, new_scores)]
69
- })
70
-
71
- content_sim = semantic_similarity_score(old_text, new_text)
72
- content_change_pct = round((1 - content_sim) * 100, 2)
73
-
74
- summary = f"πŸ“˜ **Summary of Comparison**\n\n"
75
- summary += f"πŸ“ˆ **Overall Content Change**: {content_change_pct}%\n"
76
- summary += f"πŸ” This percentage shows how much the content has changed using a semantic model (SentenceTransformer).\n\n"
77
- summary += f"🎯 **Learning Outcome Matches**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}\n"
78
- summary += f"🧠 The system checks how well the new content aligns with each LO (threshold: 0.5 semantic match).\n\n"
79
- summary += f"🟒 **Insight**: New content appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with the learning outcomes.\n"
 
 
 
 
 
 
 
 
 
80
 
81
  return summary, df, fig
82
 
@@ -88,13 +118,12 @@ iface = gr.Interface(
88
  gr.File(label="Learning Outcomes (Text File)", type='binary'),
89
  ],
90
  outputs=[
91
- gr.Textbox(label="Summary & Insights", lines=10, max_lines=30),
92
  gr.Dataframe(label="LO-wise Comparison Table"),
93
  gr.Plot(label="Visual Comparison Chart")
94
  ],
95
- title="πŸ“˜ Handout Comparator + LO Analysis (Semantic Transformer)",
96
- description="Upload old/new handouts + Learning Outcomes (TXT). Compares content and outcome match using a transformer model (MiniLM)."
97
  )
98
 
99
  iface.launch()
100
-
 
1
 
2
  import gradio as gr
3
  from PyPDF2 import PdfReader
4
+ from pdf2image import convert_from_bytes
5
+ import pytesseract
6
+ from PIL import Image
7
  import io
8
+ from sklearn.feature_extraction.text import TfidfVectorizer
9
+ from sklearn.metrics.pairwise import cosine_similarity
10
  import matplotlib.pyplot as plt
11
  import pandas as pd
12
+ from sentence_transformers import SentenceTransformer
13
 
14
+ model = SentenceTransformer('all-MiniLM-L6-v2')
 
15
 
16
  def extract_text_from_pdf(pdf_file):
17
  try:
 
21
  text = page.extract_text()
22
  if text:
23
  full_text += text
24
+ if full_text.strip():
25
+ return full_text
26
  except Exception as e:
27
  print("Text extraction failed:", e)
 
28
 
29
+ try:
30
+ images = convert_from_bytes(pdf_file)
31
+ text = ""
32
+ for img in images:
33
+ text += pytesseract.image_to_string(img)
34
+ return text
35
+ except Exception as e:
36
+ print("OCR failed:", e)
37
+ return ""
38
 
39
def semantic_match(lo_list, content):
    """Score each learning outcome against ``content`` with TF-IDF cosine similarity.

    Blank or whitespace-only entries of ``lo_list`` are dropped before scoring,
    so the returned list can be shorter than ``lo_list``; callers that label
    scores by position should pass an already-filtered list.

    The TF-IDF vocabulary and IDF weights are fitted on ``content`` together
    with the remaining learning outcomes, so a score depends on the whole set
    passed in, not only on the individual pair.

    Args:
        lo_list: Iterable of learning-outcome strings.
        content: Document text to compare every learning outcome against.

    Returns:
        A list of floats, one per non-blank learning outcome in input order,
        each the cosine similarity between that outcome and ``content``.
        Returns an empty list when no non-blank learning outcome is left.
    """
    lo_texts = [lo for lo in lo_list if lo.strip()]
    if not lo_texts:
        # Nothing to score; skip fitting the vectorizer on the content alone.
        return []

    # Row 0 is the content, rows 1..N are the learning outcomes.
    tfidf = TfidfVectorizer().fit_transform([content] + lo_texts)

    # cosine_similarity accepts sparse input, so compare the content row with
    # all outcome rows in one call instead of densifying the matrix and
    # issuing one call per outcome.
    similarities = cosine_similarity(tfidf[0], tfidf[1:])
    return similarities[0].tolist()
46
 
47
  def compare_all(old_pdf, new_pdf, lo_file):
48
  try:
 
57
  if not old_text.strip() or not new_text.strip():
58
  return "❌ Could not extract text from one or both PDFs.", None, None
59
 
60
+ # Similarity Calculations
61
+ tfidf_vectorizer = TfidfVectorizer().fit_transform([old_text, new_text])
62
+ tfidf_score = cosine_similarity([tfidf_vectorizer.toarray()[0]], [tfidf_vectorizer.toarray()[1]])[0][0] * 100
63
+
64
+ embed_old = model.encode(old_text, convert_to_tensor=True)
65
+ embed_new = model.encode(new_text, convert_to_tensor=True)
66
+ semantic_score = float(cosine_similarity([embed_old], [embed_new])[0][0]) * 100
67
 
68
+ # LO Scores
69
+ old_scores = semantic_match(los, old_text)
70
+ new_scores = semantic_match(los, new_text)
71
+
72
+ # Bar Plot
73
  labels = [f"LO{i+1}" for i in range(len(los))]
74
  x = range(len(labels))
75
+ fig, ax = plt.subplots(figsize=(10, 5))
76
  ax.bar(x, old_scores, width=0.4, label="Old", align='center')
77
  ax.bar([i + 0.4 for i in x], new_scores, width=0.4, label="New", align='center')
78
  ax.set_xticks([i + 0.2 for i in x])
 
81
  ax.set_title("Learning Outcomes Comparison")
82
  ax.legend()
83
 
84
+ # Table
85
+ data = {
86
  "Learning Outcome": labels,
87
  "Old Match": [round(s*100, 2) for s in old_scores],
88
  "New Match": [round(s*100, 2) for s in new_scores],
89
+ "Change (%)": [round((new - old)*100, 2) for new, old in zip(new_scores, old_scores)]
90
+ }
91
+ df = pd.DataFrame(data)
92
+
93
+ # Insight Generation
94
+ lo_diff = sum(new_scores) - sum(old_scores)
95
+ if abs(lo_diff) < 0.01:
96
+ insight = "βšͺ No significant change in alignment with learning outcomes."
97
+ elif lo_diff > 0:
98
+ insight = "🟒 New content appears more aligned with outcomes."
99
+ else:
100
+ insight = "πŸ”΄ New content appears less aligned with outcomes."
101
+
102
+ matched_lo = sum(1 for s in new_scores if s >= 0.5)
103
+ total_lo = len(los)
104
+
105
+ summary = f"πŸ“˜ Summary of Comparison\n\n"
106
+ summary += f"πŸ”Ή Semantic Similarity (Transformer): {round(semantic_score, 2)}%\n"
107
+ summary += f"πŸ”Ή Structural Similarity (TF-IDF): {round(tfidf_score, 2)}%\n\n"
108
+ summary += f"🎯 Learning Outcome Matches: {matched_lo} of {total_lo}\n"
109
+ summary += insight
110
 
111
  return summary, df, fig
112
 
 
118
  gr.File(label="Learning Outcomes (Text File)", type='binary'),
119
  ],
120
  outputs=[
121
+ gr.Textbox(label="Summary & Insights", lines=10),
122
  gr.Dataframe(label="LO-wise Comparison Table"),
123
  gr.Plot(label="Visual Comparison Chart")
124
  ],
125
+ title="πŸ“˜ Handout Comparator + LO Analysis (Dual Similarity)",
126
+ description="Upload old/new handouts + Learning Outcomes file (TXT). See content diff, LO alignment, and dual similarity scoring (TF-IDF + Transformers)."
127
  )
128
 
129
  iface.launch()