Deevyankar commited on
Commit
9af4462
·
verified ·
1 Parent(s): 3378b7b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -63
app.py CHANGED
@@ -1,20 +1,14 @@
1
 
2
-
3
  import gradio as gr
4
  from PyPDF2 import PdfReader
5
- from pdf2image import convert_from_bytes
6
- import pytesseract
7
- from PIL import Image
8
  import io
9
  from sklearn.feature_extraction.text import TfidfVectorizer
10
  from sklearn.metrics.pairwise import cosine_similarity
11
- from transformers import pipeline
12
  import matplotlib.pyplot as plt
13
  import pandas as pd
14
- from difflib import SequenceMatcher
 
15
 
16
- # Load transformer model for semantic similarity
17
- semantic_pipeline = pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2")
18
 
19
  def extract_text_from_pdf(pdf_file):
20
  try:
@@ -24,21 +18,12 @@ def extract_text_from_pdf(pdf_file):
24
  text = page.extract_text()
25
  if text:
26
  full_text += text
27
- if full_text.strip():
28
- return full_text
29
- except Exception as e:
30
- print("Text extraction failed:", e)
31
-
32
- try:
33
- images = convert_from_bytes(pdf_file)
34
- text = ""
35
- for img in images:
36
- text += pytesseract.image_to_string(img)
37
- return text
38
  except Exception as e:
39
- print("OCR failed:", e)
40
  return ""
41
 
 
42
  def semantic_match(lo_list, content):
43
  lo_texts = [lo for lo in lo_list if lo.strip()]
44
  vectorizer = TfidfVectorizer().fit_transform([content] + lo_texts)
@@ -47,43 +32,31 @@ def semantic_match(lo_list, content):
47
  scores = [cosine_similarity([content_vec], [vec])[0][0] for vec in vectors[1:]]
48
  return scores
49
 
50
- def compute_difference_and_text_change(old_text, new_text):
51
- similarity = SequenceMatcher(None, old_text, new_text).ratio()
52
- difference_percentage = round((1 - similarity) * 100, 2)
53
-
54
- len_old = len(old_text.split())
55
- len_new = len(new_text.split())
56
- length_change = round(((len_new - len_old) / len_old) * 100, 2)
57
- return difference_percentage, length_change
58
-
59
- def transformer_similarity(text1, text2):
60
- emb1 = semantic_pipeline(text1)[0][0]
61
- emb2 = semantic_pipeline(text2)[0][0]
62
- sim = cosine_similarity([emb1], [emb2])[0][0]
63
- return round(sim * 100, 2)
64
 
65
  def compare_all(old_pdf, new_pdf, lo_file):
66
  try:
67
  los = lo_file.decode("utf-8", errors="ignore").splitlines()
68
  los = [lo.strip() for lo in los if lo.strip()]
69
  except:
70
- return "❌ Could not read learning outcomes file.", None, None
71
 
72
  old_text = extract_text_from_pdf(old_pdf)
73
  new_text = extract_text_from_pdf(new_pdf)
74
 
75
- if not old_text.strip() or not new_text.strip():
76
- return "❌ Could not extract text from one or both PDFs.", None, None
77
 
78
  old_scores = semantic_match(los, old_text)
79
  new_scores = semantic_match(los, new_text)
80
 
81
  labels = [f"LO{i+1}" for i in range(len(los))]
82
- x = range(len(labels))
 
 
83
  fig, ax = plt.subplots()
84
- ax.bar(x, old_scores, width=0.4, label="Old", align='center')
85
- ax.bar([i + 0.4 for i in x], new_scores, width=0.4, label="New", align='center')
86
- ax.set_xticks([i + 0.2 for i in x])
87
  ax.set_xticklabels(labels, rotation=45)
88
  ax.set_ylabel("Semantic Match Score")
89
  ax.set_title("Learning Outcomes Comparison")
@@ -92,35 +65,37 @@ def compare_all(old_pdf, new_pdf, lo_file):
92
  # Table
93
  data = {
94
  "Learning Outcome": labels,
95
- "Old Match": [round(s*100, 2) for s in old_scores],
96
- "New Match": [round(s*100, 2) for s in new_scores],
97
- "Change (%)": [round((new - old)*100, 2) for new, old in zip(new_scores, old_scores)]
98
  }
99
  df = pd.DataFrame(data)
100
 
101
- # Calculate metrics
102
- tfidf_similarity = round(cosine_similarity(
103
- [TfidfVectorizer().fit_transform([old_text, new_text]).toarray()[0]],
104
- [TfidfVectorizer().fit_transform([old_text, new_text]).toarray()[1]]
105
- )[0][0] * 100, 2)
106
 
107
- diff_pct, length_delta = compute_difference_and_text_change(old_text, new_text)
108
- transformer_sim = transformer_similarity(old_text, new_text)
 
 
109
 
110
  summary = f"""
111
  📘 **Summary of Comparison**
112
 
113
- 📈 **TF-IDF Similarity**: {tfidf_similarity}%
114
- 🤖 **Transformer Similarity**: {transformer_sim}%
115
- 🔄 **Textual Change** (Diff-based): {diff_pct}%
116
- πŸ“ **Text Length Change**: {length_delta}% (words)
 
117
 
118
  🎯 **Learning Outcome Matches**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}
119
- 🧠 **Insight**: New content appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with the learning outcomes.
120
- 💬 **Tip**: Diff > 30% or word increase > 20% generally reflects real updates.
121
  """
122
 
123
- return summary.strip(), df, fig
 
124
 
125
  iface = gr.Interface(
126
  fn=compare_all,
@@ -130,12 +105,14 @@ iface = gr.Interface(
130
  gr.File(label="Learning Outcomes (Text File)", type='binary'),
131
  ],
132
  outputs=[
133
- gr.Textbox(label="📊 Summary Report", lines=12),
134
- gr.Dataframe(label="📋 LO-wise Comparison Table"),
135
- gr.Plot(label="📈 LO Match Chart")
 
136
  ],
137
- title="📘 Handout Comparator + LO Analyzer (with AI)",
138
- description="Compare two handouts and learning outcomes. View similarity via TF-IDF and Transformers. Bar chart and table included."
139
  )
140
 
141
  iface.launch()
 
 
1
 
 
2
  import gradio as gr
3
  from PyPDF2 import PdfReader
 
 
 
4
  import io
5
  from sklearn.feature_extraction.text import TfidfVectorizer
6
  from sklearn.metrics.pairwise import cosine_similarity
 
7
  import matplotlib.pyplot as plt
8
  import pandas as pd
9
+ import numpy as np
10
+ import re
11
 
 
 
12
 
13
  def extract_text_from_pdf(pdf_file):
14
  try:
 
18
  text = page.extract_text()
19
  if text:
20
  full_text += text
21
+ return full_text.strip()
 
 
 
 
 
 
 
 
 
 
22
  except Exception as e:
23
+ print("PDF extraction error:", e)
24
  return ""
25
 
26
+
27
  def semantic_match(lo_list, content):
28
  lo_texts = [lo for lo in lo_list if lo.strip()]
29
  vectorizer = TfidfVectorizer().fit_transform([content] + lo_texts)
 
32
  scores = [cosine_similarity([content_vec], [vec])[0][0] for vec in vectors[1:]]
33
  return scores
34
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  def compare_all(old_pdf, new_pdf, lo_file):
37
  try:
38
  los = lo_file.decode("utf-8", errors="ignore").splitlines()
39
  los = [lo.strip() for lo in los if lo.strip()]
40
  except:
41
+ return "❌ Could not read learning outcomes file.", None, None, None
42
 
43
  old_text = extract_text_from_pdf(old_pdf)
44
  new_text = extract_text_from_pdf(new_pdf)
45
 
46
+ if not old_text or not new_text:
47
+ return "❌ Could not extract text from one or both PDFs.", None, None, None
48
 
49
  old_scores = semantic_match(los, old_text)
50
  new_scores = semantic_match(los, new_text)
51
 
52
  labels = [f"LO{i+1}" for i in range(len(los))]
53
+ x = np.arange(len(labels))
54
+
55
+ # Plot
56
  fig, ax = plt.subplots()
57
+ ax.bar(x - 0.2, old_scores, width=0.4, label="Old", align='center')
58
+ ax.bar(x + 0.2, new_scores, width=0.4, label="New", align='center')
59
+ ax.set_xticks(x)
60
  ax.set_xticklabels(labels, rotation=45)
61
  ax.set_ylabel("Semantic Match Score")
62
  ax.set_title("Learning Outcomes Comparison")
 
65
  # Table
66
  data = {
67
  "Learning Outcome": labels,
68
+ "Old Match (%)": [round(s * 100, 2) for s in old_scores],
69
+ "New Match (%)": [round(s * 100, 2) for s in new_scores],
70
+ "Change (%)": [round((new - old) * 100, 2) for new, old in zip(new_scores, old_scores)]
71
  }
72
  df = pd.DataFrame(data)
73
 
74
+ # Content similarity
75
+ tfidf = TfidfVectorizer().fit_transform([old_text, new_text])
76
+ cosine_sim = cosine_similarity(tfidf[0:1], tfidf[1:2])[0][0] * 100
77
+ content_diff = 100 - round(cosine_sim, 2)
 
78
 
79
+ # Text size change
80
+ len_old = len(re.findall(r'\w+', old_text))
81
+ len_new = len(re.findall(r'\w+', new_text))
82
+ word_change_percent = round(((len_new - len_old) / len_old) * 100, 2)
83
 
84
  summary = f"""
85
  📘 **Summary of Comparison**
86
 
87
+ 📈 **Overall Content Change**: {content_diff:.2f}%
88
+ πŸ” This is based on TF-IDF cosine similarity between old and new handouts.
89
+
90
+ πŸ“ **Text Length Difference**: {'+' if word_change_percent >= 0 else ''}{word_change_percent:.2f}%
91
+ Compared by total number of words in both handouts.
92
 
93
  🎯 **Learning Outcome Matches**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}
94
+ ✅ New handout appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with stated outcomes.
 
95
  """
96
 
97
+ return summary.strip(), df, fig, "✅ Comparison completed successfully."
98
+
99
 
100
  iface = gr.Interface(
101
  fn=compare_all,
 
105
  gr.File(label="Learning Outcomes (Text File)", type='binary'),
106
  ],
107
  outputs=[
108
+ gr.Textbox(label="📘 Summary & Insights", lines=20, max_lines=25),
109
+ gr.Dataframe(label="📊 LO-wise Comparison Table"),
110
+ gr.Plot(label="📈 LO Visual Comparison"),
111
+ gr.Textbox(label="ℹ️ Status", lines=1)
112
  ],
113
+ title="📘 Handout Comparator + LO Analyzer",
114
+ description="Upload OLD and NEW handouts in PDF format along with a TXT file of Learning Outcomes. The app compares content changes and evaluates alignment with LOs visually and in table format."
115
  )
116
 
117
  iface.launch()
118
+