Deevyankar committed on
Commit
671e233
Β·
verified Β·
1 Parent(s): e0b1653

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -80
app.py CHANGED
@@ -1,4 +1,5 @@
1
 
 
2
  import gradio as gr
3
  from PyPDF2 import PdfReader
4
  from sklearn.feature_extraction.text import TfidfVectorizer
@@ -8,71 +9,70 @@ import matplotlib.pyplot as plt
8
  import pandas as pd
9
  import io
10
 
11
- # Load Sentence Transformer model
12
- model = SentenceTransformer('all-MiniLM-L6-v2')
13
 
14
- def extract_text_from_pdf(pdf_file):
15
  try:
16
- reader = PdfReader(io.BytesIO(pdf_file))
17
- full_text = ""
18
- for page in reader.pages:
19
- text = page.extract_text()
20
- if text:
21
- full_text += text
22
- return full_text
23
  except Exception as e:
 
24
  return ""
25
 
26
- def semantic_match(lo_list, content):
27
- lo_texts = [lo for lo in lo_list if lo.strip()]
28
- vectorizer = TfidfVectorizer().fit_transform([content] + lo_texts)
29
- vectors = vectorizer.toarray()
30
- content_vec = vectors[0]
31
- scores = [cosine_similarity([content_vec], [vec])[0][0] for vec in vectors[1:]]
32
- return scores
33
 
34
  def transformer_similarity(text1, text2):
35
- emb1 = model.encode(text1, convert_to_tensor=True)
36
- emb2 = model.encode(text2, convert_to_tensor=True)
37
- return float(util.cos_sim(emb1, emb2))
38
-
39
- def calculate_quality_index(new_text):
40
- levels = {
41
- "Remember": ["define", "list", "recall"],
42
- "Understand": ["explain", "summarize", "classify"],
43
- "Apply": ["demonstrate", "use", "implement"],
44
- "Analyze": ["differentiate", "organize", "compare"],
45
- "Evaluate": ["justify", "criticize", "conclude"],
46
- "Create": ["design", "construct", "develop"]
47
  }
48
- quality_score = 0
49
- for level, keywords in levels.items():
50
- for word in keywords:
51
- if word in new_text.lower():
52
- quality_score += list(levels).index(level) + 1
53
- break
54
- return round(quality_score / (len(levels) * 1.0) * 100, 2)
 
 
 
 
55
 
56
  def compare_all(old_pdf, new_pdf, lo_file):
57
  try:
58
- los = lo_file.decode("utf-8", errors="ignore").splitlines()
59
- los = [lo.strip() for lo in los if lo.strip()]
60
- except:
61
  return "❌ Could not read learning outcomes file.", None, None, None
62
 
63
  old_text = extract_text_from_pdf(old_pdf)
64
  new_text = extract_text_from_pdf(new_pdf)
65
 
66
- if not old_text.strip() or not new_text.strip():
67
  return "❌ Could not extract text from one or both PDFs.", None, None, None
68
 
69
- old_scores = semantic_match(los, old_text)
70
- new_scores = semantic_match(los, new_text)
 
 
 
 
71
 
72
- # Bar Plot
73
  labels = [f"LO{i+1}" for i in range(len(los))]
74
  x = range(len(labels))
75
- fig, ax = plt.subplots()
76
  ax.bar(x, old_scores, width=0.4, label="Old", align='center')
77
  ax.bar([i + 0.4 for i in x], new_scores, width=0.4, label="New", align='center')
78
  ax.set_xticks([i + 0.2 for i in x])
@@ -81,63 +81,45 @@ def compare_all(old_pdf, new_pdf, lo_file):
81
  ax.set_title("Learning Outcomes Comparison")
82
  ax.legend()
83
 
84
- # Table
85
  data = {
86
  "Learning Outcome": labels,
 
 
87
  "Old Match": [round(s*100, 2) for s in old_scores],
88
  "New Match": [round(s*100, 2) for s in new_scores],
89
- "Change (%)": [round((new - old)*100, 2) for new, old in zip(new_scores, old_scores)]
90
  }
91
  df = pd.DataFrame(data)
92
 
93
- # TF-IDF Cosine Similarity
94
- tfidf_sim = cosine_similarity(
95
- TfidfVectorizer().fit_transform([old_text, new_text])
96
- )[0][1]
97
- tfidf_percent = round(tfidf_sim * 100, 2)
98
-
99
- # Sentence Transformer Similarity
100
- try:
101
- trans_sim = round(transformer_similarity(old_text, new_text) * 100, 2)
102
- except:
103
- trans_sim = "N/A"
104
-
105
- # Text Length Change
106
- length_change = round(((len(new_text) - len(old_text)) / len(old_text)) * 100, 2)
107
-
108
- # Bloom's Taxonomy Quality Index
109
- quality_index = calculate_quality_index(new_text)
110
 
111
- # Summary Box
112
- summary = f"""
113
- πŸ“˜ **Summary of Comparison**
114
 
115
- πŸ“ˆ **TF-IDF Content Similarity**: {tfidf_percent}%
116
- 🧠 **Transformer-based Semantic Similarity**: {trans_sim}%
117
- ✍️ **Text Length Change**: {length_change}% ({'more' if length_change > 0 else 'less'} content)
118
- 🌟 **Quality Index (Bloom's Taxonomy)**: {quality_index}%
119
-
120
- 🎯 **Matched Learning Outcomes**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}
121
- πŸ“Š **Insight**: New content appears {'better' if sum(new_scores) > sum(old_scores) else 'less'} aligned with outcomes.
122
  """
123
 
124
- return summary, df, fig, quality_index
 
 
125
 
126
  iface = gr.Interface(
127
  fn=compare_all,
128
  inputs=[
129
  gr.File(label="Old Handout PDF", type='binary'),
130
  gr.File(label="New Handout PDF", type='binary'),
131
- gr.File(label="Learning Outcomes (TXT)", type='binary'),
132
  ],
133
  outputs=[
134
- gr.Markdown(label="πŸ“˜ Summary & Quality"),
135
  gr.Dataframe(label="πŸ“Š LO-wise Comparison Table"),
136
- gr.Plot(label="πŸ“‰ Visual LO Alignment"),
137
- gr.Number(label="🌟 Bloom’s Quality Index (%)")
138
  ],
139
- title="🧾 Handout Comparator + Bloom Level Analyzer",
140
- description="Compare two handouts and detect content change, alignment with outcomes, and cognitive improvement using AI & Bloom's Taxonomy."
141
  )
142
 
143
  iface.launch()
 
1
 
2
+
3
  import gradio as gr
4
  from PyPDF2 import PdfReader
5
  from sklearn.feature_extraction.text import TfidfVectorizer
 
9
  import pandas as pd
10
  import io
11
 
12
+ # Load transformer model for semantic similarity
13
+ model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
14
 
15
def extract_text_from_pdf(file_bytes):
    """Return the concatenated text of all pages of a PDF supplied as raw bytes.

    Page texts are joined with single spaces; a page whose extraction yields
    ``None`` contributes an empty string.  On any failure the error is printed
    and an empty string is returned, so callers can test the result's truthiness.
    """
    try:
        document = PdfReader(io.BytesIO(file_bytes))
        page_texts = []
        for page in document.pages:
            page_texts.append(page.extract_text() or "")
        return " ".join(page_texts).strip()
    except Exception as err:
        print("Error extracting text:", err)
        return ""
22
 
23
def tfidf_similarity(text1, text2):
    """Cosine similarity between two texts under a shared TF-IDF vectorization.

    Both texts are fit into one vocabulary so their vectors are comparable;
    returns a float in [0, 1].
    """
    tfidf_matrix = TfidfVectorizer().fit_transform([text1, text2])
    first_row = tfidf_matrix[0:1]
    second_row = tfidf_matrix[1:2]
    return cosine_similarity(first_row, second_row)[0][0]
 
 
 
 
26
 
27
def transformer_similarity(text1, text2):
    """Semantic cosine similarity of two texts via the module-level
    SentenceTransformer ``model``; returns a plain Python float."""
    # Encoding both texts in one call yields a (2, dim) tensor; unpack the rows.
    first_emb, second_emb = model.encode([text1, text2], convert_to_tensor=True)
    return util.pytorch_cos_sim(first_emb, second_emb).item()
30
+
31
def bloom_level(term):
    """Classify a learning-outcome statement into a Bloom's-taxonomy level.

    The statement is tokenized and each token is matched against the level
    keyword lists in ascending Bloom order; the first level with a hit wins.
    Matching is prefix-based per token ("compares" matches "compare") rather
    than a raw substring test on the whole statement — the previous substring
    test misclassified e.g. "causes" as containing "use" (Apply) and
    "listen" as containing "list" (Remember).

    Returns the capitalized level name, or "Unknown" when nothing matches.
    """
    # Strip trailing/leading punctuation so "define," still matches "define".
    tokens = {word.strip(".,;:!?()[]\"'") for word in term.lower().split()}
    blooms = {
        "remember": ["define", "list", "recall", "identify"],
        "understand": ["explain", "describe", "summarize"],
        "apply": ["apply", "demonstrate", "use"],
        "analyze": ["analyze", "compare", "contrast"],
        "evaluate": ["evaluate", "judge", "critique"],
        "create": ["create", "design", "formulate"]
    }
    for level, keywords in blooms.items():
        if any(token.startswith(keyword) for keyword in keywords for token in tokens):
            return level.capitalize()
    return "Unknown"
45
+
46
def lo_semantic_scores(los, content):
    """Score every learning outcome in *los* against *content*.

    Delegates each comparison to ``transformer_similarity`` and returns
    the resulting floats in the same order as the input outcomes.
    """
    return [transformer_similarity(outcome, content) for outcome in los]
52
 
53
  def compare_all(old_pdf, new_pdf, lo_file):
54
  try:
55
+ lo_content = lo_file.read().decode("utf-8", errors="ignore") if hasattr(lo_file, "read") else lo_file.decode("utf-8", errors="ignore")
56
+ los = [line.strip() for line in lo_content.splitlines() if line.strip()]
57
+ except Exception as e:
58
  return "❌ Could not read learning outcomes file.", None, None, None
59
 
60
  old_text = extract_text_from_pdf(old_pdf)
61
  new_text = extract_text_from_pdf(new_pdf)
62
 
63
+ if not old_text or not new_text:
64
  return "❌ Could not extract text from one or both PDFs.", None, None, None
65
 
66
+ tfidf_sim = tfidf_similarity(old_text, new_text)
67
+ transformer_sim = transformer_similarity(old_text, new_text)
68
+ text_growth = round(((len(new_text) - len(old_text)) / len(old_text)) * 100, 2)
69
+
70
+ old_scores = lo_semantic_scores(los, old_text)
71
+ new_scores = lo_semantic_scores(los, new_text)
72
 
 
73
  labels = [f"LO{i+1}" for i in range(len(los))]
74
  x = range(len(labels))
75
+ fig, ax = plt.subplots(figsize=(10, 5))
76
  ax.bar(x, old_scores, width=0.4, label="Old", align='center')
77
  ax.bar([i + 0.4 for i in x], new_scores, width=0.4, label="New", align='center')
78
  ax.set_xticks([i + 0.2 for i in x])
 
81
  ax.set_title("Learning Outcomes Comparison")
82
  ax.legend()
83
 
 
84
  data = {
85
  "Learning Outcome": labels,
86
+ "LO Text": los,
87
+ "Bloom Level": [bloom_level(lo) for lo in los],
88
  "Old Match": [round(s*100, 2) for s in old_scores],
89
  "New Match": [round(s*100, 2) for s in new_scores],
90
+ "Change (%)": [round((n - o)*100, 2) for n, o in zip(new_scores, old_scores)]
91
  }
92
  df = pd.DataFrame(data)
93
 
94
+ summary = f"""πŸ“˜ **Summary of Comparison**
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
+ πŸ“ˆ **TF-IDF Content Change**: {round((1 - tfidf_sim) * 100, 2)}%
97
+ 🧠 **Transformer-based Similarity**: {round(transformer_sim * 100, 2)}%
98
+ πŸ“ **Content Length Change**: {text_growth}% {"πŸ“‰ Reduced" if text_growth < 0 else "πŸ“ˆ Increased"}
99
 
100
+ 🎯 **LO Matches**: {sum(1 for score in new_scores if score > 0.5)} of {len(los)}
101
+ πŸ“Š **Content appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with learning outcomes.**
 
 
 
 
 
102
  """
103
 
104
+ return summary, df, fig, new_text[:2000] + "..."
105
+
106
# Gradio UI wiring: three binary file inputs feed compare_all, whose four
# return values map positionally onto the four outputs below.
# NOTE: the redundant mid-file `import gradio as gr` was removed — gradio is
# already imported at the top of this module.
iface = gr.Interface(
    fn=compare_all,
    inputs=[
        gr.File(label="Old Handout PDF", type='binary'),
        gr.File(label="New Handout PDF", type='binary'),
        gr.File(label="Learning Outcomes (TXT)", type='binary')
    ],
    outputs=[
        gr.Markdown(label="πŸ“‹ Summary"),
        gr.Dataframe(label="πŸ“Š LO-wise Comparison Table"),
        gr.Plot(label="πŸ“ˆ LO Match Chart"),
        gr.Textbox(label="πŸ“ Preview of New Content")
    ],
    title="πŸ“˜ AI Handout Comparator + LO Aligner",
    description="Compare two versions of handouts using both TF-IDF and Transformers. Analyze changes in content, alignment with Learning Outcomes, and Bloom’s taxonomy level."
)

iface.launch()