Deevyankar committed on
Commit
b0754aa
·
verified ·
1 Parent(s): fe0db2c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -75
app.py CHANGED
@@ -1,109 +1,102 @@
1
-
2
  import gradio as gr
3
  from PyPDF2 import PdfReader
4
- from sentence_transformers import SentenceTransformer, util
 
 
5
  import matplotlib.pyplot as plt
6
  import pandas as pd
7
  import numpy as np
 
8
 
9
# Sentence-embedding model, instantiated once at import time; the similarity
# helpers in this file call model.encode() on this shared instance.
- model = SentenceTransformer("all-MiniLM-L6-v2")
10
-
11
def extract_text_pdf(file_obj):
    """Extract the text of every page of a PDF.

    Args:
        file_obj: The PDF source; passed unchanged to ``PdfReader``.

    Returns:
        The text of all pages that produced any, each followed by a newline,
        or ``None`` when the PDF cannot be read or contains no
        non-whitespace text.
    """
    try:
        pdf_reader = PdfReader(file_obj)
        page_texts = [page.extract_text() for page in pdf_reader.pages]
    except Exception:
        # Deliberate best effort: any parsing failure is reported to the
        # caller as "no text". Unlike a bare ``except:`` this still lets
        # KeyboardInterrupt / SystemExit propagate.
        return None

    text = "".join(f"{extracted}\n" for extracted in page_texts if extracted)
    return text if text.strip() else None
22
 
23
def semantic_similarity(text1, text2):
    """Return the cosine similarity of the two texts' embeddings as a float."""
    # Each text is encoded as a one-element batch, so the similarity matrix
    # below is 1x1 and its single entry is the score.
    first_vec, second_vec = (
        model.encode([doc], convert_to_tensor=True) for doc in (text1, text2)
    )
    similarity_matrix = util.pytorch_cos_sim(first_vec, second_vec)
    return float(similarity_matrix[0][0])
 
 
 
 
27
 
28
def compare_with_los(text, lo_list):
    """Score how closely ``text`` matches each learning outcome.

    Args:
        text: The document text to compare against.
        lo_list: Learning-outcome strings.

    Returns:
        One score per learning outcome, in input order: the cosine
        similarity of the embeddings scaled to a percentage and rounded to
        two decimals. Empty when ``lo_list`` is empty.
    """
    if not lo_list:
        return []

    # The document embedding does not depend on the learning outcome, so it
    # is computed once instead of once per loop iteration.
    text_embedding = model.encode(text, convert_to_tensor=True)

    scores = []
    for lo in lo_list:
        lo_embedding = model.encode(lo, convert_to_tensor=True)
        score = util.cos_sim(lo_embedding, text_embedding)[0][0].item()
        scores.append(round(score * 100, 2))
    return scores
35
 
 
def compare_all(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs overall and per learning outcome.

    Args:
        old_pdf: Old handout; passed to ``extract_text_pdf``.
        new_pdf: New handout; passed to ``extract_text_pdf``.
        lo_file: Learning outcomes, one per line. Either an object with a
            ``read()`` method (returning bytes or str) or the text itself.
            ``None`` is treated as "no learning outcomes".

    Returns:
        A ``(summary, figure, table_html)`` tuple. ``figure`` and
        ``table_html`` are ``None`` when text extraction fails or no
        learning outcome is found.
    """
    old_text = extract_text_pdf(old_pdf)
    new_text = extract_text_pdf(new_pdf)

    if not old_text or not new_text:
        return "❌ Could not extract text from one or both PDFs.", None, None

    # Overall semantic similarity
    sim_score = semantic_similarity(old_text, new_text)
    change_percent = round((1 - sim_score) * 100, 2)
    summary = f"📈 Content Change: {change_percent}%\n🧠 Similarity Score: {sim_score:.2f}\n\n"

    if change_percent < 10:
        summary += "✅ Minor content update."
    elif change_percent < 40:
        summary += "🔄 Moderate update."
    else:
        summary += "🆕 Significant changes detected."

    # LO comparison. read() may hand back bytes or str, and the argument may
    # be missing altogether; normalise all of these to text before splitting.
    lo_text = lo_file.read() if hasattr(lo_file, "read") else lo_file
    if isinstance(lo_text, bytes):
        lo_text = lo_text.decode("utf-8", errors="ignore")

    los = [line.strip() for line in (lo_text or "").splitlines() if line.strip()]
    if not los:
        return summary + "\n⚠️ No valid Learning Outcomes found.", None, None

    old_scores = compare_with_los(old_text, los)
    new_scores = compare_with_los(new_text, los)
    score_diff = [round(new - old, 2) for old, new in zip(old_scores, new_scores)]

    df = pd.DataFrame({
        "Learning Outcome": los,
        "Old Match (%)": old_scores,
        "New Match (%)": new_scores,
        "Change (%)": score_diff,
    })
    table_html = df.to_html(index=False)

    # Bar chart: old and new bars side by side, tick centred between them.
    fig, ax = plt.subplots(figsize=(10, 4))
    index = np.arange(len(los))
    bar_width = 0.35
    ax.bar(index, old_scores, bar_width, label='Old')
    ax.bar(index + bar_width, new_scores, bar_width, label='New')
    ax.set_xlabel('Learning Outcomes')
    ax.set_ylabel('Match Score (%)')
    ax.set_title('LO-wise Semantic Match')
    ax.set_xticks(index + bar_width / 2)
    ax.set_xticklabels([f"LO{i}" for i in range(1, len(los) + 1)], rotation=45)
    ax.legend()
    fig.tight_layout()
    # Drop the figure from pyplot's global registry so repeated calls do not
    # accumulate open figures; the returned Figure object stays usable.
    plt.close(fig)

    return summary, fig, table_html
92
 
 
# Gradio UI: three file inputs feed compare_all; its three return values map,
# in order, onto the summary textbox, the bar chart and the HTML table.
iface = gr.Interface(
    fn=compare_all,
    inputs=[
        gr.File(label="Old Handout (PDF)"),
        gr.File(label="New Handout (PDF)"),
        gr.File(label="Learning Outcomes (.txt)", file_types=[".txt"]),
    ],
    outputs=[
        gr.Textbox(label="Summary"),
        gr.Plot(label="LO-wise Bar Chart"),
        gr.HTML(label="LO-wise Comparison Table"),
    ],
    title="📘 Semantic Handout Comparator with LO Alignment (Final Patch)",
    description="Compare course handouts and learning outcomes using robust PDF reading and transformers.",
)

# Only start the server when run as a script, so importing this module does
# not launch the app as a side effect.
if __name__ == "__main__":
    iface.launch()
 
 
 
1
  import gradio as gr
2
  from PyPDF2 import PdfReader
3
+ from sklearn.feature_extraction.text import TfidfVectorizer
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
+ from transformers import pipeline
6
  import matplotlib.pyplot as plt
7
  import pandas as pd
8
  import numpy as np
9
+ import io
10
 
11
# Helper to extract text from PDF
def extract_text_from_pdf(file):
    """Return the concatenated text of all pages of a PDF.

    Args:
        file: The PDF source; passed unchanged to ``PdfReader``.

    Returns:
        The stripped text of all pages joined by newlines (possibly an empty
        string), or ``None`` when the PDF cannot be read.
    """
    try:
        reader = PdfReader(file)
        page_texts = [page.extract_text() or "" for page in reader.pages]
    except Exception:
        # Deliberate best effort: a PDF that cannot be parsed is reported as
        # None. Unlike a bare ``except:`` this still lets KeyboardInterrupt
        # and SystemExit propagate.
        return None

    # Join with a newline so the last word of one page and the first word of
    # the next are not fused into a single token.
    return "\n".join(page_texts).strip()
21
 
22
# Semantic match for each LO
def semantic_match(lo_list, content):
    """Score each learning outcome against ``content`` with TF-IDF cosine similarity.

    Args:
        lo_list: Learning-outcome strings.
        content: The document text to match against.

    Returns:
        A list of floats, one per learning outcome in input order. Empty
        when ``lo_list`` is empty.
    """
    if not lo_list:
        # With no LO rows there is nothing to compare against, and
        # cosine_similarity rejects an empty second operand.
        return []

    # Row 0 is the document; rows 1..n are the learning outcomes. All are
    # fitted together so they share one vocabulary.
    tfidf_matrix = TfidfVectorizer().fit_transform([content, *lo_list])
    # cosine_similarity accepts sparse input, so the matrix is not densified.
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]
    return similarities.tolist()
30
 
31
+ # Summarizer (optional: switch models if needed).
# Built once at import time with transformers.pipeline; compare_all calls it
# on the start of the new handout's text to produce the summary line.
32
+ summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
 
 
 
 
 
33
 
34
+ # Generate summary insight and visualization
35
  def compare_all(old_pdf, new_pdf, lo_file):
36
+ old_text = extract_text_from_pdf(old_pdf)
37
+ new_text = extract_text_from_pdf(new_pdf)
38
+
39
  if not old_text or not new_text:
40
  return "❌ Could not extract text from one or both PDFs.", None, None
41
 
42
+ los = lo_file.read().decode("utf-8", errors="ignore").splitlines()
43
+ los = [lo.strip() for lo in los if lo.strip()]
44
+
45
+ # Semantic matching scores
46
+ old_scores = semantic_match(los, old_text)
47
+ new_scores = semantic_match(los, new_text)
48
+ changes = [round(new - old, 2) for old, new in zip(old_scores, new_scores)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
+ # Table and DataFrame
51
  df = pd.DataFrame({
52
  "Learning Outcome": los,
53
+ "Old Match (%)": [round(score * 100, 2) for score in old_scores],
54
+ "New Match (%)": [round(score * 100, 2) for score in new_scores],
55
+ "Change (%)": [round(change * 100, 2) for change in changes],
56
  })
 
57
 
58
+ # Visualization
59
+ fig, ax = plt.subplots(figsize=(10, 5))
60
+ x = np.arange(len(los))
61
+ width = 0.35
62
+ ax.bar(x - width/2, df["Old Match (%)"], width, label='Old')
63
+ ax.bar(x + width/2, df["New Match (%)"], width, label='New')
 
64
  ax.set_ylabel('Match Score (%)')
65
+ ax.set_title('LO-wise Semantic Match: Old vs New')
66
+ ax.set_xticks(x)
67
+ ax.set_xticklabels(los, rotation=45, ha='right')
68
  ax.legend()
69
+ plt.tight_layout()
70
+
71
+ # Content similarity and summary
72
+ content_vectorizer = TfidfVectorizer().fit_transform([old_text, new_text])
73
+ content_sim = cosine_similarity(content_vectorizer)[0, 1]
74
+ summary = summarizer(new_text[:2000], max_length=100, min_length=30, do_sample=False)[0]['summary_text']
75
+
76
+ summary_text = (
77
+ f"πŸ“ˆ Content Change Score: {round((1 - content_sim) * 100, 2)}%\n"
78
+ f"🎯 Learning Outcomes Analyzed: {len(los)}\n"
79
+ f"🟒 Summary of New Content: {summary}"
80
+ )
81
 
82
+ return summary_text, df, fig
83
 
84
+ # Interface
85
  iface = gr.Interface(
86
  fn=compare_all,
87
  inputs=[
88
+ gr.File(label="Old Handout PDF"),
89
+ gr.File(label="New Handout PDF"),
90
+ gr.File(label="Learning Outcomes (Text File)", type='file'),
91
  ],
92
  outputs=[
93
+ gr.Textbox(label="Summary & Insights"),
94
+ gr.Dataframe(label="LO-wise Comparison Table"),
95
+ gr.Plot(label="Visual Comparison Chart")
96
  ],
97
+ title="πŸ“˜ Syllabus Comparator with Learning Outcome Evaluation",
98
+ description="Upload two syllabus handouts (old and new) and a file containing learning outcomes. Get LO-wise comparison, visual chart, and overall content insight."
99
  )
100
 
101
+ if __name__ == "__main__":
102
+ iface.launch()