Deevyankar committed on
Commit
a4cd443
·
verified ·
1 Parent(s): 7a8b10d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -28
app.py CHANGED
@@ -1,12 +1,17 @@
1
 
 
2
  import gradio as gr
3
  from sklearn.feature_extraction.text import TfidfVectorizer
4
  from sklearn.metrics.pairwise import cosine_similarity
 
5
  import fitz # PyMuPDF
6
  import docx
7
  import matplotlib.pyplot as plt
8
  import pandas as pd
9
 
 
 
 
10
  def extract_text_from_pdf(file):
11
  text = ""
12
  try:
@@ -22,52 +27,45 @@ def extract_text_from_docx(file):
22
  return "\n".join([para.text for para in doc.paragraphs])
23
 
24
  def semantic_match(lo_texts, content):
25
- vectorizer = TfidfVectorizer().fit_transform([content] + lo_texts)
26
- vectors = vectorizer.toarray()
27
- content_vector = vectors[0]
28
- lo_vectors = vectors[1:]
29
- similarities = cosine_similarity([content_vector], lo_vectors)[0]
30
  return similarities.tolist()
31
 
32
  def compare_handouts(old_pdf, new_pdf, lo_file):
33
- # Extract text from handouts
34
  old_text = extract_text_from_pdf(old_pdf)
35
  new_text = extract_text_from_pdf(new_pdf)
36
  if not old_text.strip() or not new_text.strip():
37
- return "Could not extract text from one or both PDFs.", None, None
38
 
39
- # Extract Learning Outcomes
40
  lo_text = extract_text_from_docx(lo_file)
41
  lo_list = [line.strip() for line in lo_text.split("\n") if line.strip()]
42
  if not lo_list:
43
- return "No learning outcomes detected.", None, None
44
 
45
- # Match scores
46
  old_scores = semantic_match(lo_list, old_text)
47
  new_scores = semantic_match(lo_list, new_text)
48
 
49
- # Calculate overall change
50
  avg_old = sum(old_scores) / len(old_scores)
51
  avg_new = sum(new_scores) / len(new_scores)
52
  change = round(((avg_new - avg_old) / avg_old) * 100, 2) if avg_old != 0 else 100.0
53
 
54
- # Summary
55
  matched = sum([1 for o, n in zip(old_scores, new_scores) if n >= o])
56
  summary = f"📈 Content Change: {change:.2f}%\n🎯 Matched LOs: {matched} of {len(lo_list)}"
57
  if change > 10:
58
  summary += "\n🟢 New content appears more detailed and informative."
59
  elif change < -10:
60
- summary += "\n🔴 Some content may have been removed or simplified."
61
  else:
62
- summary += "\n🟡 Minor updates detected."
63
 
64
- # LO-wise chart and table
65
  los = [f"LO{i+1}" for i in range(len(lo_list))]
66
  percentage_change = [round(((n - o) / o) * 100, 2) if o else 100.0 for o, n in zip(old_scores, new_scores)]
67
  df = pd.DataFrame({
68
  "Learning Outcome": los,
69
- "Old Score": old_scores,
70
- "New Score": new_scores,
71
  "% Change": percentage_change
72
  })
73
 
@@ -83,7 +81,7 @@ def compare_handouts(old_pdf, new_pdf, lo_file):
83
  plt.savefig(table_path, bbox_inches='tight', dpi=300)
84
  plt.close()
85
 
86
- # Chart image
87
  fig, ax = plt.subplots(figsize=(10, 4))
88
  bar_width = 0.35
89
  index = range(len(los))
@@ -91,10 +89,10 @@ def compare_handouts(old_pdf, new_pdf, lo_file):
91
  ax.bar([i + bar_width for i in index], new_scores, bar_width, label='New', alpha=0.7)
92
  ax.set_xticks([i + bar_width / 2 for i in index])
93
  ax.set_xticklabels(los)
94
- ax.set_ylabel('Match Score (0-1)')
95
- ax.set_title('LO-wise Match Score Comparison')
96
  ax.legend()
97
- chart_path = "/mnt/data/lo_comparison_chart.png"
98
  plt.tight_layout()
99
  plt.savefig(chart_path)
100
  plt.close()
@@ -103,15 +101,15 @@ def compare_handouts(old_pdf, new_pdf, lo_file):
103
 
104
  # Gradio interface
105
  with gr.Blocks() as demo:
106
- gr.Markdown("# 📘 Handout Change Analyzer with LO Mapping")
107
  with gr.Row():
108
- old_pdf_input = gr.File(label="Upload Old Handout (PDF)", file_types=[".pdf"])
109
- new_pdf_input = gr.File(label="Upload New Handout (PDF)", file_types=[".pdf"])
110
- lo_input = gr.File(label="Upload Learning Outcomes (DOCX)", file_types=[".docx"])
111
- submit_btn = gr.Button("🔍 Analyze Changes")
112
  summary_output = gr.Textbox(label="Summary")
113
- lo_table_output = gr.Image(label="📋 LO Comparison Table")
114
- lo_chart_output = gr.Image(label="📈 LO Score Chart")
115
 
116
  submit_btn.click(fn=compare_handouts, inputs=[old_pdf_input, new_pdf_input, lo_input],
117
  outputs=[summary_output, lo_table_output, lo_chart_output])
 
1
 
2
+
3
  import gradio as gr
4
  from sklearn.feature_extraction.text import TfidfVectorizer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
+ from sentence_transformers import SentenceTransformer, util
7
  import fitz # PyMuPDF
8
  import docx
9
  import matplotlib.pyplot as plt
10
  import pandas as pd
11
 
12
+ # Load transformer model
13
+ model = SentenceTransformer("all-MiniLM-L6-v2")
14
+
15
  def extract_text_from_pdf(file):
16
  text = ""
17
  try:
 
27
  return "\n".join([para.text for para in doc.paragraphs])
28
 
29
  def semantic_match(lo_texts, content):
30
+ embeddings = model.encode([content] + lo_texts, convert_to_tensor=True)
31
+ content_embedding = embeddings[0]
32
+ lo_embeddings = embeddings[1:]
33
+ similarities = util.pytorch_cos_sim(content_embedding, lo_embeddings)[0]
 
34
  return similarities.tolist()
35
 
36
  def compare_handouts(old_pdf, new_pdf, lo_file):
 
37
  old_text = extract_text_from_pdf(old_pdf)
38
  new_text = extract_text_from_pdf(new_pdf)
39
  if not old_text.strip() or not new_text.strip():
40
+ return "❌ Could not extract text from one or both PDFs.", None, None
41
 
 
42
  lo_text = extract_text_from_docx(lo_file)
43
  lo_list = [line.strip() for line in lo_text.split("\n") if line.strip()]
44
  if not lo_list:
45
+ return "⚠️ No learning outcomes detected in uploaded DOCX file.", None, None
46
 
 
47
  old_scores = semantic_match(lo_list, old_text)
48
  new_scores = semantic_match(lo_list, new_text)
49
 
 
50
  avg_old = sum(old_scores) / len(old_scores)
51
  avg_new = sum(new_scores) / len(new_scores)
52
  change = round(((avg_new - avg_old) / avg_old) * 100, 2) if avg_old != 0 else 100.0
53
 
 
54
  matched = sum([1 for o, n in zip(old_scores, new_scores) if n >= o])
55
  summary = f"📈 Content Change: {change:.2f}%\n🎯 Matched LOs: {matched} of {len(lo_list)}"
56
  if change > 10:
57
  summary += "\n🟢 New content appears more detailed and informative."
58
  elif change < -10:
59
+ summary += "\n🔴 Content may have been reduced or simplified."
60
  else:
61
+ summary += "\n🟡 Only minor updates detected."
62
 
 
63
  los = [f"LO{i+1}" for i in range(len(lo_list))]
64
  percentage_change = [round(((n - o) / o) * 100, 2) if o else 100.0 for o, n in zip(old_scores, new_scores)]
65
  df = pd.DataFrame({
66
  "Learning Outcome": los,
67
+ "Old Score": [round(s, 3) for s in old_scores],
68
+ "New Score": [round(s, 3) for s in new_scores],
69
  "% Change": percentage_change
70
  })
71
 
 
81
  plt.savefig(table_path, bbox_inches='tight', dpi=300)
82
  plt.close()
83
 
84
+ # Bar chart
85
  fig, ax = plt.subplots(figsize=(10, 4))
86
  bar_width = 0.35
87
  index = range(len(los))
 
89
  ax.bar([i + bar_width for i in index], new_scores, bar_width, label='New', alpha=0.7)
90
  ax.set_xticks([i + bar_width / 2 for i in index])
91
  ax.set_xticklabels(los)
92
+ ax.set_ylabel('Semantic Match (0-1)')
93
+ ax.set_title('Learning Outcome Comparison')
94
  ax.legend()
95
+ chart_path = "/mnt/data/lo_score_chart.png"
96
  plt.tight_layout()
97
  plt.savefig(chart_path)
98
  plt.close()
 
101
 
102
  # Gradio interface
103
  with gr.Blocks() as demo:
104
+ gr.Markdown("## 🧠 Transformer-Based Course Content Comparator")
105
  with gr.Row():
106
+ old_pdf_input = gr.File(label="📂 Old Handout (PDF)", file_types=[".pdf"])
107
+ new_pdf_input = gr.File(label="📂 New Handout (PDF)", file_types=[".pdf"])
108
+ lo_input = gr.File(label="📄 Learning Outcomes (DOCX)", file_types=[".docx"])
109
+ submit_btn = gr.Button("🔍 Analyze")
110
  summary_output = gr.Textbox(label="Summary")
111
+ lo_table_output = gr.Image(label="📋 LO Change Table")
112
+ lo_chart_output = gr.Image(label="📈 LO Match Chart")
113
 
114
  submit_btn.click(fn=compare_handouts, inputs=[old_pdf_input, new_pdf_input, lo_input],
115
  outputs=[summary_output, lo_table_output, lo_chart_output])