Deevyankar committed on
Commit
c3bf685
·
verified ·
1 Parent(s): 4db3b3a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -85
app.py CHANGED
@@ -1,112 +1,122 @@
 
1
  import gradio as gr
 
 
2
  import fitz # PyMuPDF
3
  import docx
4
- import io
5
- import re
6
- from sklearn.feature_extraction.text import TfidfVectorizer
7
- from sentence_transformers import SentenceTransformer, util
8
  import matplotlib.pyplot as plt
9
- import numpy as np
10
- from difflib import SequenceMatcher
11
 
12
- model = SentenceTransformer('all-MiniLM-L6-v2')
13
-
14
def extract_text_from_pdf(pdf_file):
    """Return the stripped text of every page of an in-memory PDF.

    Args:
        pdf_file: Raw PDF content, handed to PyMuPDF as ``stream``.

    Returns:
        The page texts concatenated in page order with leading and
        trailing whitespace removed, or ``""`` if the document cannot be
        opened or read.
    """
    try:
        # The context manager closes the document even when a page fails
        # to render; an explicit close() after the loop is skipped on error.
        with fitz.open(stream=pdf_file, filetype="pdf") as pdf_reader:
            return "".join(page.get_text() for page in pdf_reader).strip()
    except Exception:
        # Deliberately best-effort: an empty string signals "no usable text".
        return ""
24
-
25
def normalize_text(text):
    """Trim *text*, lower-case it, and collapse whitespace runs to one space."""
    trimmed = text.strip()
    lowered = trimmed.lower()
    return re.sub(r'\s+', ' ', lowered)
27
-
28
def extract_text_from_docx(file_binary):
    """Return the non-blank paragraphs of an in-memory .docx document.

    Args:
        file_binary: Raw bytes of the .docx file.

    Returns:
        A list with the stripped text of each paragraph that is not empty
        after stripping, or ``[]`` if the document cannot be read.
    """
    paragraphs = []
    try:
        document = docx.Document(io.BytesIO(file_binary))
        for para in document.paragraphs:
            stripped = para.text.strip()
            if stripped:
                paragraphs.append(stripped)
    except Exception:
        return []
    return paragraphs
34
-
35
def semantic_match(lo_list, content):
    """Score how closely ``content`` matches each learning outcome.

    Args:
        lo_list: Learning-outcome strings to score.
        content: Handout text compared against every outcome.

    Returns:
        One similarity score per outcome, rounded to two decimals, in the
        order of ``lo_list``. An outcome whose embedding or comparison
        fails scores ``0.0``.
    """
    if not lo_list:
        return []

    # The content embedding is identical for every outcome, so encode it
    # once instead of once per loop iteration.
    try:
        content_embed = model.encode(content, convert_to_tensor=True)
    except Exception:
        # Matches the per-outcome fallback: nothing can be compared.
        return [0.0] * len(lo_list)

    scores = []
    for lo in lo_list:
        try:
            lo_embed = model.encode(lo, convert_to_tensor=True)
            sim = util.pytorch_cos_sim(lo_embed, content_embed).item()
            scores.append(round(sim, 2))
        except Exception:
            # Keep one score per outcome so results stay index-aligned.
            scores.append(0.0)
    return scores
46
-
47
def content_change_score(text1, text2):
    """Estimate how much two texts differ, as a percentage.

    Both texts are passed through ``normalize_text`` and compared with
    ``difflib.SequenceMatcher``; the score is ``(1 - ratio) * 100`` rounded
    to two decimals.

    Returns:
        The change percentage, or ``100.0`` if the texts cannot be
        normalized or compared.
    """
    try:
        similarity = SequenceMatcher(
            None, normalize_text(text1), normalize_text(text2)
        ).ratio()
    except Exception:
        # Still best-effort, but no longer swallows KeyboardInterrupt or
        # SystemExit the way a bare ``except:`` does.
        return 100.0
    return round((1 - similarity) * 100, 2)
53
 
54
def compare_handouts(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs against a list of learning outcomes.

    Args:
        old_pdf: Content of the old handout PDF.
        new_pdf: Content of the new handout PDF.
        lo_file: Content of the .docx file listing the learning outcomes.

    Returns:
        A ``(summary, figure)`` tuple: a text summary and a matplotlib bar
        chart of per-outcome scores. When either PDF yields fewer than 200
        characters of text, or no outcomes are found, returns
        ``(warning_message, None)``.
    """
    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)

    if len(old_text.strip()) < 200 or len(new_text.strip()) < 200:
        return "⚠️ Could not extract meaningful content from one or both PDFs.", None

    lo_list = extract_text_from_docx(lo_file)
    if not lo_list:
        return "⚠️ No learning outcomes detected in .docx file.", None

    old_scores = semantic_match(lo_list, old_text)
    new_scores = semantic_match(lo_list, new_text)

    change_percent = content_change_score(old_text, new_text)
    # "Improved" needs a strictly higher score; "matched" also counts ties.
    improved_count = sum(n > o for n, o in zip(new_scores, old_scores))
    matched_los = sum(n >= o for n, o in zip(new_scores, old_scores))

    summary = f"📈 Content Change Estimate: {change_percent}%\n"
    summary += f"🧠 LO Alignment: {matched_los} of {len(lo_list)} learning outcomes matched\n"
    if improved_count > 0:
        summary += "🟢 Summary: New handout has improved structure and added clarity."
    else:
        summary += "⚠️ Summary: No significant improvement in LO alignment."

    # Grouped bars: old and new scores side by side for each outcome.
    x = np.arange(len(lo_list))
    width = 0.35
    fig, ax = plt.subplots()
    ax.bar(x - width / 2, old_scores, width, label='Old')
    ax.bar(x + width / 2, new_scores, width, label='New')
    ax.set_ylabel('Match Score (0-1)')
    ax.set_title('LO-wise Match Score: Old vs New')
    ax.set_xticks(x)
    ax.set_xticklabels([f"LO{i+1}" for i in range(len(lo_list))], rotation=45)
    ax.legend()
    plt.tight_layout()

    return summary, fig
92
 
 
93
# Gradio UI: three uploads, Submit/Clear buttons, a text summary and a plot.
with gr.Blocks() as demo:
    gr.Markdown("📘 **Educational Content Comparator**")
    gr.Markdown("Upload 2 handouts and a .docx file of Learning Outcomes to compare changes and alignment.")

    with gr.Row():
        # type="binary" hands the handler raw bytes rather than a path.
        old_pdf = gr.File(label="📂 Upload Old PDF", file_types=[".pdf"], type="binary")
        new_pdf = gr.File(label="📂 Upload New PDF", file_types=[".pdf"], type="binary")
        lo_file = gr.File(label="📂 Upload Learning Outcomes (.docx)", file_types=[".docx"], type="binary")

    with gr.Row():
        btn = gr.Button("Submit")
        clear_btn = gr.Button("Clear")

    output_text = gr.Textbox(label="📋 Summary", lines=5, interactive=False)
    output_plot = gr.Plot(label="📊 LO Match Chart")

    btn.click(fn=compare_handouts, inputs=[old_pdf, new_pdf, lo_file], outputs=[output_text, output_plot])
    # Clear resets only the two outputs; the uploaded files stay selected.
    clear_btn.click(fn=lambda: ("", None), inputs=[], outputs=[output_text, output_plot])

demo.launch()
 
1
+
2
  import gradio as gr
3
+ from sklearn.feature_extraction.text import TfidfVectorizer
4
+ from sklearn.metrics.pairwise import cosine_similarity
5
  import fitz # PyMuPDF
6
  import docx
 
 
 
 
7
  import matplotlib.pyplot as plt
8
+ import pandas as pd
9
+ import tempfile
10
 
11
def extract_text_from_pdf(file):
    """Return the concatenated text of every page in a PDF.

    Args:
        file: The uploaded PDF as raw ``bytes``, a filesystem path, or an
            object exposing the path as ``.name`` (which of these a
            ``gr.File`` input delivers depends on its ``type`` setting).
            ``None`` (nothing uploaded) yields ``""``.

    Returns:
        The page texts joined in page order. On failure the error is
        printed and whatever was read so far (possibly ``""``) is returned.
    """
    if file is None:
        return ""

    pages = []
    try:
        if isinstance(file, (bytes, bytearray)):
            doc = fitz.open(stream=file, filetype="pdf")
        elif isinstance(file, str) or hasattr(file, "__fspath__"):
            doc = fitz.open(str(file))
        else:
            # Temp-file wrappers carry the on-disk path in ``.name``.
            doc = fitz.open(getattr(file, "name", file))
        with doc:
            for page in doc:
                pages.append(page.get_text())
    except Exception as e:
        print(f"Error extracting PDF text: {e}")
    return "".join(pages)
20
+
21
def extract_text_from_docx(file):
    """Return the text of a .docx document, one paragraph per line.

    Args:
        file: A path or file-like object accepted by ``docx.Document``;
            raw ``bytes`` are also accepted and wrapped in a buffer.

    Returns:
        The text of every paragraph joined with newline characters.
    """
    if isinstance(file, (bytes, bytearray)):
        import io

        file = io.BytesIO(file)
    doc = docx.Document(file)
    return "\n".join(para.text for para in doc.paragraphs)
25
+
26
def semantic_match(lo_texts, content):
    """Score each learning outcome against ``content`` with TF-IDF.

    ``content`` and all outcomes are vectorized together, then each
    outcome vector is compared to the content vector by cosine similarity.

    Args:
        lo_texts: Learning-outcome strings.
        content: Handout text to compare the outcomes against.

    Returns:
        A list with one similarity per outcome, in input order; ``[]``
        when there are no outcomes.
    """
    if not lo_texts:
        # cosine_similarity rejects an empty comparison set.
        return []

    # Row 0 is the content; rows 1.. are the outcomes. Slices keep the
    # matrix sparse and two-dimensional, so no dense copy is needed.
    tfidf = TfidfVectorizer().fit_transform([content, *lo_texts])
    similarities = cosine_similarity(tfidf[:1], tfidf[1:])[0]
    return similarities.tolist()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
def _percent_change(old, new):
    """Return the relative change from *old* to *new* as a percentage."""
    if old:
        return round(((new - old) / old) * 100, 2)
    # No baseline to divide by: any gain counts as +100%, none as 0%.
    return 100.0 if new else 0.0


def _save_figure(fig, **savefig_kwargs):
    """Save *fig* to a fresh temporary PNG, close it, and return the path."""
    # A unique temp file avoids a hard-coded directory that may not exist
    # and keeps concurrent requests from overwriting each other's images.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
        path = tmp.name
    fig.savefig(path, **savefig_kwargs)
    plt.close(fig)
    return path


def compare_handouts(old_pdf, new_pdf, lo_file):
    """Compare two handout PDFs against a list of learning outcomes.

    Args:
        old_pdf: The old handout, as accepted by ``extract_text_from_pdf``.
        new_pdf: The new handout, as accepted by ``extract_text_from_pdf``.
        lo_file: The .docx file of learning outcomes, one per paragraph.

    Returns:
        A ``(summary, table_path, chart_path)`` tuple: the text summary and
        the paths of two PNG images (per-outcome table and bar chart).
        When no text can be extracted from a PDF, or no outcomes are
        found, returns ``(message, None, None)``.
    """
    # Extract text from handouts
    old_text = extract_text_from_pdf(old_pdf)
    new_text = extract_text_from_pdf(new_pdf)
    if not old_text.strip() or not new_text.strip():
        return "Could not extract text from one or both PDFs.", None, None

    # Extract learning outcomes: one per non-blank line
    lo_text = extract_text_from_docx(lo_file)
    lo_list = [line.strip() for line in lo_text.split("\n") if line.strip()]
    if not lo_list:
        return "No learning outcomes detected.", None, None

    # Match scores
    old_scores = semantic_match(lo_list, old_text)
    new_scores = semantic_match(lo_list, new_text)

    # Overall change in the average score
    avg_old = sum(old_scores) / len(old_scores)
    avg_new = sum(new_scores) / len(new_scores)
    change = _percent_change(avg_old, avg_new)

    # Summary
    matched = sum(n >= o for o, n in zip(old_scores, new_scores))
    summary = f"📈 Content Change: {change:.2f}%\n🎯 Matched LOs: {matched} of {len(lo_list)}"
    if change > 10:
        summary += "\n🟢 New content appears more detailed and informative."
    elif change < -10:
        summary += "\n🔴 Some content may have been removed or simplified."
    else:
        summary += "\n🟡 Minor updates detected."

    # LO-wise table data
    los = [f"LO{i+1}" for i in range(len(lo_list))]
    percentage_change = [_percent_change(o, n) for o, n in zip(old_scores, new_scores)]
    df = pd.DataFrame({
        "Learning Outcome": los,
        "Old Score": old_scores,
        "New Score": new_scores,
        "% Change": percentage_change,
    })

    # Table image
    fig, ax = plt.subplots(figsize=(9, 3))
    ax.axis('tight')
    ax.axis('off')
    table = ax.table(cellText=df.values, colLabels=df.columns, cellLoc='center', loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1.2, 1.2)
    table_path = _save_figure(fig, bbox_inches='tight', dpi=300)

    # Chart image: old and new bars side by side for each outcome
    fig, ax = plt.subplots(figsize=(10, 4))
    bar_width = 0.35
    index = range(len(los))
    ax.bar(index, old_scores, bar_width, label='Old', alpha=0.7)
    ax.bar([i + bar_width for i in index], new_scores, bar_width, label='New', alpha=0.7)
    ax.set_xticks([i + bar_width / 2 for i in index])
    ax.set_xticklabels(los)
    ax.set_ylabel('Match Score (0-1)')
    ax.set_title('LO-wise Match Score Comparison')
    ax.legend()
    fig.tight_layout()
    chart_path = _save_figure(fig)

    return summary, table_path, chart_path
106
 
107
+ # Gradio interface
108
# Gradio UI: three uploads, one button, a text summary and two images.
# NOTE(review): nesting is reconstructed from a flattened diff; only the
# three file inputs are assumed to sit inside the Row. Confirm the layout.
with gr.Blocks() as demo:
    gr.Markdown("# 📘 Handout Change Analyzer with LO Mapping")
    with gr.Row():
        old_pdf_input = gr.File(label="Upload Old Handout (PDF)", file_types=[".pdf"])
        new_pdf_input = gr.File(label="Upload New Handout (PDF)", file_types=[".pdf"])
        lo_input = gr.File(label="Upload Learning Outcomes (DOCX)", file_types=[".docx"])
    submit_btn = gr.Button("🔍 Analyze Changes")
    summary_output = gr.Textbox(label="Summary")
    lo_table_output = gr.Image(label="📋 LO Comparison Table")
    lo_chart_output = gr.Image(label="📈 LO Score Chart")

    submit_btn.click(
        fn=compare_handouts,
        inputs=[old_pdf_input, new_pdf_input, lo_input],
        outputs=[summary_output, lo_table_output, lo_chart_output],
    )

demo.launch()