Deevyankar committed on
Commit
3c108e3
Β·
verified Β·
1 Parent(s): a4cd443

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +45 -98
app.py CHANGED
@@ -1,117 +1,64 @@
1
 
2
 
3
  import gradio as gr
4
- from sklearn.feature_extraction.text import TfidfVectorizer
5
- from sklearn.metrics.pairwise import cosine_similarity
6
- from sentence_transformers import SentenceTransformer, util
7
  import fitz # PyMuPDF
8
- import docx
9
  import matplotlib.pyplot as plt
10
- import pandas as pd
 
11
 
12
- # Load transformer model
13
  model = SentenceTransformer("all-MiniLM-L6-v2")
14
 
15
- def extract_text_from_pdf(file):
16
- text = ""
17
  try:
18
- with fitz.open(stream=file, filetype="pdf") as doc:
 
19
  for page in doc:
20
  text += page.get_text()
 
21
  except Exception as e:
22
- print(f"Error extracting PDF text: {e}")
23
- return text
24
-
25
- def extract_text_from_docx(file):
26
- doc = docx.Document(file)
27
- return "\n".join([para.text for para in doc.paragraphs])
28
 
29
- def semantic_match(lo_texts, content):
30
- embeddings = model.encode([content] + lo_texts, convert_to_tensor=True)
31
- content_embedding = embeddings[0]
32
- lo_embeddings = embeddings[1:]
33
- similarities = util.pytorch_cos_sim(content_embedding, lo_embeddings)[0]
34
- return similarities.tolist()
35
 
36
- def compare_handouts(old_pdf, new_pdf, lo_file):
37
- old_text = extract_text_from_pdf(old_pdf)
38
- new_text = extract_text_from_pdf(new_pdf)
39
- if not old_text.strip() or not new_text.strip():
40
- return "❌ Could not extract text from one or both PDFs.", None, None
41
 
42
- lo_text = extract_text_from_docx(lo_file)
43
- lo_list = [line.strip() for line in lo_text.split("\n") if line.strip()]
44
- if not lo_list:
45
- return "⚠️ No learning outcomes detected in uploaded DOCX file.", None, None
46
 
47
- old_scores = semantic_match(lo_list, old_text)
48
- new_scores = semantic_match(lo_list, new_text)
49
 
50
- avg_old = sum(old_scores) / len(old_scores)
51
- avg_new = sum(new_scores) / len(new_scores)
52
- change = round(((avg_new - avg_old) / avg_old) * 100, 2) if avg_old != 0 else 100.0
53
 
54
- matched = sum([1 for o, n in zip(old_scores, new_scores) if n >= o])
55
- summary = f"πŸ“ˆ Content Change: {change:.2f}%\n🎯 Matched LOs: {matched} of {len(lo_list)}"
56
- if change > 10:
57
- summary += "\n🟒 New content appears more detailed and informative."
58
- elif change < -10:
59
- summary += "\nπŸ”΄ Content may have been reduced or simplified."
60
  else:
61
- summary += "\n🟑 Only minor updates detected."
62
-
63
- los = [f"LO{i+1}" for i in range(len(lo_list))]
64
- percentage_change = [round(((n - o) / o) * 100, 2) if o else 100.0 for o, n in zip(old_scores, new_scores)]
65
- df = pd.DataFrame({
66
- "Learning Outcome": los,
67
- "Old Score": [round(s, 3) for s in old_scores],
68
- "New Score": [round(s, 3) for s in new_scores],
69
- "% Change": percentage_change
70
- })
71
-
72
- # Table image
73
- fig, ax = plt.subplots(figsize=(9, 3))
74
- ax.axis('tight')
75
- ax.axis('off')
76
- table = ax.table(cellText=df.values, colLabels=df.columns, cellLoc='center', loc='center')
77
- table.auto_set_font_size(False)
78
- table.set_fontsize(10)
79
- table.scale(1.2, 1.2)
80
- table_path = "/mnt/data/lo_comparison_table.png"
81
- plt.savefig(table_path, bbox_inches='tight', dpi=300)
82
- plt.close()
83
-
84
- # Bar chart
85
- fig, ax = plt.subplots(figsize=(10, 4))
86
- bar_width = 0.35
87
- index = range(len(los))
88
- ax.bar(index, old_scores, bar_width, label='Old', alpha=0.7)
89
- ax.bar([i + bar_width for i in index], new_scores, bar_width, label='New', alpha=0.7)
90
- ax.set_xticks([i + bar_width / 2 for i in index])
91
- ax.set_xticklabels(los)
92
- ax.set_ylabel('Semantic Match (0-1)')
93
- ax.set_title('Learning Outcome Comparison')
94
- ax.legend()
95
- chart_path = "/mnt/data/lo_score_chart.png"
96
- plt.tight_layout()
97
- plt.savefig(chart_path)
98
- plt.close()
99
-
100
- return summary, table_path, chart_path
101
-
102
- # Gradio interface
103
- with gr.Blocks() as demo:
104
- gr.Markdown("## 🧠 Transformer-Based Course Content Comparator")
105
- with gr.Row():
106
- old_pdf_input = gr.File(label="πŸ“‚ Old Handout (PDF)", file_types=[".pdf"])
107
- new_pdf_input = gr.File(label="πŸ“‚ New Handout (PDF)", file_types=[".pdf"])
108
- lo_input = gr.File(label="πŸ“„ Learning Outcomes (DOCX)", file_types=[".docx"])
109
- submit_btn = gr.Button("πŸ” Analyze")
110
- summary_output = gr.Textbox(label="Summary")
111
- lo_table_output = gr.Image(label="πŸ“‹ LO Change Table")
112
- lo_chart_output = gr.Image(label="πŸ“ˆ LO Match Chart")
113
-
114
- submit_btn.click(fn=compare_handouts, inputs=[old_pdf_input, new_pdf_input, lo_input],
115
- outputs=[summary_output, lo_table_output, lo_chart_output])
116
-
117
- demo.launch()
 
1
 
2
 
3
  import gradio as gr
 
 
 
4
  import fitz # PyMuPDF
5
+ from sentence_transformers import SentenceTransformer, util
6
  import matplotlib.pyplot as plt
7
+ import numpy as np
8
+ import os
9
 
10
# Load the sentence-embedding model once at module import so every
# comparison request reuses the same instance instead of reloading it.
model = SentenceTransformer("all-MiniLM-L6-v2")
12
 
13
def extract_text_pdf(file_obj):
    """Extract all text from a PDF supplied as a bytes-like stream.

    Args:
        file_obj: Raw PDF data (bytes or any stream ``fitz.open`` accepts).

    Returns:
        The concatenated text of every page, or ``None`` when the PDF
        cannot be opened/parsed or yields only whitespace.
    """
    try:
        with fitz.open(stream=file_obj, filetype="pdf") as doc:
            # join once instead of quadratic `text += ...` concatenation
            text = "".join(page.get_text() for page in doc)
        # whitespace-only extraction is treated the same as a parse failure
        return text if text.strip() else None
    except Exception:
        # Deliberate best-effort: callers check for None and report the
        # failure to the user instead of crashing the UI.
        return None
 
 
 
 
 
22
 
23
def semantic_similarity(text1, text2):
    """Return the cosine similarity of two texts as a plain float.

    Each text is embedded with the module-level SentenceTransformer
    ``model``; the 1x1 similarity matrix is unwrapped to a scalar.
    """
    first_embedding = model.encode([text1], convert_to_tensor=True)
    second_embedding = model.encode([text2], convert_to_tensor=True)
    score_matrix = util.pytorch_cos_sim(first_embedding, second_embedding)
    return float(score_matrix[0][0])
 
 
27
 
28
def compare_docs(old_pdf, new_pdf):
    """Compare two PDF handouts and summarise how much the content changed.

    Args:
        old_pdf: Bytes/stream of the previous handout.
        new_pdf: Bytes/stream of the updated handout.

    Returns:
        A ``(summary, plot)`` pair: a human-readable summary string and a
        plot placeholder that is currently always ``None``.
    """
    previous_text = extract_text_pdf(old_pdf)
    current_text = extract_text_pdf(new_pdf)

    # Guard clause: either extraction failing aborts the comparison.
    if not previous_text or not current_text:
        return "❌ Could not extract text from one or both PDFs.", None

    sim_score = semantic_similarity(previous_text, current_text)
    change_percent = round((1 - sim_score) * 100, 2)

    # Pick the qualitative verdict first, then assemble the summary once.
    if change_percent < 10:
        verdict = "✅ Minor updates detected, mostly similar content."
    elif change_percent < 40:
        verdict = "🔄 Moderate content updates detected."
    else:
        verdict = "🆕 Major revisions and new content identified."

    summary = (
        f"📈 Estimated Content Change: {change_percent}%\n\n"
        f"🧠 Semantic Similarity Score: {sim_score:.2f}\n"
        f"{verdict}"
    )
    return summary, None
49
+
50
# Wire the comparison function into a simple two-file Gradio UI.
pdf_inputs = [
    gr.File(label="Upload Old Handout (PDF)", file_types=[".pdf"]),
    gr.File(label="Upload New Handout (PDF)", file_types=[".pdf"]),
]
result_outputs = [
    gr.Textbox(label="Comparison Summary"),
    gr.Plot(label="(Coming Soon) Visual Summary"),
]

iface = gr.Interface(
    fn=compare_docs,
    inputs=pdf_inputs,
    outputs=result_outputs,
    title="📘 Course Handout Comparator with Semantic AI",
    description="Upload old and new PDFs to see how much content has changed. Uses transformer model for expert-like judgment.",
)

iface.launch()