Deevyankar committed on
Commit
3378b7b
·
verified ·
1 Parent(s): 8751b01

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -34
app.py CHANGED
@@ -1,4 +1,5 @@
1
 
 
2
  import gradio as gr
3
  from PyPDF2 import PdfReader
4
  from pdf2image import convert_from_bytes
@@ -7,11 +8,13 @@ from PIL import Image
7
  import io
8
  from sklearn.feature_extraction.text import TfidfVectorizer
9
  from sklearn.metrics.pairwise import cosine_similarity
 
10
  import matplotlib.pyplot as plt
11
  import pandas as pd
12
- from sentence_transformers import SentenceTransformer
13
 
14
- model = SentenceTransformer('all-MiniLM-L6-v2')
 
15
 
16
  def extract_text_from_pdf(pdf_file):
17
  try:
@@ -44,6 +47,21 @@ def semantic_match(lo_list, content):
44
  scores = [cosine_similarity([content_vec], [vec])[0][0] for vec in vectors[1:]]
45
  return scores
46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  def compare_all(old_pdf, new_pdf, lo_file):
48
  try:
49
  los = lo_file.decode("utf-8", errors="ignore").splitlines()
@@ -57,22 +75,12 @@ def compare_all(old_pdf, new_pdf, lo_file):
57
  if not old_text.strip() or not new_text.strip():
58
  return "❌ Could not extract text from one or both PDFs.", None, None
59
 
60
- # Similarity Calculations
61
- tfidf_vectorizer = TfidfVectorizer().fit_transform([old_text, new_text])
62
- tfidf_score = cosine_similarity([tfidf_vectorizer.toarray()[0]], [tfidf_vectorizer.toarray()[1]])[0][0] * 100
63
-
64
- embed_old = model.encode(old_text, convert_to_tensor=True)
65
- embed_new = model.encode(new_text, convert_to_tensor=True)
66
- semantic_score = float(cosine_similarity([embed_old], [embed_new])[0][0]) * 100
67
-
68
- # LO Scores
69
  old_scores = semantic_match(los, old_text)
70
  new_scores = semantic_match(los, new_text)
71
 
72
- # Bar Plot
73
  labels = [f"LO{i+1}" for i in range(len(los))]
74
  x = range(len(labels))
75
- fig, ax = plt.subplots(figsize=(10, 5))
76
  ax.bar(x, old_scores, width=0.4, label="Old", align='center')
77
  ax.bar([i + 0.4 for i in x], new_scores, width=0.4, label="New", align='center')
78
  ax.set_xticks([i + 0.2 for i in x])
@@ -90,25 +98,29 @@ def compare_all(old_pdf, new_pdf, lo_file):
90
  }
91
  df = pd.DataFrame(data)
92
 
93
- # Insight Generation
94
- lo_diff = sum(new_scores) - sum(old_scores)
95
- if abs(lo_diff) < 0.01:
96
- insight = "⚪ No significant change in alignment with learning outcomes."
97
- elif lo_diff > 0:
98
- insight = "🟢 New content appears more aligned with outcomes."
99
- else:
100
- insight = "🔴 New content appears less aligned with outcomes."
 
 
 
101
 
102
- matched_lo = sum(1 for s in new_scores if s >= 0.5)
103
- total_lo = len(los)
 
 
104
 
105
- summary = f"📘 Summary of Comparison\n\n"
106
- summary += f"🔹 Semantic Similarity (Transformer): {round(semantic_score, 2)}%\n"
107
- summary += f"🔹 Structural Similarity (TF-IDF): {round(tfidf_score, 2)}%\n\n"
108
- summary += f"🎯 Learning Outcome Matches: {matched_lo} of {total_lo}\n"
109
- summary += insight
110
 
111
- return summary, df, fig
112
 
113
  iface = gr.Interface(
114
  fn=compare_all,
@@ -118,12 +130,12 @@ iface = gr.Interface(
118
  gr.File(label="Learning Outcomes (Text File)", type='binary'),
119
  ],
120
  outputs=[
121
- gr.Textbox(label="Summary & Insights", lines=10),
122
- gr.Dataframe(label="LO-wise Comparison Table"),
123
- gr.Plot(label="Visual Comparison Chart")
124
  ],
125
- title="📘 Handout Comparator + LO Analysis (Dual Similarity)",
126
- description="Upload old/new handouts + Learning Outcomes file (TXT). See content diff, LO alignment, and dual similarity scoring (TF-IDF + Transformers)."
127
  )
128
 
129
  iface.launch()
 
1
 
2
+
3
  import gradio as gr
4
  from PyPDF2 import PdfReader
5
  from pdf2image import convert_from_bytes
 
8
  import io
9
  from sklearn.feature_extraction.text import TfidfVectorizer
10
  from sklearn.metrics.pairwise import cosine_similarity
11
+ from transformers import pipeline
12
  import matplotlib.pyplot as plt
13
  import pandas as pd
14
+ from difflib import SequenceMatcher
15
 
16
+ # Load transformer model for semantic similarity
17
+ semantic_pipeline = pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2")
18
 
19
  def extract_text_from_pdf(pdf_file):
20
  try:
 
47
  scores = [cosine_similarity([content_vec], [vec])[0][0] for vec in vectors[1:]]
48
  return scores
49
 
50
def compute_difference_and_text_change(old_text, new_text):
    """Quantify how much ``new_text`` differs from ``old_text``.

    Args:
        old_text: Text of the original document.
        new_text: Text of the revised document.

    Returns:
        A ``(difference_percentage, length_change)`` tuple, both rounded to
        two decimals:

        * ``difference_percentage`` is ``(1 - SequenceMatcher ratio) * 100``,
          computed over the raw character sequences.
        * ``length_change`` is the relative change in whitespace-separated
          word count, in percent (negative when the text shrank).
    """
    similarity = SequenceMatcher(None, old_text, new_text).ratio()
    difference_percentage = round((1 - similarity) * 100, 2)

    len_old = len(old_text.split())
    len_new = len(new_text.split())
    if len_old == 0:
        # Relative growth is undefined without a baseline. Report "no change"
        # when both sides are empty and treat any new words as 100% new
        # content, instead of raising ZeroDivisionError.
        length_change = 0.0 if len_new == 0 else 100.0
    else:
        length_change = round(((len_new - len_old) / len_old) * 100, 2)
    return difference_percentage, length_change
58
+
59
def transformer_similarity(text1, text2):
    """Return the embedding cosine similarity of two texts as a percentage.

    Each text is embedded with the module-level ``semantic_pipeline``
    (a ``feature-extraction`` pipeline) and the two embeddings are compared
    with ``cosine_similarity``.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The cosine similarity multiplied by 100 and rounded to two decimals.
    """

    def _embed(text):
        # truncation=True clips the input to the model's maximum sequence
        # length; without it, long documents fail inside the model.
        token_vectors = semantic_pipeline(text, truncation=True)[0]
        # Mean-pool over tokens. Taking only the first token's vector (the
        # previous behaviour) is not the sentence embedding this model family
        # is trained to produce.
        token_count = len(token_vectors)
        return [sum(dim) / token_count for dim in zip(*token_vectors)]

    sim = cosine_similarity([_embed(text1)], [_embed(text2)])[0][0]
    return round(float(sim) * 100, 2)
64
+
65
  def compare_all(old_pdf, new_pdf, lo_file):
66
  try:
67
  los = lo_file.decode("utf-8", errors="ignore").splitlines()
 
75
  if not old_text.strip() or not new_text.strip():
76
  return "❌ Could not extract text from one or both PDFs.", None, None
77
 
 
 
 
 
 
 
 
 
 
78
  old_scores = semantic_match(los, old_text)
79
  new_scores = semantic_match(los, new_text)
80
 
 
81
  labels = [f"LO{i+1}" for i in range(len(los))]
82
  x = range(len(labels))
83
+ fig, ax = plt.subplots()
84
  ax.bar(x, old_scores, width=0.4, label="Old", align='center')
85
  ax.bar([i + 0.4 for i in x], new_scores, width=0.4, label="New", align='center')
86
  ax.set_xticks([i + 0.2 for i in x])
 
98
  }
99
  df = pd.DataFrame(data)
100
 
101
+ # Calculate metrics
102
+ tfidf_similarity = round(cosine_similarity(
103
+ [TfidfVectorizer().fit_transform([old_text, new_text]).toarray()[0]],
104
+ [TfidfVectorizer().fit_transform([old_text, new_text]).toarray()[1]]
105
+ )[0][0] * 100, 2)
106
+
107
+ diff_pct, length_delta = compute_difference_and_text_change(old_text, new_text)
108
+ transformer_sim = transformer_similarity(old_text, new_text)
109
+
110
+ summary = f"""
111
+ 📘 **Summary of Comparison**
112
 
113
+ 📈 **TF-IDF Similarity**: {tfidf_similarity}%
114
+ 🤖 **Transformer Similarity**: {transformer_sim}%
115
+ 🔄 **Textual Change** (Diff-based): {diff_pct}%
116
+ πŸ“ **Text Length Change**: {length_delta}% (words)
117
 
118
+ 🎯 **Learning Outcome Matches**: {sum(1 for s in new_scores if s >= 0.5)} of {len(los)}
119
+ 🧠 **Insight**: New content appears {'more' if sum(new_scores) > sum(old_scores) else 'less'} aligned with the learning outcomes.
120
+ 💬 **Tip**: Diff > 30% or word increase > 20% generally reflects real updates.
121
+ """
 
122
 
123
+ return summary.strip(), df, fig
124
 
125
  iface = gr.Interface(
126
  fn=compare_all,
 
130
  gr.File(label="Learning Outcomes (Text File)", type='binary'),
131
  ],
132
  outputs=[
133
+ gr.Textbox(label="📊 Summary Report", lines=12),
134
+ gr.Dataframe(label="📋 LO-wise Comparison Table"),
135
+ gr.Plot(label="📈 LO Match Chart")
136
  ],
137
+ title="📘 Handout Comparator + LO Analyzer (with AI)",
138
+ description="Compare two handouts and learning outcomes. View similarity via TF-IDF and Transformers. Bar chart and table included."
139
  )
140
 
141
  iface.launch()