aradhyapavan committed on
Commit
815ff45
·
verified ·
1 Parent(s): 653544a

Update components/vector_embeddings.py

Browse files
Files changed (1) hide show
  1. components/vector_embeddings.py +272 -241
components/vector_embeddings.py CHANGED
@@ -1,241 +1,272 @@
1
- import matplotlib
2
- matplotlib.use('Agg') # Use non-GUI backend
3
- import matplotlib.pyplot as plt
4
- import numpy as np
5
- import pandas as pd
6
- import spacy
7
- import time
8
- import faiss
9
- from sentence_transformers import SentenceTransformer, util
10
- from sklearn.decomposition import PCA
11
- import textwrap
12
- from sklearn.metrics.pairwise import cosine_similarity
13
-
14
- from utils.model_loader import load_embedding_model
15
- from utils.helpers import fig_to_html, df_to_html_table
16
-
17
def vector_embeddings_handler(text_input, search_query=""):
    """Render the vector-embeddings analysis panel as an HTML fragment.

    The text is split into sentence segments with spaCy, the segments are
    encoded with the model returned by ``load_embedding_model()``, and the
    outcome is rendered as a set of cards: processing statistics, the first
    few segments, and the semantic-search widget.

    Args:
        text_input: Raw user text to analyse.
        search_query: Accepted for interface compatibility; it is not read
            here (the rendered widget triggers ``performSemanticSearch()``
            from its buttons instead).

    Returns:
        str: The HTML fragment, joined with newlines. Failures are reported
        inside the fragment as an alert rather than raised.
    """
    # Local import keeps the block self-contained; user text and exception
    # messages must be escaped before being interpolated into markup.
    from html import escape

    output_html = [
        '<div class="result-area">',
        '<h2 class="task-header">Vector Embeddings Analysis Results</h2>',
    ]

    # Nothing to embed: say so up front instead of failing inside the model.
    if not text_input or not text_input.strip():
        output_html.append("""
        <div class="alert alert-warning">
            <h4><i class="fas fa-exclamation-circle me-2"></i>No Text Provided</h4>
            <p class="mb-0">Please enter some text to generate vector embeddings.</p>
        </div>
        """)
        output_html.append('</div>')
        return '\n'.join(output_html)

    # Load model and create embeddings
    try:
        model = load_embedding_model()

        # Split the text into chunks (sentences).
        # NOTE(review): the spaCy pipeline is loaded on every call; consider
        # caching it if this shows up in profiles.
        import spacy
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(text_input)
        sentences = [
            stripped
            for stripped in (sent.text.strip() for sent in doc.sents)
            if len(stripped) > 10
        ]

        # If we have too few sentences, create artificial word chunks. The
        # trailing partial chunk is kept so short inputs still yield at
        # least one segment and no words are silently dropped.
        if len(sentences) < 3:
            words = text_input.split()
            chunk_size = max(10, len(words) // 3)
            sentences = [
                ' '.join(words[start:start + chunk_size])
                for start in range(0, len(words), chunk_size)
            ]

        # Limit to 10 segments to avoid overwhelming the visualization
        sentences = sentences[:10]

        # Create embeddings
        embeddings = model.encode(sentences)

        # Only announce success once the embeddings actually exist.
        output_html.append("""
        <div class="alert alert-success">
            <h4><i class="fas fa-check-circle me-2"></i>Embeddings Generated Successfully!</h4>
            <p class="mb-0">Your text has been processed and converted into high-dimensional vector representations.</p>
        </div>
        """)

        # Text Statistics
        output_html.append(f"""
        <div class="row mb-4">
            <div class="col-12">
                <div class="card">
                    <div class="card-header bg-primary text-white">
                        <h4 class="mb-0"><i class="fas fa-chart-bar me-2"></i>Processing Statistics</h4>
                    </div>
                    <div class="card-body">
                        <div class="row text-center">
                            <div class="col-md-3">
                                <div class="stat-item">
                                    <h3 class="text-primary">{len(text_input)}</h3>
                                    <p class="text-muted mb-0">Characters</p>
                                </div>
                            </div>
                            <div class="col-md-3">
                                <div class="stat-item">
                                    <h3 class="text-success">{len(sentences)}</h3>
                                    <p class="text-muted mb-0">Text Segments</p>
                                </div>
                            </div>
                            <div class="col-md-3">
                                <div class="stat-item">
                                    <h3 class="text-info">{embeddings.shape[1]}</h3>
                                    <p class="text-muted mb-0">Vector Dimensions</p>
                                </div>
                            </div>
                            <div class="col-md-3">
                                <div class="stat-item">
                                    <h3 class="text-warning">{embeddings.shape[0]}</h3>
                                    <p class="text-muted mb-0">Embedding Vectors</p>
                                </div>
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>
        """)

        # Text Segments Display
        output_html.append("""
        <div class="row mb-4">
            <div class="col-12">
                <div class="card">
                    <div class="card-header bg-info text-white">
                        <h4 class="mb-0"><i class="fas fa-list me-2"></i>Text Segments</h4>
                    </div>
                    <div class="card-body">
                        <div class="row">
        """)

        # Show max 6 segments; the text comes from the user, so escape it.
        for number, sentence in enumerate(sentences[:6], start=1):
            output_html.append(f"""
                            <div class="col-md-6 mb-3">
                                <div class="p-3 border rounded bg-light">
                                    <h6 class="text-primary mb-2">Segment {number}</h6>
                                    <p class="mb-0 small">{escape(sentence)}</p>
                                </div>
                            </div>
            """)

        output_html.append("""
                        </div>
                    </div>
                </div>
            </div>
        </div>
        """)

        # Semantic Search Interface
        output_html.append("""
        <div class="row mb-4">
            <div class="col-12">
                <div class="card border-warning">
                    <div class="card-header bg-warning text-dark">
                        <h4 class="mb-0"><i class="fas fa-search me-2"></i>Semantic Search</h4>
                    </div>
                    <div class="card-body">
                        <p class="mb-3">Search for content by meaning, not just keywords. The system will find the most semantically similar text segments.</p>

                        <div class="row mb-3">
                            <div class="col-md-10">
                                <input type="text" id="search-input" class="form-control form-control-lg" placeholder="Enter a search query to find similar content...">
                            </div>
                            <div class="col-md-2">
                                <button onclick="performSemanticSearch()" class="btn btn-warning btn-lg w-100">
                                    <i class="fas fa-search me-1"></i>Search
                                </button>
                            </div>
                        </div>

                        <div class="mb-3">
                            <h6 class="mb-2"><i class="fas fa-lightbulb me-2"></i>Try these example searches:</h6>
                            <div class="d-flex flex-wrap gap-2">
                                <button onclick="document.getElementById('search-input').value = 'space research'; performSemanticSearch();"
                                        class="btn btn-outline-secondary btn-sm">
                                    <i class="fas fa-rocket me-1"></i>space research
                                </button>
                                <button onclick="document.getElementById('search-input').value = 'scientific collaboration'; performSemanticSearch();"
                                        class="btn btn-outline-secondary btn-sm">
                                    <i class="fas fa-users me-1"></i>scientific collaboration
                                </button>
                                <button onclick="document.getElementById('search-input').value = 'international project'; performSemanticSearch();"
                                        class="btn btn-outline-secondary btn-sm">
                                    <i class="fas fa-globe me-1"></i>international project
                                </button>
                                <button onclick="document.getElementById('search-input').value = 'laboratory experiments'; performSemanticSearch();"
                                        class="btn btn-outline-secondary btn-sm">
                                    <i class="fas fa-flask me-1"></i>laboratory experiments
                                </button>
                                <button onclick="document.getElementById('search-input').value = 'space agencies'; performSemanticSearch();"
                                        class="btn btn-outline-secondary btn-sm">
                                    <i class="fas fa-building me-1"></i>space agencies
                                </button>
                                <button onclick="document.getElementById('search-input').value = 'microgravity environment'; performSemanticSearch();"
                                        class="btn btn-outline-secondary btn-sm">
                                    <i class="fas fa-weight me-1"></i>microgravity environment
                                </button>
                            </div>
                        </div>

                        <div id="search-results" style="display: none;">
                            <hr>
                            <h5><i class="fas fa-list-ol me-2"></i>Search Results:</h5>
                            <div id="results-container" class="border rounded p-3 bg-light" style="max-height: 400px; overflow-y: auto;">
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>
        """)

    except Exception as e:
        # Report the failure inside the page; the message is escaped because
        # it may echo parts of the user's input.
        output_html.append(f"""
        <div class="alert alert-danger">
            <h4><i class="fas fa-exclamation-triangle me-2"></i>Error</h4>
            <p>Could not generate embeddings: {escape(str(e))}</p>
        </div>
        """)

    # Close result-area div
    output_html.append('</div>')
    return '\n'.join(output_html)
201
-
202
def perform_semantic_search(context, query):
    """Rank the sentences of ``context`` by semantic similarity to ``query``.

    The context is split into sentences with spaCy, sentences and query are
    encoded with the model returned by ``load_embedding_model()``, and the
    sentences are ordered by cosine similarity to the query.

    Args:
        context: Text to search in.
        query: Search phrase.

    Returns:
        dict: ``{"success": True, "results": [{"text", "score"}, ...]}`` with
        at most five entries, best match first, or
        ``{"success": False, "error": <message>}`` when the input is unusable
        or any step fails. Nothing is raised to the caller.
    """
    # Reject unusable input up front with a readable message, rather than
    # letting an empty embedding set reach the similarity call.
    if not context or not context.strip():
        return {"success": False, "error": "No text was provided to search."}
    if not query or not query.strip():
        return {"success": False, "error": "Search query is empty."}

    try:
        # Load model
        model = load_embedding_model()

        # Split context into sentences
        import spacy
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(context)
        sentences = [
            stripped
            for stripped in (sent.text.strip() for sent in doc.sents)
            if len(stripped) > 5
        ]
        if not sentences:
            return {
                "success": False,
                "error": "The text does not contain any searchable sentences.",
            }

        # Create embeddings
        sentence_embeddings = model.encode(sentences)
        query_embedding = model.encode([query])[0]

        # Calculate similarities
        from sentence_transformers import util
        similarities = util.pytorch_cos_sim(query_embedding, sentence_embeddings)[0].cpu().numpy()

        # Pair each sentence with its score and order best-first. sorted()
        # is stable, so ties keep their original document order.
        ranked = sorted(
            zip(sentences, (float(score) for score in similarities)),
            key=lambda pair: pair[1],
            reverse=True,
        )

        # Return top 5 results
        return {
            "success": True,
            "results": [
                {"text": text, "score": score}
                for text, score in ranked[:5]
            ],
        }

    except Exception as e:
        return {
            "success": False,
            "error": str(e),
        }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import matplotlib
2
+ matplotlib.use('Agg') # Use non-GUI backend
3
+ import matplotlib.pyplot as plt
4
+ import numpy as np
5
+ import pandas as pd
6
+ import spacy
7
+ import time
8
+ import faiss
9
+ from sentence_transformers import SentenceTransformer, util
10
+ from sklearn.decomposition import PCA
11
+ import textwrap
12
+ from sklearn.metrics.pairwise import cosine_similarity
13
+
14
+ from utils.model_loader import load_embedding_model
15
+ from utils.helpers import fig_to_html, df_to_html_table
16
+
17
def vector_embeddings_handler(text_input, search_query=""):
    """Render the vector-embeddings analysis panel as an HTML fragment.

    The text is split into sentence segments with spaCy, the segments are
    encoded with the model returned by ``load_embedding_model()``, and the
    outcome is rendered as a set of cards: processing statistics, the first
    few segments, and the semantic-search widget. The output of
    ``get_about_section()`` is always appended after the result area.

    Args:
        text_input: Raw user text to analyse.
        search_query: Accepted for interface compatibility; it is not read
            here (the rendered widget triggers ``performSemanticSearch()``
            from its buttons instead).

    Returns:
        str: The HTML fragment, joined with newlines. Failures are reported
        inside the fragment as an alert rather than raised.
    """
    # Local import keeps the block self-contained; user text and exception
    # messages must be escaped before being interpolated into markup.
    from html import escape

    output_html = [
        '<div class="result-area">',
        '<h2 class="task-header">Vector Embeddings Analysis Results</h2>',
    ]

    # Nothing to embed: say so up front instead of failing inside the model.
    if not text_input or not text_input.strip():
        output_html.append("""
        <div class="alert alert-warning">
            <h4><i class="fas fa-exclamation-circle me-2"></i>No Text Provided</h4>
            <p class="mb-0">Please enter some text to generate vector embeddings.</p>
        </div>
        """)
        output_html.append('</div>')
        output_html.append(get_about_section())
        return '\n'.join(output_html)

    # Load model and create embeddings
    try:
        model = load_embedding_model()

        # Split the text into chunks (sentences).
        # NOTE(review): the spaCy pipeline is loaded on every call; consider
        # caching it if this shows up in profiles.
        import spacy
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(text_input)
        sentences = [
            stripped
            for stripped in (sent.text.strip() for sent in doc.sents)
            if len(stripped) > 10
        ]

        # If we have too few sentences, create artificial word chunks. The
        # trailing partial chunk is kept so short inputs still yield at
        # least one segment and no words are silently dropped.
        if len(sentences) < 3:
            words = text_input.split()
            chunk_size = max(10, len(words) // 3)
            sentences = [
                ' '.join(words[start:start + chunk_size])
                for start in range(0, len(words), chunk_size)
            ]

        # Limit to 10 segments to avoid overwhelming the visualization
        sentences = sentences[:10]

        # Create embeddings
        embeddings = model.encode(sentences)

        # Only announce success once the embeddings actually exist.
        output_html.append("""
        <div class="alert alert-success">
            <h4><i class="fas fa-check-circle me-2"></i>Embeddings Generated Successfully!</h4>
            <p class="mb-0">Your text has been processed and converted into high-dimensional vector representations.</p>
        </div>
        """)

        # Text Statistics
        output_html.append(f"""
        <div class="row mb-4">
            <div class="col-12">
                <div class="card">
                    <div class="card-header bg-primary text-white">
                        <h4 class="mb-0"><i class="fas fa-chart-bar me-2"></i>Processing Statistics</h4>
                    </div>
                    <div class="card-body">
                        <div class="row text-center">
                            <div class="col-md-3">
                                <div class="stat-item">
                                    <h3 class="text-primary">{len(text_input)}</h3>
                                    <p class="text-muted mb-0">Characters</p>
                                </div>
                            </div>
                            <div class="col-md-3">
                                <div class="stat-item">
                                    <h3 class="text-success">{len(sentences)}</h3>
                                    <p class="text-muted mb-0">Text Segments</p>
                                </div>
                            </div>
                            <div class="col-md-3">
                                <div class="stat-item">
                                    <h3 class="text-info">{embeddings.shape[1]}</h3>
                                    <p class="text-muted mb-0">Vector Dimensions</p>
                                </div>
                            </div>
                            <div class="col-md-3">
                                <div class="stat-item">
                                    <h3 class="text-warning">{embeddings.shape[0]}</h3>
                                    <p class="text-muted mb-0">Embedding Vectors</p>
                                </div>
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>
        """)

        # Text Segments Display
        output_html.append("""
        <div class="row mb-4">
            <div class="col-12">
                <div class="card">
                    <div class="card-header bg-info text-white">
                        <h4 class="mb-0"><i class="fas fa-list me-2"></i>Text Segments</h4>
                    </div>
                    <div class="card-body">
                        <div class="row">
        """)

        # Show max 6 segments; the text comes from the user, so escape it.
        for number, sentence in enumerate(sentences[:6], start=1):
            output_html.append(f"""
                            <div class="col-md-6 mb-3">
                                <div class="p-3 border rounded bg-light">
                                    <h6 class="text-primary mb-2">Segment {number}</h6>
                                    <p class="mb-0 small">{escape(sentence)}</p>
                                </div>
                            </div>
            """)

        output_html.append("""
                        </div>
                    </div>
                </div>
            </div>
        </div>
        """)

        # Semantic Search Interface
        output_html.append("""
        <div class="row mb-4">
            <div class="col-12">
                <div class="card border-warning">
                    <div class="card-header bg-warning text-dark">
                        <h4 class="mb-0"><i class="fas fa-search me-2"></i>Semantic Search</h4>
                    </div>
                    <div class="card-body">
                        <p class="mb-3">Search for content by meaning, not just keywords. The system will find the most semantically similar text segments.</p>

                        <div class="row mb-3">
                            <div class="col-md-10">
                                <input type="text" id="search-input" class="form-control form-control-lg" placeholder="Enter a search query to find similar content...">
                            </div>
                            <div class="col-md-2">
                                <button onclick="performSemanticSearch()" class="btn btn-warning btn-lg w-100">
                                    <i class="fas fa-search me-1"></i>Search
                                </button>
                            </div>
                        </div>

                        <div class="mb-3">
                            <h6 class="mb-2"><i class="fas fa-lightbulb me-2"></i>Try these example searches:</h6>
                            <div class="d-flex flex-wrap gap-2">
                                <button onclick="document.getElementById('search-input').value = 'space research'; performSemanticSearch();"
                                        class="btn btn-outline-secondary btn-sm">
                                    <i class="fas fa-rocket me-1"></i>space research
                                </button>
                                <button onclick="document.getElementById('search-input').value = 'scientific collaboration'; performSemanticSearch();"
                                        class="btn btn-outline-secondary btn-sm">
                                    <i class="fas fa-users me-1"></i>scientific collaboration
                                </button>
                                <button onclick="document.getElementById('search-input').value = 'international project'; performSemanticSearch();"
                                        class="btn btn-outline-secondary btn-sm">
                                    <i class="fas fa-globe me-1"></i>international project
                                </button>
                                <button onclick="document.getElementById('search-input').value = 'laboratory experiments'; performSemanticSearch();"
                                        class="btn btn-outline-secondary btn-sm">
                                    <i class="fas fa-flask me-1"></i>laboratory experiments
                                </button>
                                <button onclick="document.getElementById('search-input').value = 'space agencies'; performSemanticSearch();"
                                        class="btn btn-outline-secondary btn-sm">
                                    <i class="fas fa-building me-1"></i>space agencies
                                </button>
                                <button onclick="document.getElementById('search-input').value = 'microgravity environment'; performSemanticSearch();"
                                        class="btn btn-outline-secondary btn-sm">
                                    <i class="fas fa-weight me-1"></i>microgravity environment
                                </button>
                            </div>
                        </div>

                        <div id="search-results" style="display: none;">
                            <hr>
                            <h5><i class="fas fa-list-ol me-2"></i>Search Results:</h5>
                            <div id="results-container" class="border rounded p-3 bg-light" style="max-height: 400px; overflow-y: auto;">
                            </div>
                        </div>
                    </div>
                </div>
            </div>
        </div>
        """)

    except Exception as e:
        # Report the failure inside the page; the message is escaped because
        # it may echo parts of the user's input.
        output_html.append(f"""
        <div class="alert alert-danger">
            <h4><i class="fas fa-exclamation-triangle me-2"></i>Error</h4>
            <p>Could not generate embeddings: {escape(str(e))}</p>
        </div>
        """)

    # Close result-area div
    output_html.append('</div>')

    # Add About section at the end
    output_html.append(get_about_section())

    return '\n'.join(output_html)
205
+
206
def perform_semantic_search(context, query):
    """Rank the sentences of ``context`` by semantic similarity to ``query``.

    The context is split into sentences with spaCy, sentences and query are
    encoded with the model returned by ``load_embedding_model()``, and the
    sentences are ordered by cosine similarity to the query.

    Args:
        context: Text to search in.
        query: Search phrase.

    Returns:
        dict: ``{"success": True, "results": [{"text", "score"}, ...]}`` with
        at most five entries, best match first, or
        ``{"success": False, "error": <message>}`` when the input is unusable
        or any step fails. Nothing is raised to the caller.
    """
    # Reject unusable input up front with a readable message, rather than
    # letting an empty embedding set reach the similarity call.
    if not context or not context.strip():
        return {"success": False, "error": "No text was provided to search."}
    if not query or not query.strip():
        return {"success": False, "error": "Search query is empty."}

    try:
        # Load model
        model = load_embedding_model()

        # Split context into sentences
        import spacy
        nlp = spacy.load("en_core_web_sm")
        doc = nlp(context)
        sentences = [
            stripped
            for stripped in (sent.text.strip() for sent in doc.sents)
            if len(stripped) > 5
        ]
        if not sentences:
            return {
                "success": False,
                "error": "The text does not contain any searchable sentences.",
            }

        # Create embeddings
        sentence_embeddings = model.encode(sentences)
        query_embedding = model.encode([query])[0]

        # Calculate similarities
        from sentence_transformers import util
        similarities = util.pytorch_cos_sim(query_embedding, sentence_embeddings)[0].cpu().numpy()

        # Pair each sentence with its score and order best-first. sorted()
        # is stable, so ties keep their original document order.
        ranked = sorted(
            zip(sentences, (float(score) for score in similarities)),
            key=lambda pair: pair[1],
            reverse=True,
        )

        # Return top 5 results
        return {
            "success": True,
            "results": [
                {"text": text, "score": score}
                for text, score in ranked[:5]
            ],
        }

    except Exception as e:
        return {
            "success": False,
            "error": str(e),
        }
246
+
247
def get_about_section():
    """Return the static "About Vector Embeddings" information card.

    Returns:
        str: An HTML card with a short definition of vector embeddings, a
        list of applications, and a description of how the system works.
        The markup is fixed; nothing is interpolated into it.
    """
    about_card_html = """
    <div class="card mt-4">
        <div class="card-header bg-primary text-white">
            <h4><i class="fas fa-info-circle me-2"></i>About Vector Embeddings</h4>
        </div>
        <div class="card-body">
            <h5>What are Vector Embeddings?</h5>
            <p>Vector embeddings are numerical representations of text that capture semantic meaning in high-dimensional space. They convert words, sentences, or documents into dense vectors where similar content has similar vector representations.</p>

            <h5>Applications of Vector Embeddings:</h5>
            <ul>
                <li><strong>Semantic Search</strong> - Finding content based on meaning rather than exact keyword matches</li>
                <li><strong>Document Similarity</strong> - Comparing documents for content similarity and clustering</li>
                <li><strong>Recommendation Systems</strong> - Suggesting similar content based on user preferences</li>
                <li><strong>Question Answering</strong> - Finding relevant passages to answer questions</li>
                <li><strong>Content Classification</strong> - Automatically categorizing text based on semantic content</li>
                <li><strong>Language Translation</strong> - Mapping concepts across different languages</li>
            </ul>

            <h5>How It Works:</h5>
            <p>Our system uses the SentenceTransformer model to create embeddings that capture the semantic meaning of your text. The cosine similarity between vectors determines how related different pieces of content are, enabling powerful semantic search capabilities.</p>
        </div>
    </div>
    """
    return about_card_html