davidtran999 committed on
Commit
c330e24
·
verified ·
1 Parent(s): 60a2a10

Upload hue_portal/core/pure_semantic_search.py with huggingface_hub

Browse files
hue_portal/core/pure_semantic_search.py ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pure Semantic Search - 100% vector search with multi-query support.
3
+
4
+ This module implements pure semantic search (no BM25) which is the recommended
5
+ approach when using Query Rewrite Strategy + BGE-M3. All top systems have moved
6
+ away from hybrid search (BM25 + Vector) to pure semantic search since Oct 2025.
7
+ """
8
+ import logging
9
+ from typing import List, Tuple, Optional, Dict, Any, Set
10
+ from concurrent.futures import ThreadPoolExecutor, as_completed
11
+ from django.db.models import QuerySet
12
+
13
+ from .embeddings import (
14
+ get_embedding_model,
15
+ generate_embedding,
16
+ cosine_similarity
17
+ )
18
+ from .embedding_utils import load_embedding
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+ # Minimum vector score threshold
23
+ DEFAULT_MIN_VECTOR_SCORE = 0.1
24
+
25
+
26
def get_vector_scores(
    queryset: QuerySet,
    query: str,
    top_k: int = 20
) -> List[Tuple[Any, float]]:
    """
    Score the objects of ``queryset`` against ``query`` by cosine similarity.

    The query is embedded once and compared with the embedding that
    ``load_embedding`` returns for each object. An object is left out when
    it has no embedding, when its embedding length differs from the query
    embedding length, or when its similarity is below
    ``DEFAULT_MIN_VECTOR_SCORE``.

    Args:
        queryset: Django QuerySet to search.
        query: Search query string.
        top_k: Requested number of results. Up to ``top_k * 2`` candidates
            are returned so callers have headroom for merging/re-ranking.

    Returns:
        List of (object, vector_score) tuples sorted by score descending.
        Empty when the query is blank, ``top_k`` is not positive, or
        ``get_embedding_model()`` / ``generate_embedding()`` returns None.
    """
    if not query or not query.strip():
        return []

    # A non-positive top_k would turn the final slice into a negative-index
    # slice (silently dropping items from the tail) rather than "no results".
    if top_k <= 0:
        return []

    model = get_embedding_model()
    if model is None:
        return []

    query_embedding = generate_embedding(query, model=model)
    if query_embedding is None:
        return []

    query_dim = len(query_embedding)
    mismatch_logged = False

    scores: List[Tuple[Any, float]] = []
    for obj in queryset:
        obj_embedding = load_embedding(obj)
        if obj_embedding is None:
            continue

        obj_dim = len(obj_embedding)
        if obj_dim != query_dim:
            # Vectors of different length cannot be compared; skip the
            # object and warn only once to avoid flooding the log.
            if not mismatch_logged:
                logger.warning(
                    f"Dimension mismatch: query={query_dim}, stored={obj_dim}. Skipping vector search."
                )
                mismatch_logged = True
            continue

        similarity = cosine_similarity(query_embedding, obj_embedding)
        if similarity >= DEFAULT_MIN_VECTOR_SCORE:
            scores.append((obj, similarity))

    # Best matches first; an empty list simply stays empty.
    scores.sort(key=lambda pair: pair[1], reverse=True)
    return scores[:top_k * 2]  # Get more for merging with other queries
90
+
91
+
92
def calculate_exact_match_boost(obj: Any, query: str, text_fields: List[str]) -> float:
    """
    Calculate a boost score for literal keyword matches in title/name fields.

    Only the first two entries of ``text_fields`` are inspected. For each of
    those fields that is present and non-empty on ``obj``, the lower-cased
    value earns:

    * +0.5 for every 2- or 3-word phrase of the query found in it, plus
      +0.3 when the field is exactly that phrase;
    * +0.4 when the whole query is a substring of it;
    * +0.1 per distinct query word (longer than 2 characters) found in it,
      counting at most 3 words.

    Args:
        obj: Object (e.g. Django model instance) whose attributes are read.
        query: Search query string.
        text_fields: Field names to check (first 2 are usually title/name).

    Returns:
        Boost score in the range 0.0 to 1.0. 0.0 when the query is empty or
        whitespace-only, or when ``text_fields`` is empty.
    """
    if not query or not text_fields:
        return 0.0

    query_lower = query.lower().strip()
    if not query_lower:
        # Whitespace-only query: "" is a substring of every string, so
        # without this guard every non-empty field would earn a boost.
        return 0.0

    query_words = query_lower.split()

    # Key phrases: every 2-word window, then every 3-word window, skipping
    # very short ones (<= 3 and <= 5 characters respectively).
    key_phrases = []
    for size, min_len in ((2, 3), (3, 5)):
        for start in range(len(query_words) - size + 1):
            phrase = " ".join(query_words[start:start + size])
            if len(phrase) > min_len:
                key_phrases.append(phrase)

    # Individual words, ignoring those of 1-2 characters.
    query_words_set = {word for word in query_words if len(word) > 2}

    boost = 0.0

    # Check primary fields (title, name); first 2 fields are usually those.
    for field in text_fields[:2]:
        raw_value = getattr(obj, field, None)
        if raw_value is None:
            # Missing attribute or NULL column. str(None) would yield
            # "None" and falsely match a query containing the word "none".
            continue

        field_value = str(raw_value).lower()
        if not field_value:
            continue

        stripped_value = field_value.strip()
        for phrase in key_phrases:
            if phrase in field_value:
                boost += 0.5  # phrase found in the field
                if stripped_value == phrase:
                    boost += 0.3  # field is exactly the phrase

        if query_lower in field_value:
            boost += 0.4  # whole query appears in the field

        matched_words = sum(1 for word in query_words_set if word in field_value)
        if matched_words > 0:
            boost += 0.1 * min(matched_words, 3)  # Cap at 3 words

    return min(boost, 1.0)  # Cap at 1.0 for very strong matches
153
+
154
+
155
def parallel_vector_search(
    queries: List[str],
    queryset: QuerySet,
    top_k_per_query: int = 5,
    final_top_k: int = 7,
    text_fields: Optional[List[str]] = None
) -> List[Tuple[Any, float]]:
    """
    Search with multiple queries in parallel, then merge results.

    Each distinct query is scored with ``get_vector_scores`` on a worker
    thread. Results are merged by keeping, for every object, the highest
    score any query gave it. When ``text_fields`` is given, the merged score
    is blended with ``calculate_exact_match_boost`` for the first query
    (80% vector score, 20% exact-match boost).

    Args:
        queries: List of rewritten queries (3-5 queries from Query Rewrite).
            The first entry is treated as the primary query; if it is blank
            no search is performed.
        queryset: Django QuerySet to search.
        top_k_per_query: Top K results per query (default: 5).
        final_top_k: Final top K results after merging (default: 7).
        text_fields: Optional list of field names for exact match boost.

    Returns:
        List of (object, combined_score) tuples, sorted by score descending.

    Example:
        queries = [
            "nội dung điều 12",      # "content of article 12"
            "quy định điều 12",      # "provisions of article 12"
            "điều 12 quy định về",   # "article 12 regulates"
        ]
        results = parallel_vector_search(queries, LegalSection.objects.all())
        # Returns top 7 sections with highest combined scores
    """
    if not queries or not queries[0].strip():
        return []

    # Identical rewrites would each trigger a full scan for the same
    # result; keep the first occurrence of every query, order preserved.
    unique_queries = list(dict.fromkeys(queries))

    if len(unique_queries) == 1:
        # Single query - use direct vector search
        return _single_query_search(
            unique_queries[0], queryset, top_k=final_top_k, text_fields=text_fields
        )

    # Evaluate the queryset once, here, before fanning out. A Django
    # QuerySet caches its rows after the first evaluation, so the workers
    # below reuse that cache instead of each issuing the same database
    # query. An empty queryset cannot produce any match, so stop early.
    if not queryset:
        return []

    all_results: Dict[Any, float] = {}  # object -> max score over all queries

    with ThreadPoolExecutor(max_workers=min(len(unique_queries), 5)) as executor:
        future_to_query = {
            executor.submit(get_vector_scores, queryset, query, top_k=top_k_per_query): query
            for query in unique_queries
        }

        # Collect results as they complete. A failing query is logged and
        # skipped so the remaining queries still contribute (best effort).
        for future in as_completed(future_to_query):
            query = future_to_query[future]
            try:
                results = future.result()
                for obj, score in results:
                    if obj in all_results:
                        all_results[obj] = max(all_results[obj], score)
                    else:
                        all_results[obj] = score
            except Exception as e:
                logger.warning(f"[PARALLEL_SEARCH] Error searching with query '{query}': {e}")

    if text_fields:
        # The first query is used as the reference text for the boost.
        primary_query = unique_queries[0]
        all_results_list = [
            (
                obj,
                # 80% vector, 20% exact match
                score * 0.8
                + calculate_exact_match_boost(obj, primary_query, text_fields) * 0.2,
            )
            for obj, score in all_results.items()
        ]
    else:
        all_results_list = list(all_results.items())

    all_results_list.sort(key=lambda pair: pair[1], reverse=True)
    return all_results_list[:final_top_k]
236
+
237
+
238
def _single_query_search(
    query: str,
    queryset: QuerySet,
    top_k: int = 20,
    text_fields: Optional[List[str]] = None
) -> List[Tuple[Any, float]]:
    """
    Run one vector search and optionally re-rank it with exact-match boost.

    Candidates come from ``get_vector_scores``. When ``text_fields`` is
    non-empty, each candidate's score becomes 80% vector score plus 20%
    ``calculate_exact_match_boost`` and the list is re-sorted.

    Args:
        query: Search query string.
        queryset: Django QuerySet to search.
        top_k: Maximum number of results.
        text_fields: Optional list of field names for exact match boost.

    Returns:
        List of (object, score) tuples, sorted by score descending, at most
        ``top_k`` long.
    """
    ranked = get_vector_scores(queryset, query, top_k=top_k)

    if text_fields:
        # Blend in the literal-match signal, then restore descending order.
        ranked = [
            (
                candidate,
                similarity * 0.8
                + calculate_exact_match_boost(candidate, query, text_fields) * 0.2,
            )
            for candidate, similarity in ranked
        ]
        ranked.sort(key=lambda entry: entry[1], reverse=True)

    return ranked[:top_k]
273
+
274
+
275
def pure_semantic_search(
    queries: List[str],
    queryset: QuerySet,
    top_k: int = 20,
    text_fields: Optional[List[str]] = None
) -> List[Any]:
    """
    Pure semantic search (100% vector, no BM25).

    Dispatches to ``_single_query_search`` for a single query and to
    ``parallel_vector_search`` for several, then strips the scores.

    Args:
        queries: List of queries (1 query or 3-5 queries from Query Rewrite).
        queryset: Django QuerySet to search.
        top_k: Maximum number of results.
        text_fields: Optional list of field names for exact match boost.

    Returns:
        List of objects sorted by score (highest first); empty when
        ``queries`` is empty.

    Usage:
        # Single query ("penalty for violation")
        results = pure_semantic_search(["mức phạt vi phạm"], queryset, top_k=20)

        # Multiple queries (from Query Rewrite)
        rewritten_queries = query_rewriter.rewrite_query("mức phạt vi phạm")
        results = pure_semantic_search(rewritten_queries, queryset, top_k=20)
    """
    if not queries:
        return []

    query_count = len(queries)
    if query_count == 1:
        scored = _single_query_search(
            queries[0], queryset, top_k=top_k, text_fields=text_fields
        )
    else:
        # Split the budget across queries, but never below 5 per query.
        per_query_k = max(5, top_k // query_count)
        scored = parallel_vector_search(
            queries,
            queryset,
            top_k_per_query=per_query_k,
            final_top_k=top_k,
            text_fields=text_fields,
        )

    # Callers only need the objects, in ranked order.
    return [match for match, _score in scored]
322
+