davidtran999 commited on
Commit
778e101
·
verified ·
1 Parent(s): fbf84fe

Upload hue_portal/core/hybrid_search.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. hue_portal/core/hybrid_search.py +636 -0
hue_portal/core/hybrid_search.py ADDED
@@ -0,0 +1,636 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hybrid search combining BM25 and vector similarity.
3
+
4
+ NOTE: This module is being phased out in favor of pure semantic search.
5
+ Pure semantic search (100% vector) is recommended when using Query Rewrite Strategy + BGE-M3.
6
+ See pure_semantic_search.py for the new implementation.
7
+ """
8
+ from typing import List, Tuple, Optional, Dict, Any
9
+ import numpy as np
10
+ from django.db import connection
11
+ from django.db.models import QuerySet, F
12
+ from django.contrib.postgres.search import SearchQuery, SearchRank
13
+
14
+ from .embeddings import (
15
+ get_embedding_model,
16
+ generate_embedding,
17
+ cosine_similarity
18
+ )
19
+ from .embedding_utils import load_embedding
20
+ from .search_ml import expand_query_with_synonyms
21
+
22
+ # Import get_vector_scores from pure_semantic_search for backward compatibility
23
+ try:
24
+ from .pure_semantic_search import get_vector_scores as _get_vector_scores_from_pure
25
+ except ImportError:
26
+ _get_vector_scores_from_pure = None
27
+
28
+
29
+ # Default weights for hybrid search
30
+ DEFAULT_BM25_WEIGHT = 0.4
31
+ DEFAULT_VECTOR_WEIGHT = 0.6
32
+
33
+ # Minimum scores
34
+ DEFAULT_MIN_BM25_SCORE = 0.0
35
+ DEFAULT_MIN_VECTOR_SCORE = 0.1
36
+
37
+
38
def calculate_exact_match_boost(obj: Any, query: str, text_fields: List[str]) -> float:
    """
    Score exact keyword overlap between the query and an object's title/name fields.

    Only the first two entries of ``text_fields`` are inspected (by convention
    the title/name fields). Verbatim multi-word phrase hits weigh the most,
    followed by a full-query substring hit, then individual word hits.

    Args:
        obj: Django model instance (any object with string attributes works).
        query: Search query string.
        text_fields: Field names to check; the first two are used.

    Returns:
        Boost score clamped to the 0.0-1.0 range.
    """
    if not query or not text_fields:
        return 0.0

    normalized_query = query.lower().strip()
    tokens = normalized_query.split()

    # Candidate phrases: all bigrams longer than 3 chars, then all trigrams
    # longer than 5 chars (joined with single spaces).
    phrases = [
        " ".join(tokens[start:start + size])
        for size, min_len in ((2, 3), (3, 5))
        for start in range(len(tokens) - size + 1)
        if len(" ".join(tokens[start:start + size])) > min_len
    ]

    # Individual tokens worth matching on their own (longer than 2 chars).
    significant_tokens = {tok for tok in tokens if len(tok) > 2}

    total = 0.0
    for field_name in text_fields[:2]:
        if not hasattr(obj, field_name):
            continue
        haystack = str(getattr(obj, field_name, "")).lower()
        if not haystack:
            continue

        for phrase in phrases:
            if phrase in haystack:
                total += 0.5  # strong signal: a multi-word phrase appears verbatim
                if haystack.strip() == phrase.strip():
                    total += 0.3  # the field IS exactly this phrase

        if normalized_query in haystack:
            total += 0.4  # the whole query appears inside the field

        hits = sum(1 for tok in significant_tokens if tok in haystack)
        if hits:
            total += 0.1 * min(hits, 3)  # word-level hits, capped at 3 words

    # Overlapping phrase hits can push the sum past 1.0; clamp it.
    return min(total, 1.0)
97
+
98
+
99
def get_bm25_scores(
    queryset: QuerySet,
    query: str,
    top_k: int = 20
) -> List[Tuple[Any, float]]:
    """
    Get BM25-style full-text rank scores for a queryset.

    Requires PostgreSQL and a ``tsv_body`` search-vector field on the model;
    otherwise returns an empty list. The query is expanded with synonyms
    (capped at 5 variants) and the variants are OR-combined into one
    SearchQuery. On RecursionError during expansion, retries with the raw
    query only. Errors are printed and swallowed — callers get [] on failure.

    Args:
        queryset: Django QuerySet to search.
        query: Search query string.
        top_k: Maximum number of results (up to 2*top_k are returned so the
            hybrid ranker has extra candidates to merge).

    Returns:
        List of (object, bm25_score) tuples ordered by descending rank.
    """
    if not query or connection.vendor != "postgresql":
        return []

    if not hasattr(queryset.model, "tsv_body"):
        return []

    try:
        import sys
        # Increase recursion limit for query expansion
        # (expand_query_with_synonyms can recurse deeply on some inputs).
        old_limit = sys.getrecursionlimit()
        try:
            sys.setrecursionlimit(3000)  # Increase limit for query expansion
            expanded_queries = expand_query_with_synonyms(query)
            # Limit expanded queries to prevent too many variants
            expanded_queries = expanded_queries[:5]  # Max 5 variants

            # OR-combine all variants into a single tsquery.
            combined_query = None
            for q_variant in expanded_queries:
                variant_query = SearchQuery(q_variant, config="simple")
                combined_query = variant_query if combined_query is None else combined_query | variant_query

            if combined_query is not None:
                ranked_qs = (
                    queryset
                    .annotate(rank=SearchRank(F("tsv_body"), combined_query))
                    .filter(rank__gt=DEFAULT_MIN_BM25_SCORE)
                    .order_by("-rank")
                )
                results = list(ranked_qs[:top_k * 2])  # Get more for hybrid ranking
                # rank comes from the annotate() above; default defensively to 0.0.
                return [(obj, float(getattr(obj, "rank", 0.0))) for obj in results]
        finally:
            sys.setrecursionlimit(old_limit)  # Restore original limit
    except RecursionError as e:
        print(f"Error in BM25 search (recursion): {e}", flush=True)
        # Fallback: use original query without expansion
        try:
            variant_query = SearchQuery(query, config="simple")
            ranked_qs = (
                queryset
                .annotate(rank=SearchRank(F("tsv_body"), variant_query))
                .filter(rank__gt=DEFAULT_MIN_BM25_SCORE)
                .order_by("-rank")
            )
            results = list(ranked_qs[:top_k * 2])
            return [(obj, float(getattr(obj, "rank", 0.0))) for obj in results]
        except Exception as fallback_e:
            print(f"Error in BM25 search fallback: {fallback_e}", flush=True)
    except Exception as e:
        print(f"Error in BM25 search: {e}", flush=True)

    return []
166
+
167
+
168
def get_vector_scores(
    queryset: QuerySet,
    query: str,
    top_k: int = 20
) -> List[Tuple[Any, float]]:
    """
    Get vector (cosine) similarity scores for a queryset.

    DEPRECATED: Use pure_semantic_search.get_vector_scores() instead.
    This function is kept for backward compatibility and delegates to the
    new implementation when it imported successfully at module load.

    The fallback path materializes the ENTIRE queryset and scores each
    object's stored embedding against the query embedding in Python.

    Args:
        queryset: Django QuerySet to search.
        query: Search query string.
        top_k: Maximum number of results (up to 2*top_k are returned so the
            hybrid ranker has extra candidates).

    Returns:
        List of (object, vector_score) tuples sorted by descending similarity.
        Empty list when no model/embedding is available, or when embedding
        dimensions mismatch and nothing could be scored.
    """
    # Try to use the new implementation from pure_semantic_search
    if _get_vector_scores_from_pure:
        return _get_vector_scores_from_pure(queryset, query, top_k)

    # Fallback to original implementation
    if not query:
        return []

    # Generate query embedding
    model = get_embedding_model()
    if model is None:
        return []

    query_embedding = generate_embedding(query, model=model)
    if query_embedding is None:
        return []

    # Get all objects with embeddings
    # NOTE(review): loads the whole queryset into memory — acceptable only
    # for small tables; confirm expected corpus size with callers.
    all_objects = list(queryset)
    if not all_objects:
        return []

    # Check dimension compatibility first
    query_dim = len(query_embedding)
    dimension_mismatch = False

    # Calculate similarities
    scores = []
    for obj in all_objects:
        obj_embedding = load_embedding(obj)
        if obj_embedding is not None:
            obj_dim = len(obj_embedding)
            if obj_dim != query_dim:
                # Dimension mismatch - skip vector search for this object.
                # Warn only once per call to avoid log spam.
                if not dimension_mismatch:
                    print(f"⚠️ Dimension mismatch: query={query_dim}, stored={obj_dim}. Skipping vector search.")
                    dimension_mismatch = True
                continue
            similarity = cosine_similarity(query_embedding, obj_embedding)
            if similarity >= DEFAULT_MIN_VECTOR_SCORE:
                scores.append((obj, similarity))

    # If dimension mismatch detected, return empty to fall back to BM25 + exact match
    if dimension_mismatch and not scores:
        return []

    # Sort by score descending
    scores.sort(key=lambda x: x[1], reverse=True)
    return scores[:top_k * 2]  # Get more for hybrid ranking
236
+
237
+
238
def normalize_scores(scores: List[Tuple[Any, float]]) -> Dict[Any, float]:
    """
    Min-max normalize scores to the 0-1 range.

    Args:
        scores: List of (object, score) tuples. Objects must be hashable;
            a duplicated object keeps its last score, as with plain dict
            assignment.

    Returns:
        Dictionary mapping object to normalized score. Empty input yields
        an empty dict; when all scores are equal every object maps to 1.0
        (there is no spread to normalize over).
    """
    if not scores:
        return {}

    # Single pass over the values; the original re-guarded with
    # `if scores else ...` here, which was dead code after the early return.
    values = [score for _, score in scores]
    max_score = max(values)
    min_score = min(values)

    if max_score == min_score:
        # All scores are the same, return uniform distribution.
        return {obj: 1.0 for obj, _ in scores}

    span = max_score - min_score
    return {obj: (score - min_score) / span for obj, score in scores}
264
+
265
+
266
def hybrid_search(
    queryset: QuerySet,
    query: str,
    top_k: int = 20,
    bm25_weight: float = DEFAULT_BM25_WEIGHT,
    vector_weight: float = DEFAULT_VECTOR_WEIGHT,
    min_hybrid_score: float = 0.1,
    text_fields: Optional[List[str]] = None
) -> List[Any]:
    """
    Perform hybrid search combining BM25 and vector similarity.

    Pipeline: (1) normalize the two weights to sum to 1, (2) fetch and
    min-max-normalize BM25 and vector scores, (3) merge them as a weighted
    sum, (4) when ``text_fields`` is given, additionally pull in icontains
    candidates on the primary field and let exact-match boosts override the
    weighted score (via max), (5) threshold, sort, and return the top_k.
    Per-object diagnostic scores are attached as ``_hybrid_score``,
    ``_bm25_score``, ``_vector_score`` and ``_exact_match_boost``.

    Args:
        queryset: Django QuerySet to search.
        query: Search query string.
        top_k: Maximum number of results.
        bm25_weight: Weight for BM25 score (0-1).
        vector_weight: Weight for vector score (0-1).
        min_hybrid_score: Minimum combined score threshold.
        text_fields: Field names for exact match boost (optional); the
            first one is treated as the primary title/name field.

    Returns:
        List of objects sorted by hybrid score (descending).
    """
    if not query:
        return list(queryset[:top_k])

    # Normalize weights so they sum to 1 (guards against caller passing
    # arbitrary magnitudes); degenerate all-zero weights become 50/50.
    total_weight = bm25_weight + vector_weight
    if total_weight > 0:
        bm25_weight = bm25_weight / total_weight
        vector_weight = vector_weight / total_weight
    else:
        bm25_weight = 0.5
        vector_weight = 0.5

    # Get BM25 scores (normalized to 0-1 within this result set)
    bm25_results = get_bm25_scores(queryset, query, top_k=top_k)
    bm25_scores = normalize_scores(bm25_results)

    # Get vector scores (normalized to 0-1 within this result set)
    vector_results = get_vector_scores(queryset, query, top_k=top_k)
    vector_scores = normalize_scores(vector_results)

    # Combine scores as a weighted sum over the union of both candidate sets
    combined_scores = {}
    all_objects = set()

    # Add BM25 objects
    for obj, _ in bm25_results:
        all_objects.add(obj)
        combined_scores[obj] = bm25_scores.get(obj, 0.0) * bm25_weight

    # Add vector objects
    for obj, _ in vector_results:
        all_objects.add(obj)
        if obj in combined_scores:
            combined_scores[obj] += vector_scores.get(obj, 0.0) * vector_weight
        else:
            combined_scores[obj] = vector_scores.get(obj, 0.0) * vector_weight

    # CRITICAL: Find exact matches FIRST using icontains, then apply boost.
    # This ensures exact matches are always found and prioritized even if
    # neither BM25 nor vector search surfaced them.
    if text_fields:
        query_lower = query.lower()
        # Extract key phrases (2-word and 3-word) from query.
        # NOTE: mirrors the phrase extraction in calculate_exact_match_boost.
        query_words = query_lower.split()
        key_phrases = []
        # 2-word phrases
        for i in range(len(query_words) - 1):
            phrase = " ".join(query_words[i:i+2])
            if len(phrase) > 3:
                key_phrases.append(phrase)
        # 3-word phrases
        for i in range(len(query_words) - 2):
            phrase = " ".join(query_words[i:i+3])
            if len(phrase) > 5:
                key_phrases.append(phrase)

        # Find potential exact matches using icontains on name/title field.
        # This ensures we don't miss exact matches even if BM25/vector don't find them.
        exact_match_candidates = set()
        primary_field = text_fields[0] if text_fields else "name"
        if hasattr(queryset.model, primary_field):
            # Search for key phrases in the primary field
            for phrase in key_phrases:
                filter_kwargs = {f"{primary_field}__icontains": phrase}
                candidates = queryset.filter(**filter_kwargs)[:top_k * 2]
                exact_match_candidates.update(candidates)

        # Apply exact match boost to all candidates
        for obj in exact_match_candidates:
            if obj not in all_objects:
                all_objects.add(obj)
                combined_scores[obj] = 0.0

            # Apply exact match boost (this should dominate)
            boost = calculate_exact_match_boost(obj, query, text_fields)
            if boost > 0:
                # Exact match boost should dominate - set it high
                combined_scores[obj] = max(combined_scores.get(obj, 0.0), boost)

        # Also check objects already in results for exact matches
        for obj in list(all_objects):
            boost = calculate_exact_match_boost(obj, query, text_fields)
            if boost > 0:
                # Boost existing scores
                combined_scores[obj] = max(combined_scores.get(obj, 0.0), boost)

    # Filter by minimum score and sort
    filtered_scores = [
        (obj, score) for obj, score in combined_scores.items()
        if score >= min_hybrid_score
    ]
    filtered_scores.sort(key=lambda x: x[1], reverse=True)

    # Return top k
    results = [obj for obj, _ in filtered_scores[:top_k]]

    # Store hybrid score on objects for reference (debug/telemetry attributes)
    for obj, score in filtered_scores[:top_k]:
        obj._hybrid_score = score
        obj._bm25_score = bm25_scores.get(obj, 0.0)
        obj._vector_score = vector_scores.get(obj, 0.0)
        # Store exact match boost if applied
        if text_fields:
            obj._exact_match_boost = calculate_exact_match_boost(obj, query, text_fields)
        else:
            obj._exact_match_boost = 0.0

    return results
397
+
398
+
399
def semantic_query_expansion(query: str, top_n: int = 3) -> List[str]:
    """
    Expand query with semantically similar terms using embeddings.

    Prefers the chatbot's semantic expander; falls back to plain synonym
    expansion when that import/call fails for any reason.

    Args:
        query: Original query string.
        top_n: Number of similar terms to add.
            NOTE(review): currently unused — neither expansion path receives
            it; confirm whether it should be forwarded to the expander.

    Returns:
        List of expanded query variations.
    """
    try:
        # Imported lazily to avoid a hard dependency on the chatbot package.
        from hue_portal.chatbot.query_expansion import expand_query_semantically
        return expand_query_semantically(query, context=None)
    except Exception:
        # Fallback to basic synonym expansion
        return expand_query_with_synonyms(query)
416
+
417
+
418
def rerank_results(query: str, results: List[Any], text_fields: List[str], top_k: int = 5) -> List[Any]:
    """
    Rerank results by recomputing embedding similarity against the query.

    Each candidate's text fields are concatenated, embedded, and scored with
    cosine similarity; candidates are returned in descending-similarity
    order. Any failure (including a missing model) degrades gracefully to
    the first ``top_k`` of the original ordering.

    Args:
        query: Search query.
        results: List of result objects.
        text_fields: Field names used to build each object's text.
        top_k: Number of top results to return.

    Returns:
        Reranked list of at most ``top_k`` results.
    """
    if not results or not query:
        return results[:top_k]

    try:
        embedder = get_embedding_model()
        if embedder is None:
            return results[:top_k]

        query_vec = generate_embedding(query, model=embedder)
        if query_vec is None:
            return results[:top_k]

        scored = []
        for candidate in results:
            # Concatenate the candidate's non-empty text fields.
            pieces = []
            for field in text_fields:
                if not hasattr(candidate, field):
                    continue
                value = getattr(candidate, field, "")
                if value:
                    pieces.append(str(value))
            if not pieces:
                # Nothing to embed — candidate drops out of the reranking.
                continue

            candidate_vec = generate_embedding(" ".join(pieces), model=embedder)
            if candidate_vec is not None:
                scored.append((candidate, cosine_similarity(query_vec, candidate_vec)))

        # Highest similarity first; stable sort preserves input order on ties.
        ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
        return [candidate for candidate, _ in ranked[:top_k]]
    except Exception as e:
        print(f"Error in reranking: {e}")
        return results[:top_k]
471
+
472
+
473
def diversify_results(results: List[Any], top_k: int = 5, similarity_threshold: float = 0.8) -> List[Any]:
    """
    Ensure diversity in results by greedily selecting mutually dissimilar items.

    Uses an MMR-style greedy pass: the first (highest-ranked) item is always
    kept, then each round adds the remaining item least similar to anything
    already selected.

    Args:
        results: List of result objects, assumed ordered best-first.
        top_k: Number of results to return.
        similarity_threshold: Maximum similarity allowed between results.
            NOTE(review): currently unused — the selection is purely greedy
            with no cutoff; confirm whether a hard threshold was intended.

    Returns:
        Diversified list of results; on any error, the first ``top_k`` of
        the original list.
    """
    if len(results) <= top_k:
        return results

    try:
        model = get_embedding_model()
        if model is None:
            return results[:top_k]

        # Generate embeddings for all results; items without a stored
        # embedding are excluded from selection entirely.
        result_embeddings = []
        valid_results = []

        for obj in results:
            # Try to get embedding from object
            obj_embedding = load_embedding(obj)
            if obj_embedding is not None:
                result_embeddings.append(obj_embedding)
                valid_results.append(obj)

        if len(valid_results) <= top_k:
            return valid_results

        # Select diverse results using Maximal Marginal Relevance (MMR)
        selected = [valid_results[0]]  # Always include first (highest score)
        selected_indices = {0}
        selected_embeddings = [result_embeddings[0]]

        # One greedy round per remaining slot.
        for _ in range(min(top_k - 1, len(valid_results) - 1)):
            best_score = -1
            best_idx = -1

            for i, (obj, emb) in enumerate(zip(valid_results, result_embeddings)):
                if i in selected_indices:
                    continue

                # Calculate max similarity to already selected results
                max_sim = 0.0
                for sel_emb in selected_embeddings:
                    sim = cosine_similarity(emb, sel_emb)
                    max_sim = max(max_sim, sim)

                # Score: prefer results with lower similarity to selected ones
                score = 1.0 - max_sim

                if score > best_score:
                    best_score = score
                    best_idx = i

            if best_idx >= 0:
                selected.append(valid_results[best_idx])
                selected_indices.add(best_idx)
                selected_embeddings.append(result_embeddings[best_idx])

        return selected
    except Exception as e:
        print(f"Error in diversifying results: {e}")
        return results[:top_k]
542
+
543
+
544
def search_with_hybrid(
    queryset: QuerySet,
    query: str,
    text_fields: List[str],
    top_k: int = 20,
    min_score: float = 0.1,
    use_hybrid: bool = True,
    bm25_weight: float = DEFAULT_BM25_WEIGHT,
    vector_weight: float = DEFAULT_VECTOR_WEIGHT,
    use_reranking: bool = False,
    use_diversification: bool = False
) -> QuerySet:
    """
    Search with hybrid BM25 + vector, with fallback to BM25-only or TF-IDF.

    Fallback chain: hybrid search → Postgres full-text (BM25-only) →
    search_with_ml. Any exception in an earlier stage falls through to
    the next.

    Args:
        queryset: Django QuerySet to search.
        query: Search query string.
        text_fields: List of field names (for exact-match boost and fallback).
        top_k: Maximum number of results.
        min_score: Minimum score threshold.
        use_hybrid: Whether to use hybrid search.
        bm25_weight: Weight for BM25 in hybrid search.
        vector_weight: Weight for vector in hybrid search.
        use_reranking: Re-score hybrid hits with rerank_results() when there
            are more than top_k of them.
        use_diversification: Apply diversify_results() to hybrid hits.

    Returns:
        Filtered and ranked QuerySet.
        NOTE(review): the BM25-only and search_with_ml fallbacks may return
        a plain list rather than a QuerySet — callers should only rely on
        iteration/slicing; confirm whether a uniform type is needed.
    """
    if not query:
        return queryset[:top_k]

    # Try hybrid search if enabled
    if use_hybrid:
        try:
            hybrid_results = hybrid_search(
                queryset,
                query,
                top_k=top_k,
                bm25_weight=bm25_weight,
                vector_weight=vector_weight,
                min_hybrid_score=min_score,
                text_fields=text_fields
            )

            if hybrid_results:
                # Apply reranking if enabled
                if use_reranking and len(hybrid_results) > top_k:
                    hybrid_results = rerank_results(query, hybrid_results, text_fields, top_k=top_k * 2)

                # Apply diversification if enabled
                if use_diversification:
                    hybrid_results = diversify_results(hybrid_results, top_k=top_k)

                # Convert to QuerySet with preserved order: a Case/When
                # annotation maps each pk to its rank position.
                result_ids = [obj.id for obj in hybrid_results[:top_k]]
                if result_ids:
                    from django.db.models import Case, When, IntegerField
                    preserved = Case(
                        *[When(pk=pk, then=pos) for pos, pk in enumerate(result_ids)],
                        output_field=IntegerField()
                    )
                    return queryset.filter(id__in=result_ids).order_by(preserved)
        except Exception as e:
            print(f"Hybrid search failed, falling back: {e}")

    # Fallback to BM25-only (requires Postgres and a tsv_body field)
    if connection.vendor == "postgresql" and hasattr(queryset.model, "tsv_body"):
        try:
            expanded_queries = expand_query_with_synonyms(query)
            combined_query = None
            for q_variant in expanded_queries:
                variant_query = SearchQuery(q_variant, config="simple")
                combined_query = variant_query if combined_query is None else combined_query | variant_query

            if combined_query is not None:
                ranked_qs = (
                    queryset
                    .annotate(rank=SearchRank(F("tsv_body"), combined_query))
                    .filter(rank__gt=0)
                    .order_by("-rank")
                )
                results = list(ranked_qs[:top_k])
                if results:
                    for obj in results:
                        # Expose the rank under the attribute name the ML
                        # search path uses, for uniform downstream handling.
                        obj._ml_score = getattr(obj, "rank", 0.0)
                    return results
        except Exception:
            # Deliberate best-effort: fall through to the final fallback.
            pass

    # Final fallback: import and use original search_with_ml
    from .search_ml import search_with_ml
    return search_with_ml(queryset, query, text_fields, top_k=top_k, min_score=min_score)
636
+