davidtran999 commited on
Commit
778e101
·
verified ·
1 Parent(s): fbf84fe

Upload hue_portal/core/hybrid_search.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. hue_portal/core/hybrid_search.py +636 -0
hue_portal/core/hybrid_search.py ADDED
@@ -0,0 +1,636 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hybrid search combining BM25 and vector similarity.
3
+
4
+ NOTE: This module is being phased out in favor of pure semantic search.
5
+ Pure semantic search (100% vector) is recommended when using Query Rewrite Strategy + BGE-M3.
6
+ See pure_semantic_search.py for the new implementation.
7
+ """
8
+ from typing import List, Tuple, Optional, Dict, Any
9
+ import numpy as np
10
+ from django.db import connection
11
+ from django.db.models import QuerySet, F
12
+ from django.contrib.postgres.search import SearchQuery, SearchRank
13
+
14
+ from .embeddings import (
15
+ get_embedding_model,
16
+ generate_embedding,
17
+ cosine_similarity
18
+ )
19
+ from .embedding_utils import load_embedding
20
+ from .search_ml import expand_query_with_synonyms
21
+
22
+ # Import get_vector_scores from pure_semantic_search for backward compatibility
23
+ try:
24
+ from .pure_semantic_search import get_vector_scores as _get_vector_scores_from_pure
25
+ except ImportError:
26
+ _get_vector_scores_from_pure = None
27
+
28
+
29
+ # Default weights for hybrid search
30
+ DEFAULT_BM25_WEIGHT = 0.4
31
+ DEFAULT_VECTOR_WEIGHT = 0.6
32
+
33
+ # Minimum scores
34
+ DEFAULT_MIN_BM25_SCORE = 0.0
35
+ DEFAULT_MIN_VECTOR_SCORE = 0.1
36
+
37
+
38
def calculate_exact_match_boost(obj: Any, query: str, text_fields: List[str]) -> float:
    """
    Score exact keyword overlap between the query and an object's title/name fields.

    Only the first two entries of ``text_fields`` are inspected (by convention
    the title/name fields). Verbatim multi-word phrase hits weigh the most,
    followed by a full-query substring hit, then individual word hits.

    Args:
        obj: Django model instance (any object with string attributes works).
        query: Search query string.
        text_fields: Field names to check; the first two are used.

    Returns:
        Boost score clamped to the 0.0-1.0 range.
    """
    if not query or not text_fields:
        return 0.0

    normalized_query = query.lower().strip()
    tokens = normalized_query.split()

    # Candidate phrases: all bigrams longer than 3 chars, then all trigrams
    # longer than 5 chars (joined with single spaces).
    phrases = [
        " ".join(tokens[start:start + size])
        for size, min_len in ((2, 3), (3, 5))
        for start in range(len(tokens) - size + 1)
        if len(" ".join(tokens[start:start + size])) > min_len
    ]

    # Individual tokens worth matching on their own (longer than 2 chars).
    significant_tokens = {tok for tok in tokens if len(tok) > 2}

    total = 0.0
    for field_name in text_fields[:2]:
        if not hasattr(obj, field_name):
            continue
        haystack = str(getattr(obj, field_name, "")).lower()
        if not haystack:
            continue

        for phrase in phrases:
            if phrase in haystack:
                total += 0.5  # strong signal: a multi-word phrase appears verbatim
                if haystack.strip() == phrase.strip():
                    total += 0.3  # the field IS exactly this phrase

        if normalized_query in haystack:
            total += 0.4  # the whole query appears inside the field

        hits = sum(1 for tok in significant_tokens if tok in haystack)
        if hits:
            total += 0.1 * min(hits, 3)  # word-level hits, capped at 3 words

    # Overlapping phrase hits can push the sum past 1.0; clamp it.
    return min(total, 1.0)
97
+
98
+
99
def get_bm25_scores(
    queryset: QuerySet,
    query: str,
    top_k: int = 20
) -> List[Tuple[Any, float]]:
    """
    Get BM25-style full-text rank scores for a queryset.

    Requires PostgreSQL and a ``tsv_body`` search-vector field on the model;
    otherwise returns an empty list. The query is expanded with synonyms
    (capped at 5 variants) and the variants are OR-combined into one
    SearchQuery. On RecursionError during expansion, retries with the raw
    query only. Errors are printed and swallowed — callers get [] on failure.

    Args:
        queryset: Django QuerySet to search.
        query: Search query string.
        top_k: Maximum number of results (up to 2*top_k are returned so the
            hybrid ranker has extra candidates to merge).

    Returns:
        List of (object, bm25_score) tuples ordered by descending rank.
    """
    if not query or connection.vendor != "postgresql":
        return []

    if not hasattr(queryset.model, "tsv_body"):
        return []

    try:
        import sys
        # Increase recursion limit for query expansion
        # (expand_query_with_synonyms can recurse deeply on some inputs).
        old_limit = sys.getrecursionlimit()
        try:
            sys.setrecursionlimit(3000)  # Increase limit for query expansion
            expanded_queries = expand_query_with_synonyms(query)
            # Limit expanded queries to prevent too many variants
            expanded_queries = expanded_queries[:5]  # Max 5 variants

            # OR-combine all variants into a single tsquery.
            combined_query = None
            for q_variant in expanded_queries:
                variant_query = SearchQuery(q_variant, config="simple")
                combined_query = variant_query if combined_query is None else combined_query | variant_query

            if combined_query is not None:
                ranked_qs = (
                    queryset
                    .annotate(rank=SearchRank(F("tsv_body"), combined_query))
                    .filter(rank__gt=DEFAULT_MIN_BM25_SCORE)
                    .order_by("-rank")
                )
                results = list(ranked_qs[:top_k * 2])  # Get more for hybrid ranking
                # rank comes from the annotate() above; default defensively to 0.0.
                return [(obj, float(getattr(obj, "rank", 0.0))) for obj in results]
        finally:
            sys.setrecursionlimit(old_limit)  # Restore original limit
    except RecursionError as e:
        print(f"Error in BM25 search (recursion): {e}", flush=True)
        # Fallback: use original query without expansion
        try:
            variant_query = SearchQuery(query, config="simple")
            ranked_qs = (
                queryset
                .annotate(rank=SearchRank(F("tsv_body"), variant_query))
                .filter(rank__gt=DEFAULT_MIN_BM25_SCORE)
                .order_by("-rank")
            )
            results = list(ranked_qs[:top_k * 2])
            return [(obj, float(getattr(obj, "rank", 0.0))) for obj in results]
        except Exception as fallback_e:
            print(f"Error in BM25 search fallback: {fallback_e}", flush=True)
    except Exception as e:
        print(f"Error in BM25 search: {e}", flush=True)

    return []
166
+
167
+
168
def get_vector_scores(
    queryset: QuerySet,
    query: str,
    top_k: int = 20
) -> List[Tuple[Any, float]]:
    """
    Get vector (cosine) similarity scores for a queryset.

    DEPRECATED: Use pure_semantic_search.get_vector_scores() instead.
    This function is kept for backward compatibility and delegates to the
    new implementation when it imported successfully at module load.

    The fallback path materializes the ENTIRE queryset and scores each
    object's stored embedding against the query embedding in Python.

    Args:
        queryset: Django QuerySet to search.
        query: Search query string.
        top_k: Maximum number of results (up to 2*top_k are returned so the
            hybrid ranker has extra candidates).

    Returns:
        List of (object, vector_score) tuples sorted by descending similarity.
        Empty list when no model/embedding is available, or when embedding
        dimensions mismatch and nothing could be scored.
    """
    # Try to use the new implementation from pure_semantic_search
    if _get_vector_scores_from_pure:
        return _get_vector_scores_from_pure(queryset, query, top_k)

    # Fallback to original implementation
    if not query:
        return []

    # Generate query embedding
    model = get_embedding_model()
    if model is None:
        return []

    query_embedding = generate_embedding(query, model=model)
    if query_embedding is None:
        return []

    # Get all objects with embeddings
    # NOTE(review): loads the whole queryset into memory — acceptable only
    # for small tables; confirm expected corpus size with callers.
    all_objects = list(queryset)
    if not all_objects:
        return []

    # Check dimension compatibility first
    query_dim = len(query_embedding)
    dimension_mismatch = False

    # Calculate similarities
    scores = []
    for obj in all_objects:
        obj_embedding = load_embedding(obj)
        if obj_embedding is not None:
            obj_dim = len(obj_embedding)
            if obj_dim != query_dim:
                # Dimension mismatch - skip vector search for this object.
                # Warn only once per call to avoid log spam.
                if not dimension_mismatch:
                    print(f"⚠️ Dimension mismatch: query={query_dim}, stored={obj_dim}. Skipping vector search.")
                    dimension_mismatch = True
                continue
            similarity = cosine_similarity(query_embedding, obj_embedding)
            if similarity >= DEFAULT_MIN_VECTOR_SCORE:
                scores.append((obj, similarity))

    # If dimension mismatch detected, return empty to fall back to BM25 + exact match
    if dimension_mismatch and not scores:
        return []

    # Sort by score descending
    scores.sort(key=lambda x: x[1], reverse=True)
    return scores[:top_k * 2]  # Get more for hybrid ranking
236
+
237
+
238
def normalize_scores(scores: List[Tuple[Any, float]]) -> Dict[Any, float]:
    """
    Min-max normalize scores to the 0-1 range.

    Args:
        scores: List of (object, score) tuples. Objects must be hashable;
            a duplicated object keeps its last score, as with plain dict
            assignment.

    Returns:
        Dictionary mapping object to normalized score. Empty input yields
        an empty dict; when all scores are equal every object maps to 1.0
        (there is no spread to normalize over).
    """
    if not scores:
        return {}

    # Single pass over the values; the original re-guarded with
    # `if scores else ...` here, which was dead code after the early return.
    values = [score for _, score in scores]
    max_score = max(values)
    min_score = min(values)

    if max_score == min_score:
        # All scores are the same, return uniform distribution.
        return {obj: 1.0 for obj, _ in scores}

    span = max_score - min_score
    return {obj: (score - min_score) / span for obj, score in scores}
264
+
265
+
266
def hybrid_search(
    queryset: QuerySet,
    query: str,
    top_k: int = 20,
    bm25_weight: float = DEFAULT_BM25_WEIGHT,
    vector_weight: float = DEFAULT_VECTOR_WEIGHT,
    min_hybrid_score: float = 0.1,
    text_fields: Optional[List[str]] = None
) -> List[Any]:
    """
    Perform hybrid search combining BM25 and vector similarity.

    Pipeline: (1) normalize the two weights to sum to 1, (2) fetch and
    min-max-normalize BM25 and vector scores, (3) merge them as a weighted
    sum, (4) when ``text_fields`` is given, additionally pull in icontains
    candidates on the primary field and let exact-match boosts override the
    weighted score (via max), (5) threshold, sort, and return the top_k.
    Per-object diagnostic scores are attached as ``_hybrid_score``,
    ``_bm25_score``, ``_vector_score`` and ``_exact_match_boost``.

    Args:
        queryset: Django QuerySet to search.
        query: Search query string.
        top_k: Maximum number of results.
        bm25_weight: Weight for BM25 score (0-1).
        vector_weight: Weight for vector score (0-1).
        min_hybrid_score: Minimum combined score threshold.
        text_fields: Field names for exact match boost (optional); the
            first one is treated as the primary title/name field.

    Returns:
        List of objects sorted by hybrid score (descending).
    """
    if not query:
        return list(queryset[:top_k])

    # Normalize weights so they sum to 1 (guards against caller passing
    # arbitrary magnitudes); degenerate all-zero weights become 50/50.
    total_weight = bm25_weight + vector_weight
    if total_weight > 0:
        bm25_weight = bm25_weight / total_weight
        vector_weight = vector_weight / total_weight
    else:
        bm25_weight = 0.5
        vector_weight = 0.5

    # Get BM25 scores (normalized to 0-1 within this result set)
    bm25_results = get_bm25_scores(queryset, query, top_k=top_k)
    bm25_scores = normalize_scores(bm25_results)

    # Get vector scores (normalized to 0-1 within this result set)
    vector_results = get_vector_scores(queryset, query, top_k=top_k)
    vector_scores = normalize_scores(vector_results)

    # Combine scores as a weighted sum over the union of both candidate sets
    combined_scores = {}
    all_objects = set()

    # Add BM25 objects
    for obj, _ in bm25_results:
        all_objects.add(obj)
        combined_scores[obj] = bm25_scores.get(obj, 0.0) * bm25_weight

    # Add vector objects
    for obj, _ in vector_results:
        all_objects.add(obj)
        if obj in combined_scores:
            combined_scores[obj] += vector_scores.get(obj, 0.0) * vector_weight
        else:
            combined_scores[obj] = vector_scores.get(obj, 0.0) * vector_weight

    # CRITICAL: Find exact matches FIRST using icontains, then apply boost.
    # This ensures exact matches are always found and prioritized even if
    # neither BM25 nor vector search surfaced them.
    if text_fields:
        query_lower = query.lower()
        # Extract key phrases (2-word and 3-word) from query.
        # NOTE: mirrors the phrase extraction in calculate_exact_match_boost.
        query_words = query_lower.split()
        key_phrases = []
        # 2-word phrases
        for i in range(len(query_words) - 1):
            phrase = " ".join(query_words[i:i+2])
            if len(phrase) > 3:
                key_phrases.append(phrase)
        # 3-word phrases
        for i in range(len(query_words) - 2):
            phrase = " ".join(query_words[i:i+3])
            if len(phrase) > 5:
                key_phrases.append(phrase)

        # Find potential exact matches using icontains on name/title field.
        # This ensures we don't miss exact matches even if BM25/vector don't find them.
        exact_match_candidates = set()
        primary_field = text_fields[0] if text_fields else "name"
        if hasattr(queryset.model, primary_field):
            # Search for key phrases in the primary field
            for phrase in key_phrases:
                filter_kwargs = {f"{primary_field}__icontains": phrase}
                candidates = queryset.filter(**filter_kwargs)[:top_k * 2]
                exact_match_candidates.update(candidates)

        # Apply exact match boost to all candidates
        for obj in exact_match_candidates:
            if obj not in all_objects:
                all_objects.add(obj)
                combined_scores[obj] = 0.0

            # Apply exact match boost (this should dominate)
            boost = calculate_exact_match_boost(obj, query, text_fields)
            if boost > 0:
                # Exact match boost should dominate - set it high
                combined_scores[obj] = max(combined_scores.get(obj, 0.0), boost)

        # Also check objects already in results for exact matches
        for obj in list(all_objects):
            boost = calculate_exact_match_boost(obj, query, text_fields)
            if boost > 0:
                # Boost existing scores
                combined_scores[obj] = max(combined_scores.get(obj, 0.0), boost)

    # Filter by minimum score and sort
    filtered_scores = [
        (obj, score) for obj, score in combined_scores.items()
        if score >= min_hybrid_score
    ]
    filtered_scores.sort(key=lambda x: x[1], reverse=True)

    # Return top k
    results = [obj for obj, _ in filtered_scores[:top_k]]

    # Store hybrid score on objects for reference (debug/telemetry attributes)
    for obj, score in filtered_scores[:top_k]:
        obj._hybrid_score = score
        obj._bm25_score = bm25_scores.get(obj, 0.0)
        obj._vector_score = vector_scores.get(obj, 0.0)
        # Store exact match boost if applied
        if text_fields:
            obj._exact_match_boost = calculate_exact_match_boost(obj, query, text_fields)
        else:
            obj._exact_match_boost = 0.0

    return results
397
+
398
+
399
def semantic_query_expansion(query: str, top_n: int = 3) -> List[str]:
    """
    Expand query with semantically similar terms using embeddings.

    Prefers the chatbot's semantic expander; falls back to plain synonym
    expansion when that import/call fails for any reason.

    Args:
        query: Original query string.
        top_n: Number of similar terms to add.
            NOTE(review): currently unused — neither expansion path receives
            it; confirm whether it should be forwarded to the expander.

    Returns:
        List of expanded query variations.
    """
    try:
        # Imported lazily to avoid a hard dependency on the chatbot package.
        from hue_portal.chatbot.query_expansion import expand_query_semantically
        return expand_query_semantically(query, context=None)
    except Exception:
        # Fallback to basic synonym expansion
        return expand_query_with_synonyms(query)
416
+
417
+
418
def rerank_results(query: str, results: List[Any], text_fields: List[str], top_k: int = 5) -> List[Any]:
    """
    Rerank results by recomputing embedding similarity against the query.

    Each candidate's text fields are concatenated, embedded, and scored with
    cosine similarity; candidates are returned in descending-similarity
    order. Any failure (including a missing model) degrades gracefully to
    the first ``top_k`` of the original ordering.

    Args:
        query: Search query.
        results: List of result objects.
        text_fields: Field names used to build each object's text.
        top_k: Number of top results to return.

    Returns:
        Reranked list of at most ``top_k`` results.
    """
    if not results or not query:
        return results[:top_k]

    try:
        embedder = get_embedding_model()
        if embedder is None:
            return results[:top_k]

        query_vec = generate_embedding(query, model=embedder)
        if query_vec is None:
            return results[:top_k]

        scored = []
        for candidate in results:
            # Concatenate the candidate's non-empty text fields.
            pieces = []
            for field in text_fields:
                if not hasattr(candidate, field):
                    continue
                value = getattr(candidate, field, "")
                if value:
                    pieces.append(str(value))
            if not pieces:
                # Nothing to embed — candidate drops out of the reranking.
                continue

            candidate_vec = generate_embedding(" ".join(pieces), model=embedder)
            if candidate_vec is not None:
                scored.append((candidate, cosine_similarity(query_vec, candidate_vec)))

        # Highest similarity first; stable sort preserves input order on ties.
        ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
        return [candidate for candidate, _ in ranked[:top_k]]
    except Exception as e:
        print(f"Error in reranking: {e}")
        return results[:top_k]
471
+
472
+
473
def diversify_results(results: List[Any], top_k: int = 5, similarity_threshold: float = 0.8) -> List[Any]:
    """
    Ensure diversity in results by greedily selecting mutually dissimilar items.

    Uses an MMR-style greedy pass: the first (highest-ranked) item is always
    kept, then each round adds the remaining item least similar to anything
    already selected.

    Args:
        results: List of result objects, assumed ordered best-first.
        top_k: Number of results to return.
        similarity_threshold: Maximum similarity allowed between results.
            NOTE(review): currently unused — the selection is purely greedy
            with no cutoff; confirm whether a hard threshold was intended.

    Returns:
        Diversified list of results; on any error, the first ``top_k`` of
        the original list.
    """
    if len(results) <= top_k:
        return results

    try:
        model = get_embedding_model()
        if model is None:
            return results[:top_k]

        # Generate embeddings for all results; items without a stored
        # embedding are excluded from selection entirely.
        result_embeddings = []
        valid_results = []

        for obj in results:
            # Try to get embedding from object
            obj_embedding = load_embedding(obj)
            if obj_embedding is not None:
                result_embeddings.append(obj_embedding)
                valid_results.append(obj)

        if len(valid_results) <= top_k:
            return valid_results

        # Select diverse results using Maximal Marginal Relevance (MMR)
        selected = [valid_results[0]]  # Always include first (highest score)
        selected_indices = {0}
        selected_embeddings = [result_embeddings[0]]

        # One greedy round per remaining slot.
        for _ in range(min(top_k - 1, len(valid_results) - 1)):
            best_score = -1
            best_idx = -1

            for i, (obj, emb) in enumerate(zip(valid_results, result_embeddings)):
                if i in selected_indices:
                    continue

                # Calculate max similarity to already selected results
                max_sim = 0.0
                for sel_emb in selected_embeddings:
                    sim = cosine_similarity(emb, sel_emb)
                    max_sim = max(max_sim, sim)

                # Score: prefer results with lower similarity to selected ones
                score = 1.0 - max_sim

                if score > best_score:
                    best_score = score
                    best_idx = i

            if best_idx >= 0:
                selected.append(valid_results[best_idx])
                selected_indices.add(best_idx)
                selected_embeddings.append(result_embeddings[best_idx])

        return selected
    except Exception as e:
        print(f"Error in diversifying results: {e}")
        return results[:top_k]
542
+
543
+
544
def search_with_hybrid(
    queryset: QuerySet,
    query: str,
    text_fields: List[str],
    top_k: int = 20,
    min_score: float = 0.1,
    use_hybrid: bool = True,
    bm25_weight: float = DEFAULT_BM25_WEIGHT,
    vector_weight: float = DEFAULT_VECTOR_WEIGHT,
    use_reranking: bool = False,
    use_diversification: bool = False
) -> QuerySet:
    """
    Search with hybrid BM25 + vector, with fallback to BM25-only or TF-IDF.

    Fallback chain: hybrid search → Postgres full-text (BM25-only) →
    search_with_ml. Any exception in an earlier stage falls through to
    the next.

    Args:
        queryset: Django QuerySet to search.
        query: Search query string.
        text_fields: List of field names (for exact-match boost and fallback).
        top_k: Maximum number of results.
        min_score: Minimum score threshold.
        use_hybrid: Whether to use hybrid search.
        bm25_weight: Weight for BM25 in hybrid search.
        vector_weight: Weight for vector in hybrid search.
        use_reranking: Re-score hybrid hits with rerank_results() when there
            are more than top_k of them.
        use_diversification: Apply diversify_results() to hybrid hits.

    Returns:
        Filtered and ranked QuerySet.
        NOTE(review): the BM25-only and search_with_ml fallbacks may return
        a plain list rather than a QuerySet — callers should only rely on
        iteration/slicing; confirm whether a uniform type is needed.
    """
    if not query:
        return queryset[:top_k]

    # Try hybrid search if enabled
    if use_hybrid:
        try:
            hybrid_results = hybrid_search(
                queryset,
                query,
                top_k=top_k,
                bm25_weight=bm25_weight,
                vector_weight=vector_weight,
                min_hybrid_score=min_score,
                text_fields=text_fields
            )

            if hybrid_results:
                # Apply reranking if enabled
                if use_reranking and len(hybrid_results) > top_k:
                    hybrid_results = rerank_results(query, hybrid_results, text_fields, top_k=top_k * 2)

                # Apply diversification if enabled
                if use_diversification:
                    hybrid_results = diversify_results(hybrid_results, top_k=top_k)

                # Convert to QuerySet with preserved order: a Case/When
                # annotation maps each pk to its rank position.
                result_ids = [obj.id for obj in hybrid_results[:top_k]]
                if result_ids:
                    from django.db.models import Case, When, IntegerField
                    preserved = Case(
                        *[When(pk=pk, then=pos) for pos, pk in enumerate(result_ids)],
                        output_field=IntegerField()
                    )
                    return queryset.filter(id__in=result_ids).order_by(preserved)
        except Exception as e:
            print(f"Hybrid search failed, falling back: {e}")

    # Fallback to BM25-only (requires Postgres and a tsv_body field)
    if connection.vendor == "postgresql" and hasattr(queryset.model, "tsv_body"):
        try:
            expanded_queries = expand_query_with_synonyms(query)
            combined_query = None
            for q_variant in expanded_queries:
                variant_query = SearchQuery(q_variant, config="simple")
                combined_query = variant_query if combined_query is None else combined_query | variant_query

            if combined_query is not None:
                ranked_qs = (
                    queryset
                    .annotate(rank=SearchRank(F("tsv_body"), combined_query))
                    .filter(rank__gt=0)
                    .order_by("-rank")
                )
                results = list(ranked_qs[:top_k])
                if results:
                    for obj in results:
                        # Expose the rank under the attribute name the ML
                        # search path uses, for uniform downstream handling.
                        obj._ml_score = getattr(obj, "rank", 0.0)
                    return results
        except Exception:
            # Deliberate best-effort: fall through to the final fallback.
            pass

    # Final fallback: import and use original search_with_ml
    from .search_ml import search_with_ml
    return search_with_ml(queryset, query, text_fields, top_k=top_k, min_score=min_score)
636
+