# Source: backend/chatbot/query_expansion.py, uploaded by davidtran999 with huggingface_hub
# (commit eac17f7, verified, 6.82 kB).
"""
Query expansion and paraphrasing utilities for improving search recall.
"""
import re
import unicodedata
from typing import List, Dict, Any, Optional, Set
from hue_portal.core.models import Synonym
from hue_portal.core.search_ml import expand_query_with_synonyms
def normalize_vietnamese_query(query: str) -> str:
    """
    Normalize a Vietnamese query so that equivalent spellings compare equal.

    The query is trimmed, every run of whitespace is collapsed to a single
    space, the text is lowercased, and the result is converted to Unicode
    NFC form. NFC makes a precomposed letter and the same letter written as
    a base character followed by combining marks produce an identical string.

    Args:
        query: Input query string.

    Returns:
        Normalized query string, or "" when the query is empty or None.
    """
    if not query:
        return ""
    # Trim both ends and collapse inner whitespace runs to one space.
    query = re.sub(r'\s+', ' ', query.strip())
    query = query.lower()
    # Compose last: lowercasing can itself emit combining marks for a few
    # characters, so normalizing afterwards guarantees a composed result.
    return unicodedata.normalize("NFC", query)
def extract_key_phrases(query: str) -> List[str]:
    """
    Extract key words and two-word phrases from a query.

    The query is lowercased and converted to Unicode NFC, multi-word stop
    phrases are removed, and the remainder is split into word tokens.
    Single-word stopwords and tokens of two characters or fewer are dropped.
    Every pair of adjacent surviving words is also returned as a bigram.

    Args:
        query: Input query string.

    Returns:
        The surviving single words in order, followed by the bigrams built
        from them. Empty list when the query is empty or None.
    """
    if not query:
        return []
    # Single-token stopwords, checked against individual words.
    stopwords = {
        "là", "gì", "của", "và", "hoặc",
        "tôi", "bạn", "có", "không", "được", "một", "các", "với", "cho"
    }
    # Multi-word stop phrases can never equal a single token, so they have
    # to be stripped from the text before it is tokenized.
    stop_phrase_pattern = r'\b(?:như\s+thế\s+nào|bao\s+nhiêu|ở\s+đâu)\b'
    # NFC keeps each accented letter as one code point; combining marks are
    # not matched by \w and would otherwise split a word into fragments.
    text = unicodedata.normalize("NFC", query.lower())
    text = re.sub(stop_phrase_pattern, ' ', text)
    words = re.findall(r'\b\w+\b', text)
    # Filter stopwords and short words
    key_words = [w for w in words if w not in stopwords and len(w) > 2]
    # Bigrams are built from adjacent *surviving* words, so a bigram may
    # bridge a position where a stopword was removed.
    phrases = [f"{first} {second}" for first, second in zip(key_words, key_words[1:])]
    return key_words + phrases
def expand_query_semantically(query: str, context: Optional[Dict[str, Any]] = None) -> List[str]:
    """
    Expand a query into a de-duplicated list of search variations.

    Variations are collected in this order: the original query, the results
    of expand_query_with_synonyms, the query suffixed with entity values
    found in the context, and the variations from
    _get_vietnamese_variations.

    Args:
        query: Original query string.
        context: Optional context dictionary. Only its "entities" mapping is
            read here, and within it only the "fine_code" and
            "procedure_name" keys. Missing, None or empty values are ignored.

    Returns:
        List of query variations with the original query first. Two
        variations count as duplicates when normalize_vietnamese_query maps
        them to the same string; the first occurrence is kept as written.
    """
    expanded = [query]
    # NOTE(review): assumes expand_query_with_synonyms returns an iterable
    # of query strings - confirm against hue_portal.core.search_ml.
    expanded.extend(expand_query_with_synonyms(query))
    if context:
        # "entities" may be present but None; treat that like an empty map.
        entities = context.get("entities") or {}
        for entity_key in ("fine_code", "procedure_name"):
            entity_value = entities.get(entity_key)
            # Skip empty values so we never emit variants like "query None".
            if entity_value:
                expanded.append(f"{query} {entity_value}")
    expanded.extend(_get_vietnamese_variations(query))
    # Remove duplicates while preserving order
    seen: Set[str] = set()
    unique_expanded = []
    for candidate in expanded:
        fingerprint = normalize_vietnamese_query(candidate)
        if fingerprint not in seen:
            seen.add(fingerprint)
            unique_expanded.append(candidate)
    return unique_expanded
def _get_vietnamese_variations(query: str) -> List[str]:
    """
    Build query variations by swapping known Vietnamese phrases.

    The query is lowercased, then each known phrase found in it is replaced
    by each of its alternatives in turn. Every occurrence of the phrase is
    replaced, one alternative per variation.

    Args:
        query: Input query.

    Returns:
        Lowercased variations, grouped by phrase in table order and then by
        alternative order. Empty list when no known phrase occurs.
    """
    lowered = query.lower()
    # (phrase, alternatives) pairs; order determines the output order.
    replacements = (
        ("mức phạt", ("tiền phạt", "phạt", "xử phạt")),
        ("thủ tục", ("hồ sơ", "giấy tờ", "quy trình")),
        ("địa chỉ", ("nơi", "chỗ", "điểm")),
        ("số điện thoại", ("điện thoại", "số liên hệ", "hotline")),
        ("giờ làm việc", ("thời gian", "giờ", "lịch làm việc")),
        ("cảnh báo", ("thông báo", "lưu ý", "chú ý")),
        ("lừa đảo", ("scam", "gian lận", "lừa")),
    )
    results: List[str] = []
    for phrase, alternatives in replacements:
        if phrase not in lowered:
            continue
        swapped = (lowered.replace(phrase, alternative) for alternative in alternatives)
        # Keep only rewrites that actually differ from the lowercased query.
        results.extend(candidate for candidate in swapped if candidate != lowered)
    return results
def paraphrase_query(query: str) -> List[str]:
    """
    Produce alternative phrasings of a query to widen search recall.

    The original query is always the first entry. The other entries are
    derived from the lowercased query: first by regex rewrite rules for
    common Vietnamese question shapes, then by swapping the question words
    "bao nhiêu" and "như thế nào" for shorter keywords.

    Args:
        query: Original query string.

    Returns:
        List of paraphrased queries without duplicates, in the order they
        were generated.
    """
    lowered = query.lower()
    # (pattern, replacement template) pairs applied to the lowercased query.
    rewrite_rules = (
        (r"mức phạt (.+) là bao nhiêu", r"phạt \1 bao nhiêu tiền"),
        (r"thủ tục (.+) cần gì", r"làm thủ tục \1 cần giấy tờ gì"),
        (r"địa chỉ (.+) ở đâu", r"\1 ở đâu"),
        (r"(.+) như thế nào", r"cách \1"),
    )
    # Question words and the keywords substituted for them.
    question_swaps = (
        ("bao nhiêu", ("mức", "giá")),
        ("như thế nào", ("cách", "quy trình")),
    )

    candidates = [query]
    for regex, template in rewrite_rules:
        rewritten, hits = re.subn(regex, template, lowered)
        # A rule contributes only when it matched and changed the text.
        if hits and rewritten != lowered:
            candidates.append(rewritten)
    for marker, substitutes in question_swaps:
        if marker in lowered:
            candidates.extend(lowered.replace(marker, word) for word in substitutes)

    # Drop repeats while keeping first-seen order.
    already_seen = set()
    deduped = []
    for candidate in candidates:
        if candidate in already_seen:
            continue
        already_seen.add(candidate)
        deduped.append(candidate)
    return deduped
def enhance_query_with_context(query: str, context: Optional[Dict[str, Any]] = None) -> str:
    """
    Append context-derived terms to a query.

    Entity values are appended in the order "fine_code", "procedure_name",
    "office_name", followed by a fixed keyword phrase chosen by the
    context's "intent".

    Args:
        query: Original query string.
        context: Optional context dictionary. Reads the "entities" mapping
            and the "intent" value. A missing or None "entities" entry is
            treated as empty. Entity values that are None or empty are
            skipped, and other values are converted with str().

    Returns:
        The query followed by the collected terms, joined with single
        spaces. The query is returned unchanged when context is empty or
        None.
    """
    if not context:
        return query
    enhanced_parts = [query]
    # "entities" may be present but None; treat that like an empty map.
    entities = context.get("entities") or {}
    for entity_key in ("fine_code", "procedure_name", "office_name"):
        entity_value = entities.get(entity_key)
        if entity_value:
            # str() keeps the final join from raising on non-string values.
            enhanced_parts.append(str(entity_value))
    # Keyword phrase appended for each recognised intent.
    intent_keywords = {
        "search_fine": "mức phạt vi phạm",
        "search_procedure": "thủ tục hành chính",
        "search_office": "đơn vị công an",
    }
    intent = context.get("intent", "")
    # Only string intents are looked up; other values never matched before
    # and an unhashable value would make the dict lookup raise.
    if isinstance(intent, str) and intent in intent_keywords:
        enhanced_parts.append(intent_keywords[intent])
    return " ".join(enhanced_parts)