# utils/vector_utils.py
"""Text preprocessing, TF-IDF feature extraction and cosine-similarity helpers.

All feature vectors are produced by one module-level ``TfidfVectorizer`` that
is created lazily by ``_get_vectorizer()`` and shared by every caller.
"""
import functools
import re

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Cleaning patterns, applied by preprocess_text() in the order listed here.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>')
_PUNCTUATION_RE = re.compile(r'[.,;:!?()"\'\[\]/\\]')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')

# Shared vectorizer instance; stays None until _get_vectorizer() is first called.
_vectorizer = None


def _get_vectorizer():
    """
    Get or initialize the shared TF-IDF vectorizer

    Returns:
        TfidfVectorizer: The module-wide instance (created on first call).
            It is returned unfitted until something calls ``fit`` on it.
    """
    global _vectorizer
    if _vectorizer is None:
        # NOTE(review): stop_words='english' is applied although
        # preprocess_text() targets Vietnamese text -- confirm this is intended.
        _vectorizer = TfidfVectorizer(
            max_features=5000,
            stop_words='english',
            ngram_range=(1, 2),
        )
    return _vectorizer


@functools.lru_cache(maxsize=1)
def _get_word_tokenizer():
    """
    Look up the optional Vietnamese word tokenizer once per process

    Returns:
        callable or None: ``underthesea.word_tokenize`` when the package can
            be imported, otherwise None.
    """
    # Cached so a missing package is not re-imported (and does not fail
    # again) on every preprocess_text() call.
    try:
        from underthesea import word_tokenize
    except ImportError:
        return None
    return word_tokenize


def preprocess_text(text):
    """
    Preprocess Vietnamese text for vectorization

    Lowercases the text, strips URLs and HTML tags, replaces a fixed set of
    punctuation characters with spaces, removes digit runs, collapses
    whitespace and, when ``underthesea`` is installed, passes the result
    through its ``word_tokenize(..., format="text")``.

    Args:
        text (str): Raw Vietnamese text. ``None`` or an empty string is
            accepted and yields an empty string.

    Returns:
        str: Preprocessed text
    """
    if not text:
        return ""

    # Lowercasing keeps Vietnamese diacritical marks intact.
    text = text.lower()

    # URLs go first so their punctuation is not split into stray tokens.
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)

    # Only this explicit punctuation set is removed; every other character
    # (including diacritics) is preserved.
    text = _PUNCTUATION_RE.sub(' ', text)
    text = _DIGITS_RE.sub('', text)

    text = _WHITESPACE_RE.sub(' ', text).strip()
    if not text:
        # Nothing left to tokenize.
        return text

    # Use Vietnamese-specific tokenization if available; otherwise the
    # cleaned text is returned as-is.
    word_tokenize = _get_word_tokenizer()
    if word_tokenize is not None:
        text = word_tokenize(text, format="text")

    return text


def extract_features(text):
    """
    Extract a TF-IDF feature vector from text

    NOTE: if the shared vectorizer has not been fitted yet, it is fitted on
    this single text. The vocabulary (and therefore the vector length) used
    by every later call is then the one learned from the first text
    processed. Fit the vectorizer returned by ``_get_vectorizer()`` on a
    representative corpus beforehand if vectors must be comparable across
    documents.

    Args:
        text (str): Input text

    Returns:
        numpy.ndarray: 1-D dense feature vector

    Raises:
        ValueError: Propagated from scikit-learn when the vectorizer has to
            be fitted and the preprocessed text yields no vocabulary
            (for example, empty input).
    """
    vectorizer = _get_vectorizer()
    processed_text = preprocess_text(text)

    # A fitted vectorizer exposes ``vocabulary_``; fit lazily when missing.
    if not hasattr(vectorizer, 'vocabulary_'):
        vectorizer.fit([processed_text])

    # transform() returns a 1 x n_features sparse matrix; return its only
    # row as a dense array.
    vector = vectorizer.transform([processed_text])
    return vector.toarray()[0]


def _as_row_matrix(vec):
    """Return ``vec`` as a 2-D matrix, turning a 1-D vector into one row."""
    # Objects that already expose ``shape`` (ndarrays, sparse matrices) are
    # left untouched; plain sequences are converted first.
    if not hasattr(vec, "shape"):
        vec = np.asarray(vec)
    if len(vec.shape) == 1:
        vec = vec.reshape(1, -1)
    return vec


def compute_similarity(vec1, vec2):
    """
    Compute cosine similarity between two vectors

    Args:
        vec1 (numpy.ndarray): First vector (1-D, or 2-D with one row).
            Plain sequences of numbers are accepted as well.
        vec2 (numpy.ndarray): Second vector, same conventions as ``vec1``.

    Returns:
        float: Cosine similarity. In general this lies in [-1, 1]; it is
            non-negative for non-negative inputs such as the TF-IDF vectors
            from ``extract_features``. For 2-D inputs with several rows only
            the similarity of the first rows is returned.
    """
    vec1 = _as_row_matrix(vec1)
    vec2 = _as_row_matrix(vec2)

    # cosine_similarity returns a matrix; entry [0][0] is row 0 vs row 0.
    sim = cosine_similarity(vec1, vec2)[0][0]
    return float(sim)


def find_similar_vectors(query_vector, vectors, threshold=0.7, top_n=10):
    """
    Find similar vectors to a query vector

    Args:
        query_vector (numpy.ndarray): Query vector
        vectors (List[numpy.ndarray]): List of vectors to compare against
        threshold (float): Minimum similarity (inclusive) for a match
        top_n (int): Maximum number of results to return

    Returns:
        List[Tuple[int, float]]: ``(index, similarity)`` tuples sorted by
            similarity in descending order; ties keep their original index
            order. Empty when nothing reaches ``threshold``.
    """
    scored = (
        (index, compute_similarity(query_vector, vec))
        for index, vec in enumerate(vectors)
    )
    matches = [pair for pair in scored if pair[1] >= threshold]

    # Stable sort: equally similar vectors stay in input order.
    matches.sort(key=lambda pair: pair[1], reverse=True)

    return matches[:top_n]