# utils/vector_utils.py
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

# Module-level TF-IDF vectorizer, created lazily on first use
_vectorizer = None


def _get_vectorizer():
    """
    Get or lazily initialize the shared TF-IDF vectorizer.
    """
    global _vectorizer
    if _vectorizer is None:
        _vectorizer = TfidfVectorizer(
            max_features=5000,
            stop_words='english',
            ngram_range=(1, 2)
        )
    return _vectorizer
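

# Note: stop_words='english' filters English stop words only, so common
# Vietnamese function words pass through the vectorizer unchanged. A custom
# list can be supplied instead; the words below are a hypothetical
# illustration, not part of the original configuration:
#
#     TfidfVectorizer(max_features=5000,
#                     stop_words=['và', 'của', 'là', 'có', 'được'],
#                     ngram_range=(1, 2))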


def preprocess_text(text):
    """
    Preprocess Vietnamese text for vectorization.

    Args:
        text (str): Raw Vietnamese text

    Returns:
        str: Preprocessed text
    """
    # Lowercase (str.lower() preserves Vietnamese diacritical marks)
    text = text.lower()
    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Vietnamese diacritical marks must be preserved, so strip only
    # punctuation that does not affect meaning
    text = re.sub(r'[.,;:!?()"\'\[\]/\\]', ' ', text)
    # Remove digits
    text = re.sub(r'\d+', '', text)
    # Collapse extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    # Use Vietnamese-specific word segmentation if available
    try:
        from underthesea import word_tokenize
        text = word_tokenize(text, format="text")
    except ImportError:
        # Fall back to the whitespace-separated text if underthesea
        # is not installed
        pass
    return text
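

# Usage sketch (hypothetical input; the exact segmentation depends on
# whether underthesea is installed and on its model version):
#
#   cleaned = preprocess_text("Tin nóng từ Hà Nội: xem https://example.com, số 123!")
#   # -> lowercased text with the URL, digits and punctuation stripped;
#   #    with underthesea installed, multi-syllable words may be joined
#   #    with underscores (e.g. "hà nội" -> "hà_nội"), which the
#   #    TF-IDF vectorizer then counts as single tokens.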


def extract_features(text):
    """
    Extract a TF-IDF feature vector from text.

    Args:
        text (str): Input text

    Returns:
        numpy.ndarray: Dense feature vector
    """
    vectorizer = _get_vectorizer()
    processed_text = preprocess_text(text)
    # If the vectorizer has never been fitted, fit it on this single
    # document. Caveat: the vocabulary is then limited to this one text,
    # so for meaningful cross-document similarity the vectorizer should
    # be fitted on the full corpus first (see fit_vectorizer below).
    if not hasattr(vectorizer, 'vocabulary_'):
        vectorizer.fit([processed_text])
    # Transform the text into the shared TF-IDF space
    vector = vectorizer.transform([processed_text])
    # Return as a dense 1-D array
    return vector.toarray()[0]
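

# The helper below is a suggested addition, not part of the original file:
# without it, extract_features() fits the vectorizer on the first document
# it sees, so the TF-IDF vocabulary covers only that single text. The name
# fit_vectorizer is ours.
def fit_vectorizer(corpus):
    """
    Fit the shared TF-IDF vectorizer on a full corpus (sketch).

    Args:
        corpus (List[str]): Raw texts used to build the vocabulary
    """
    vectorizer = _get_vectorizer()
    vectorizer.fit([preprocess_text(doc) for doc in corpus])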


def compute_similarity(vec1, vec2):
    """
    Compute cosine similarity between two vectors.

    Args:
        vec1 (numpy.ndarray): First vector
        vec2 (numpy.ndarray): Second vector

    Returns:
        float: Similarity score, in [0, 1] for non-negative TF-IDF vectors
    """
    # cosine_similarity expects 2-D inputs, so promote 1-D vectors
    if vec1.ndim == 1:
        vec1 = vec1.reshape(1, -1)
    if vec2.ndim == 1:
        vec2 = vec2.reshape(1, -1)
    sim = cosine_similarity(vec1, vec2)[0][0]
    return float(sim)
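

# Quick sanity checks (hand-picked vectors, mathematically exact: identical
# unit vectors have cosine similarity 1, orthogonal vectors 0):
#
#   compute_similarity(np.array([1.0, 0.0]), np.array([1.0, 0.0]))  # -> 1.0
#   compute_similarity(np.array([1.0, 0.0]), np.array([0.0, 1.0]))  # -> 0.0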


def find_similar_vectors(query_vector, vectors, threshold=0.7, top_n=10):
    """
    Find the vectors most similar to a query vector.

    Args:
        query_vector (numpy.ndarray): Query vector
        vectors (List[numpy.ndarray]): Vectors to compare against
        threshold (float): Minimum similarity to keep a result
        top_n (int): Maximum number of results to return

    Returns:
        List[Tuple[int, float]]: (index, similarity) pairs, best first
    """
    similarities = []
    for i, vec in enumerate(vectors):
        sim = compute_similarity(query_vector, vec)
        if sim >= threshold:
            similarities.append((i, sim))
    # Sort by similarity, descending
    similarities.sort(key=lambda x: x[1], reverse=True)
    # Return the top N results
    return similarities[:top_n]
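

# For large collections, the per-vector Python loop in find_similar_vectors
# can be replaced with a single matrix operation. The variant below is a
# suggested sketch, not part of the original file; the name
# find_similar_vectors_batched is ours.
def find_similar_vectors_batched(query_vector, vectors, threshold=0.7, top_n=10):
    """
    Vectorized variant of find_similar_vectors: one matrix product
    instead of a Python loop over vectors.
    """
    matrix = np.vstack(vectors)  # shape: (n_vectors, dim)
    sims = cosine_similarity(query_vector.reshape(1, -1), matrix)[0]
    # Keep indices that clear the threshold, best matches first
    results = [(i, float(s)) for i, s in enumerate(sims) if s >= threshold]
    results.sort(key=lambda x: x[1], reverse=True)
    return results[:top_n]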