import faiss
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


def semantic_clusters(documents: list[str], k: int = 5) -> list[int]:
    """Group documents into at most ``k`` clusters of similar text.

    Each document is embedded as a TF-IDF vector (up to 512 features,
    English stop words removed), the vectors are partitioned with faiss
    k-means, and every document is labelled with the id of its nearest
    centroid.

    Args:
        documents: The texts to cluster.
        k: Requested number of clusters. It is capped at
            ``len(documents)`` because k-means cannot be trained with more
            centroids than points.

    Returns:
        One cluster id per document, in input order. Ids lie in
        ``range(min(k, len(documents)))``. Inputs with fewer than two
        documents are all assigned to cluster 0.

    Raises:
        ValueError: If ``k`` is smaller than 1. TfidfVectorizer may also
            raise ValueError when no usable terms remain after stop-word
            removal.
    """
    if k < 1:
        raise ValueError(f"k must be >= 1, got {k}")

    # Nothing to partition: zero or one document is trivially one cluster.
    if len(documents) < 2:
        return [0] * len(documents)

    vectorizer = TfidfVectorizer(max_features=512, stop_words="english")
    tfidf = vectorizer.fit_transform(documents)
    # faiss only accepts dense, C-contiguous float32 matrices.
    vectors = np.ascontiguousarray(tfidf.toarray(), dtype=np.float32)

    n_clusters = min(k, vectors.shape[0])
    dim = vectors.shape[1]

    # A fixed seed keeps labels reproducible across calls;
    # min_points_per_centroid=1 silences faiss's "too few training points"
    # warning, which is expected for small document sets.
    kmeans = faiss.Kmeans(
        dim,
        n_clusters,
        niter=20,
        seed=1234,
        min_points_per_centroid=1,
    )
    kmeans.train(vectors)

    # After training, kmeans.index holds the centroids, so a top-1 search
    # returns the id of the closest centroid, i.e. the cluster label.
    # (Searching an index built from the documents themselves would just
    # return each document's own position.)
    _, labels = kmeans.index.search(vectors, 1)
    return labels.ravel().tolist()