Spaces:
Sleeping
Sleeping
File size: 587 Bytes
96eb5a4 9109e81 96eb5a4 9109e81 96eb5a4 9109e81 96eb5a4 82a7780 96eb5a4 9109e81 96eb5a4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
import faiss
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
def semantic_clusters(documents, k=5):
    """Assign each document to one of at most ``k`` TF-IDF k-means clusters.

    Documents are embedded with a TF-IDF vectorizer (English stop words
    removed, vocabulary capped at 512 terms) and the resulting vectors are
    partitioned with FAISS k-means.

    Args:
        documents: list[str] of texts to cluster.
        k: Requested number of clusters. It is clamped to the range
            ``[1, len(documents)]`` because k-means cannot be trained with
            more centroids than points.

    Returns:
        list[int] with one cluster id per document, in input order. Inputs
        with fewer than two documents are placed in a single cluster ``0``.
    """
    n_docs = len(documents)
    if n_docs < 2:
        return [0] * n_docs

    vectorizer = TfidfVectorizer(max_features=512, stop_words="english")
    # FAISS requires a C-contiguous float32 matrix.
    vectors = np.ascontiguousarray(
        vectorizer.fit_transform(documents).toarray(), dtype="float32"
    )

    n_clusters = max(1, min(k, n_docs))
    kmeans = faiss.Kmeans(vectors.shape[1], n_clusters)
    kmeans.train(vectors)

    # Search the centroid index, not an index of the documents themselves:
    # a document's nearest neighbour among the documents is always itself,
    # which would give every document its own "cluster".
    _, labels = kmeans.index.search(vectors, 1)
    return labels.flatten().tolist()