File size: 587 Bytes
96eb5a4
 
 
9109e81
96eb5a4
 
 
 
 
 
 
9109e81
96eb5a4
 
9109e81
96eb5a4
82a7780
96eb5a4
9109e81
96eb5a4
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import faiss
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def semantic_clusters(documents, k=5):
    """
    Group documents into at most ``k`` clusters by TF-IDF similarity.

    Each document is embedded as a TF-IDF vector (English stop words
    removed, vocabulary capped at 512 terms) and the vectors are
    partitioned with Faiss k-means.

    documents: list[str]
    k: int, requested number of clusters. Clamped to the range
        [1, len(documents)] because k-means cannot produce more
        clusters than there are points.
    returns: list[int] cluster ids, one per document, in input order.
        Ids lie in ``range(min(k, len(documents)))``; documents that
        share an id belong to the same cluster.
    raises: ValueError (from TfidfVectorizer) when no vocabulary can be
        built, e.g. every document is empty or only stop words.
    """
    n_docs = len(documents)
    if n_docs < 2:
        return [0] * n_docs

    n_clusters = max(1, min(int(k), n_docs))
    if n_clusters == 1:
        # A single cluster needs no vectorizing or training.
        return [0] * n_docs

    vectorizer = TfidfVectorizer(max_features=512, stop_words="english")
    tfidf = vectorizer.fit_transform(documents)
    # Faiss only accepts C-contiguous float32 matrices.
    vectors = np.ascontiguousarray(tfidf.toarray(), dtype=np.float32)

    # Searching an index with the very vectors it stores returns each
    # vector's own row (distance 0), which is not a clustering. Train
    # k-means centroids instead and label each document with its
    # nearest centroid.
    kmeans = faiss.Kmeans(
        vectors.shape[1],
        n_clusters,
        seed=1234,  # fixed seed keeps labels reproducible across calls
        min_points_per_centroid=1,  # small corpora are expected here
    )
    kmeans.train(vectors)

    _, labels = kmeans.index.search(vectors, 1)
    return labels.ravel().tolist()