FOIA_Doc_Search / ingest /cluster.py
GodsDevProject's picture
Create ingest/cluster.py
96eb5a4 verified
import faiss
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
def semantic_clusters(documents, k=5):
    """Group documents into at most ``k`` clusters using TF-IDF and k-means.

    Each document is turned into a TF-IDF vector (at most 512 features,
    English stop words removed) and the vectors are clustered with
    ``faiss.Kmeans``. Every document is then labelled with the id of its
    nearest centroid.

    Args:
        documents: list[str] of raw document texts.
        k: Requested number of clusters. It is clamped to the range
            ``[1, len(documents)]`` because k-means cannot be trained with
            more centroids than points.

    Returns:
        list[int] with one cluster id per document, in input order. Ids lie
        in ``range(min(k, len(documents)))``; a centroid may end up with no
        documents, so not every id is guaranteed to appear.
    """
    n_docs = len(documents)
    if n_docs < 2:
        return [0] * n_docs

    n_clusters = max(1, min(k, n_docs))
    if n_clusters == 1:
        # A single cluster needs no vectorisation or training.
        return [0] * n_docs

    vectorizer = TfidfVectorizer(max_features=512, stop_words="english")
    # faiss operates on dense float32 matrices.
    vectors = vectorizer.fit_transform(documents).toarray().astype("float32")
    dim = vectors.shape[1]

    # Searching an index built from the documents themselves would only
    # return each document's own row; train centroids and search those.
    kmeans = faiss.Kmeans(dim, n_clusters)
    kmeans.train(vectors)

    # kmeans.index holds the trained centroids; the nearest one is the label.
    _, assignments = kmeans.index.search(vectors, 1)
    return assignments.ravel().tolist()