# CTRS/utils/filtering.py
# quantumbit - initial commit (025f187)
import requests
import numpy as np
from typing import List
from sklearn.metrics.pairwise import cosine_distances
from sklearn.cluster import DBSCAN
import os
from dotenv import load_dotenv
# Load variables from a .env file (if one is found) into the process
# environment before the key below is read.
load_dotenv()
# Bearer token sent to the embeddings endpoint; None when the
# REQUESTY_API_KEY environment variable is not set.
API_KEY = os.getenv("REQUESTY_API_KEY")
# URL that get_embeddings() posts its requests to.
API_URL = "https://router.requesty.ai/v1/embeddings"
def get_embeddings(texts: List[str]) -> np.ndarray:
    """Fetch embedding vectors for ``texts`` with a single API request.

    The whole list is posted to ``API_URL`` as one request, authenticated
    with ``API_KEY`` as a bearer token.

    Args:
        texts: Strings to embed.

    Returns:
        A 2-D array with one embedding row per returned item. Rows are
        ordered by each item's ``index`` field when the response provides
        one (assumed to mirror the input position, as in OpenAI-compatible
        responses); otherwise the response order is kept. An empty ``texts``
        yields an empty ``(0, 0)`` array without any network call.

    Raises:
        RuntimeError: If the API answers with a status code other than 200.
    """
    # Nothing to embed: skip a request that could only fail.
    if not texts:
        return np.empty((0, 0))

    response = requests.post(
        API_URL,
        headers={
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json",
        },
        json={
            "input": texts,
            "model": "openai/text-embedding-3-small",
            "encoding_format": "float",
        },
        timeout=30,
    )

    if response.status_code != 200:
        raise RuntimeError(f"Embedding API error: {response.text}")

    payload = response.json()
    # sorted() is stable, so items without an "index" keep response order.
    items = sorted(payload["data"], key=lambda item: item.get("index", 0))
    return np.array([item["embedding"] for item in items])
def batched_embeddings(texts: List[str], batch_size=50) -> np.ndarray:
    """Embed ``texts`` in consecutive batches and stack the results.

    Args:
        texts: Strings to embed.
        batch_size: Maximum number of texts sent to ``get_embeddings`` per
            call. Must be a positive integer.

    Returns:
        A 2-D array with the rows of every batch stacked in input order.
        An empty ``texts`` yields an empty ``(0, 0)`` array.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # range() rejects a zero step and a negative step yields no batches,
    # both of which used to surface as confusing downstream errors.
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # np.vstack raises on an empty sequence, so handle "no input" up front.
    if not texts:
        return np.empty((0, 0))

    batches = [
        get_embeddings(texts[start:start + batch_size])
        for start in range(0, len(texts), batch_size)
    ]
    return np.vstack(batches)
def get_representatives(
    texts: List[str], eps: float, min_samples: int
) -> List[str]:
    """Return one representative text per cluster of similar texts.

    Texts are embedded with ``batched_embeddings`` and clustered with DBSCAN
    over their pairwise cosine distances. For each cluster the text whose
    embedding has the smallest cosine distance to the cluster's mean
    embedding is kept. Texts that DBSCAN labels as noise (``-1``) are each
    treated as their own single-item cluster, so they are always kept.

    Args:
        texts: Candidate strings to de-duplicate.
        eps: DBSCAN ``eps``, applied to cosine distances.
        min_samples: DBSCAN ``min_samples``.

    Returns:
        The representative texts, ordered by the first appearance of their
        cluster in ``texts``. An empty ``texts`` yields an empty list.
    """
    # Nothing to cluster: avoid calling the embedding API and DBSCAN on
    # empty input, which would raise.
    if not texts:
        return []

    embeddings = batched_embeddings(texts)
    distance_matrix = cosine_distances(embeddings)
    labels = DBSCAN(
        eps=eps,
        min_samples=min_samples,
        metric="precomputed",
    ).fit(distance_matrix).labels_

    # Group indices by cluster label; every noise point gets a unique key so
    # that it forms its own group instead of merging with other noise.
    clusters = {}
    for idx, label in enumerate(labels):
        key = f"noise_{idx}" if label == -1 else label
        clusters.setdefault(key, []).append(idx)

    representatives = []
    for indices in clusters.values():
        if len(indices) == 1:
            # A lone member is trivially its own representative.
            representatives.append(texts[indices[0]])
            continue

        cluster_embeddings = embeddings[indices]
        centroid = np.mean(cluster_embeddings, axis=0)
        distances = cosine_distances(
            cluster_embeddings, centroid.reshape(1, -1)
        ).flatten()
        best_idx = indices[np.argmin(distances)]
        representatives.append(texts[best_idx])

    return representatives