Spaces:
Sleeping
Sleeping
File size: 2,150 Bytes
import requests
import numpy as np
from typing import List
from sklearn.metrics.pairwise import cosine_distances
from sklearn.cluster import DBSCAN
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if one exists) into the process
# environment so the key lookup below can find them.
load_dotenv()
# Bearer token sent in the Authorization header by get_embeddings();
# this is None when REQUESTY_API_KEY is not set in the environment.
API_KEY = os.getenv("REQUESTY_API_KEY")
# Embeddings endpoint that get_embeddings() POSTs to.
API_URL = "https://router.requesty.ai/v1/embeddings"
def get_embeddings(texts: List[str]) -> np.ndarray:
    """Embed ``texts`` with a single request to the embeddings endpoint.

    POSTs the texts to ``API_URL`` using the
    ``openai/text-embedding-3-small`` model and stacks the returned
    vectors into one array, one row per input text.

    Args:
        texts: Strings to embed. An empty list returns an empty array
            without contacting the API.

    Returns:
        Array with one row per input text, in input order (assuming the
        API returns one equal-length vector per input); shape ``(0, 0)``
        for empty input.

    Raises:
        RuntimeError: If ``REQUESTY_API_KEY`` is not configured, or the
            API answers with a non-200 status.
    """
    if not texts:
        # Nothing to embed: skip the network round-trip entirely.
        return np.empty((0, 0))

    if not API_KEY:
        # Fail early instead of sending a meaningless "Bearer None" header.
        raise RuntimeError(
            "REQUESTY_API_KEY is not set; cannot call the embedding API"
        )

    response = requests.post(
        API_URL,
        headers={
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json",
        },
        json={
            "input": texts,
            "model": "openai/text-embedding-3-small",
            "encoding_format": "float",
        },
        timeout=30,
    )
    if response.status_code != 200:
        raise RuntimeError(
            f"Embedding API error ({response.status_code}): {response.text}"
        )

    items = response.json()["data"]
    # OpenAI-compatible responses tag each item with the position of its
    # input; sort on it so rows line up with ``texts``. The sort is stable,
    # so responses without an "index" field keep their returned order.
    items = sorted(items, key=lambda item: item.get("index", 0))
    return np.array([item["embedding"] for item in items])
def batched_embeddings(texts: List[str], batch_size=50):
    """Embed ``texts`` in consecutive batches and stack the results.

    Splits ``texts`` into slices of at most ``batch_size`` items, calls
    ``get_embeddings`` once per slice, and vertically stacks the per-batch
    arrays so that row ``i`` corresponds to ``texts[i]``.

    Args:
        texts: Strings to embed.
        batch_size: Maximum number of texts sent per request; must be
            at least 1.

    Returns:
        Array with one row per input text; shape ``(0, 0)`` when
        ``texts`` is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A zero step would make range() fail and a negative one would
        # silently produce no batches; report the real problem instead.
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size}"
        )
    if not texts:
        # np.vstack refuses an empty sequence, so short-circuit here.
        return np.empty((0, 0))

    batches = [
        get_embeddings(texts[start:start + batch_size])
        for start in range(0, len(texts), batch_size)
    ]
    return np.vstack(batches)
def get_representatives(texts: List[str], eps: float, min_samples: int):
    """Pick one representative text per group of semantically similar texts.

    The texts are embedded, clustered with DBSCAN on their pairwise cosine
    distances, and for every cluster the member closest (by cosine
    distance) to the cluster centroid is returned. Points DBSCAN labels as
    noise (``-1``) are each kept as their own single-member cluster, so
    they are always returned.

    Args:
        texts: Candidate strings.
        eps: DBSCAN neighbourhood radius, expressed as a cosine distance.
        min_samples: DBSCAN ``min_samples`` setting.

    Returns:
        List of representative texts, one per cluster, ordered by the
        first appearance of each cluster in ``texts``. Empty when
        ``texts`` is empty.
    """
    if not texts:
        # Nothing to cluster; avoids stacking / distance errors downstream.
        return []

    embeddings = batched_embeddings(texts)
    distance_matrix = cosine_distances(embeddings)
    labels = DBSCAN(
        eps=eps,
        min_samples=min_samples,
        metric="precomputed",
    ).fit(distance_matrix).labels_

    clusters = {}
    for idx, label in enumerate(labels):
        if label == -1:
            # Noise points get a unique key so each survives on its own.
            clusters[f"noise_{idx}"] = [idx]
        else:
            clusters.setdefault(label, []).append(idx)

    representatives = []
    for indices in clusters.values():
        cluster_embeddings = embeddings[indices]
        centroid = np.mean(cluster_embeddings, axis=0)
        # Distance of every member to the centroid, flattened to 1-D.
        distances = cosine_distances(
            cluster_embeddings, centroid.reshape(1, -1)
        ).flatten()
        best_idx = indices[np.argmin(distances)]
        representatives.append(texts[best_idx])
    return representatives