File size: 2,150 Bytes
025f187
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import requests
import numpy as np
from typing import List

from sklearn.metrics.pairwise import cosine_distances
from sklearn.cluster import DBSCAN
import os

from dotenv import load_dotenv


load_dotenv()  # read a local .env file so REQUESTY_API_KEY can be set there


# Requesty router credentials/endpoint.
# NOTE(review): API_KEY is None if the env var is unset — requests would then
# be sent with "Bearer None" and fail at the API; consider failing fast here.
API_KEY = os.getenv("REQUESTY_API_KEY")
API_URL = "https://router.requesty.ai/v1/embeddings"



def get_embeddings(texts: List[str]) -> np.ndarray:
    """Embed *texts* via the Requesty router.

    Args:
        texts: Texts to embed; an empty list is handled locally without an
            API call.

    Returns:
        Array of shape (len(texts), dim) where row i is the embedding of
        texts[i]; an empty (0, 0) array for empty input.

    Raises:
        RuntimeError: If the API responds with a non-200 status.
    """
    if not texts:
        # Short-circuit: don't send an empty "input" list to the API.
        return np.empty((0, 0))

    response = requests.post(
        API_URL,
        headers={
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json",
        },
        json={
            "input": texts,
            "model": "openai/text-embedding-3-small",
            "encoding_format": "float",
        },
        timeout=30,
    )

    if response.status_code != 200:
        # Include the status code so client (4xx) vs. transient (5xx)
        # failures are distinguishable from the message alone.
        raise RuntimeError(
            f"Embedding API error ({response.status_code}): {response.text}"
        )

    data = response.json()
    # The API does not guarantee response items arrive in request order; each
    # item carries an explicit "index" field — sort by it so that row i of the
    # returned array corresponds to texts[i].
    items = sorted(data["data"], key=lambda item: item["index"])
    return np.array([item["embedding"] for item in items])


def batched_embeddings(texts: List[str], batch_size: int = 50) -> np.ndarray:
    """Embed *texts* in chunks of *batch_size* and stack the results.

    Chunking keeps each request payload within API limits.

    Args:
        texts: Texts to embed.
        batch_size: Maximum number of texts per API request.

    Returns:
        Array of shape (len(texts), dim); an empty (0, 0) array for empty
        input (np.vstack raises ValueError on an empty sequence otherwise).
    """
    if not texts:
        return np.empty((0, 0))

    chunks = [
        get_embeddings(texts[start:start + batch_size])
        for start in range(0, len(texts), batch_size)
    ]
    return np.vstack(chunks)

def get_representatives(texts: List[str], eps: float, min_samples: int) -> List[str]:
    """Cluster *texts* by embedding similarity and return one text per cluster.

    DBSCAN is run on a precomputed cosine-distance matrix of the embeddings;
    each cluster contributes its member closest to the cluster centroid.
    Noise points (label -1) are kept as singleton clusters, so every input
    text is represented by some output text.

    Args:
        texts: Texts to reduce to representatives.
        eps: DBSCAN neighborhood radius, in cosine-distance space.
        min_samples: DBSCAN core-point threshold.

    Returns:
        One representative text per cluster/noise point; [] for empty input.
    """
    if not texts:
        # Avoid hitting the embedding API (and np.vstack/DBSCAN) with nothing.
        return []

    embeddings = batched_embeddings(texts)
    distance_matrix = cosine_distances(embeddings)

    clustering = DBSCAN(
        eps=eps,
        min_samples=min_samples,
        metric="precomputed",
    ).fit(distance_matrix)

    # Group member indices by cluster label; each noise point gets its own
    # synthetic singleton key so it is never silently dropped.
    clusters = {}
    for idx, label in enumerate(clustering.labels_):
        if label == -1:
            clusters[f"noise_{idx}"] = [idx]
        else:
            clusters.setdefault(label, []).append(idx)

    representatives = []
    for indices in clusters.values():
        cluster_embeddings = embeddings[indices]
        centroid = np.mean(cluster_embeddings, axis=0)

        # The member nearest the centroid (by cosine distance) stands in
        # for the whole cluster.
        distances = cosine_distances(
            cluster_embeddings, centroid.reshape(1, -1)
        ).flatten()
        best_idx = indices[np.argmin(distances)]
        representatives.append(texts[best_idx])

    return representatives