GodsDevProject committed on
Commit
82a7780
·
verified ·
1 Parent(s): 2632d51

Create ingest/cluster.py

Browse files
Files changed (1) hide show
  1. ingest/cluster.py +16 -29
ingest/cluster.py CHANGED
@@ -1,33 +1,20 @@
1
- from typing import List, Dict
2
-
3
def build_embeddings(results: List[Dict]):
    """Encode the ``snippet`` of every result with the all-MiniLM-L6-v2 model.

    This is a best-effort helper: embedding support is optional, so any
    failure yields ``None`` instead of an exception.

    Args:
        results: Search-result records. Each record's ``"snippet"`` value is
            used as the text to embed; a missing or ``None`` snippet is
            embedded as the empty string.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for the snippets, or
        ``None`` when sentence-transformers is not importable or when loading
        the model / encoding fails.
    """
    import logging

    log = logging.getLogger(__name__)

    try:
        from sentence_transformers import SentenceTransformer
    except ImportError:
        # Optional dependency: its absence is expected, so keep this quiet.
        log.debug("sentence-transformers is not installed; skipping embeddings")
        return None

    try:
        # `or ""` also covers records whose snippet is explicitly None, which
        # would otherwise make the encoder reject the whole batch.
        texts = [r.get("snippet") or "" for r in results]
        model = SentenceTransformer("all-MiniLM-L6-v2")
        return model.encode(texts, show_progress_bar=False)
    except Exception:
        # Still best-effort, but leave a trace so real failures (model load
        # errors, malformed records) are not mistaken for a missing package.
        log.warning("Could not build embeddings", exc_info=True)
        return None
11
 
 
12
 
13
def cluster_embeddings(results: List[Dict], embeddings):
    """Group results by their ``source`` field.

    The embeddings only gate the operation: when they are ``None`` no grouping
    is attempted. They are not otherwise consulted. An earlier version built a
    FAISS index from them and then discarded it, which made the function
    return ``{}`` whenever FAISS was unavailable even though the grouping
    itself never needed it.

    Args:
        results: Search-result records; each must provide ``"source"`` and
            ``"title"``.
        embeddings: Embeddings for ``results`` or ``None``.

    Returns:
        A dict mapping each source to a list of ``{"title", "index"}`` entries,
        where ``index`` is the record's position in ``results``. Returns ``{}``
        when ``embeddings`` is ``None``, when ``results`` is empty, or when a
        record is malformed (all-or-nothing).
    """
    if embeddings is None or len(results) == 0:
        return {}

    clusters = {}
    try:
        for position, record in enumerate(results):
            entry = {"title": record["title"], "index": position}
            clusters.setdefault(record["source"], []).append(entry)
    except (KeyError, TypeError):
        # Missing field, non-mapping record, or unhashable source: give up on
        # the whole batch rather than return a partial grouping.
        return {}
    return clusters
 
1
+ from sentence_transformers import SentenceTransformer
2
+ import faiss
3
+ import numpy as np
 
 
 
 
 
 
 
4
 
5
+ model = SentenceTransformer("all-MiniLM-L6-v2")  # created once at import time; semantic_cluster reuses this instance
6
 
7
def semantic_cluster(results):
    """Embed each result's title and snippet with the module-level ``model``.

    Args:
        results: Iterable of search-result records. The text embedded for a
            record is ``"<title> <snippet>"``; a missing or ``None`` field is
            treated as the empty string.

    Returns:
        ``None`` when there are no results. Otherwise a dict with:

        * ``"points"``: the embeddings converted with ``.tolist()``.
        * ``"labels"``: ``0 .. len(results) - 1``. These are positional
          indices, one distinct label per result; no grouping is computed.
    """
    texts = [
        f'{r.get("title") or ""} {r.get("snippet") or ""}'
        for r in results
    ]
    if not texts:
        return None

    # The FAISS index that used to be built here was never queried, so it has
    # been dropped; the returned payload is unchanged.
    embeddings = model.encode(texts)

    return {
        "points": embeddings.tolist(),
        "labels": list(range(len(results))),
    }