EYEDOL committed on
Commit
3ac55fd
·
verified ·
1 Parent(s): 30a56dc

Create build_index.py

Browse files
Files changed (1) hide show
  1. build_index.py +94 -0
build_index.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # build_index.py
2
+ import os
3
+ import json
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from tqdm import tqdm
8
+ from datasets import concatenate_datasets, load_dataset
9
+ from transformers import AutoProcessor, AutoModel
10
+ from PIL import Image
11
+
# ========== CONFIG ==========
# Hugging Face Hub id of the model used to embed the texts.
MODEL_ID = "EYEDOL/siglipFULL-agri-finetuned"
# Dataset shards numbered 1..15 (range's upper bound is exclusive).
DATASET_NAMES = [f"EYEDOL/AGRILLAVA-image-text{i}" for i in range(1, 16)]
# Number of texts encoded per forward pass.
BATCH_SIZE = 16
# Directory that receives every artifact written by this script.
OUT_DIR = "faiss_data"
INDEX_FILE = os.path.join(OUT_DIR, "texts.faiss")         # serialized FAISS index
METADATA_FILE = os.path.join(OUT_DIR, "texts_meta.json")  # small JSON descriptor
EMBEDS_FILE = os.path.join(OUT_DIR, "text_embeds.npy")    # raw embedding matrix
# Run the model on the GPU when one is available, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
USE_FAISS_GPU = False  # set True if you have faiss-gpu and want GPU index building
# ============================
23
+
# Make sure the output directory exists before anything is written to it.
os.makedirs(OUT_DIR, exist_ok=True)

# Load the "train" split of every configured dataset, concatenate them, and
# pull the "text" column into a plain Python list. The position of a text in
# this list is the id it later gets in the FAISS index.
print("Loading datasets...")
all_splits = [load_dataset(name)["train"] for name in DATASET_NAMES]
dataset = concatenate_datasets(all_splits)
texts = list(dataset["text"])
print(f"Total texts: {len(texts)}")
31
+
# Load the processor and the model from MODEL_ID, move the model to the
# selected device, and switch it to evaluation mode for inference.
print("Loading model & processor...")
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModel.from_pretrained(MODEL_ID).to(DEVICE)
model.eval()
36
+
# Compute text embeddings (in batches) and L2-normalize them so that an
# inner-product search over the vectors is equivalent to cosine similarity.
if not texts:
    # torch.cat() on an empty list fails with an opaque error; fail early instead.
    raise ValueError("No texts to encode: the 'text' column is empty.")

# NOTE(review): the Hugging Face SigLIP documentation recommends
# padding="max_length" because the base model was trained that way. Confirm
# which padding the fine-tuned checkpoint and the query-time code expect
# before changing padding=True here; both sides must agree.
all_embeds = []
for i in tqdm(range(0, len(texts), BATCH_SIZE), desc="Encoding texts"):
    batch_texts = texts[i:i + BATCH_SIZE]
    inputs = processor(text=batch_texts, padding=True, truncation=True, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        text_embeds = model.get_text_features(**inputs)  # shape (bs, dim)
        # F.normalize divides by max(norm, eps), so an all-zero embedding
        # yields zeros instead of NaNs from a division by zero.
        text_embeds = F.normalize(text_embeds, p=2, dim=-1)
    # Keep only a CPU copy so device memory is not held across batches.
    all_embeds.append(text_embeds.cpu())
    del inputs, text_embeds

# Release cached GPU blocks once after encoding rather than after every
# batch: emptying the cache inside the loop forces the allocator to
# re-request memory on each iteration.
if DEVICE == "cuda":
    torch.cuda.empty_cache()

all_embeds = torch.cat(all_embeds, dim=0).numpy().astype(np.float32)  # (N, D)
print("Embeddings shape:", all_embeds.shape)
52
+
# Save the raw embedding matrix as .npy (optional, but it lets the index be
# rebuilt later without re-encoding the texts).
np.save(EMBEDS_FILE, all_embeds)
print(f"Saved embeddings to {EMBEDS_FILE}")
56
+
# ========== Build FAISS index ==========
# faiss is imported here rather than at the top of the file so that a missing
# installation is reported with an actionable message.
try:
    import faiss  # type: ignore
except ImportError as e:
    raise ImportError("Please install faiss (faiss-cpu or faiss-gpu).") from e

if USE_FAISS_GPU and not hasattr(faiss, "StandardGpuResources"):
    # Fail with a clear message instead of an AttributeError further down.
    raise RuntimeError(
        "USE_FAISS_GPU is True but the installed faiss has no GPU support; "
        "install faiss-gpu or set USE_FAISS_GPU = False."
    )

d = all_embeds.shape[1]
# We'll use inner-product on L2-normalized vectors to get cosine similarity.
index = faiss.IndexFlatIP(d)  # exact index; change if you want IVF/HNSW for large corpora

# If you have faiss-gpu and want to move to GPU:
if USE_FAISS_GPU:
    res = faiss.StandardGpuResources()
    index = faiss.index_cpu_to_gpu(res, 0, index)

print("Adding to index...")
index.add(all_embeds)  # adds vectors in order, ids 0..N-1
print("Index ntotal:", index.ntotal)

# Save index. A GPU index is converted back to a CPU index before writing.
faiss.write_index(faiss.index_gpu_to_cpu(index) if USE_FAISS_GPU else index, INDEX_FILE)
print(f"Saved FAISS index to {INDEX_FILE}")
79
+
# Save metadata (texts).
# The texts filename is defined once so the value recorded in the metadata
# and the file actually written cannot drift apart.
texts_filename = "texts.jsonl"
meta = {
    "count": len(texts),
    "texts_file": texts_filename,
}
with open(METADATA_FILE, "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

# Save the texts as JSON Lines (one {"text": ...} object per line). Line k
# holds the text whose embedding has id k in the index, because both are
# written in the order of `texts`.
texts_file = os.path.join(OUT_DIR, texts_filename)
with open(texts_file, "w", encoding="utf-8") as f:
    for t in texts:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        f.write(json.dumps({"text": t}, ensure_ascii=False) + "\n")

print(f"Saved texts metadata to {texts_file}")
print("Done.")