import os

# Select a single GPU before torch initializes CUDA.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

model_id = "Qwen/Qwen3-Embedding-4B"
lang_code = "en"
save_path = f"/home/mshahidul/readctrl/data/vector_db/qwen_em/{lang_code}_wikipedia_qwen3_index.faiss"
batch_size = 8

# Load the embedding model in bfloat16 to reduce GPU memory usage.
model = SentenceTransformer(model_id, trust_remote_code=True, model_kwargs={"torch_dtype": torch.bfloat16})

# Stream the Wikipedia dump so the full corpus never has to fit in memory.
ds = load_dataset("wikimedia/wikipedia", f"20231101.{lang_code}", split="train", streaming=True)


def embed_wikipedia(dataset, model, batch_size):
    """Embed every article in `dataset` and add the vectors to a FAISS index."""
    index = None
    batch_texts = []
    print("Starting embedding process...")

    def flush(texts, index):
        # Encode one batch and append it to the (lazily created) index.
        embeddings = model.encode(texts, show_progress_bar=False)
        embeddings = np.array(embeddings).astype("float32")
        if index is None:
            index = faiss.IndexFlatL2(embeddings.shape[1])  # exact L2 search
        index.add(embeddings)
        return index

    for i, item in enumerate(dataset):
        batch_texts.append(item["text"])

        if len(batch_texts) == batch_size:
            index = flush(batch_texts, index)
            batch_texts = []

        if (i + 1) % 100 == 0:
            print(f"Processed {i + 1} documents...")

    # Encode any leftover documents that did not fill a complete batch.
    if batch_texts:
        index = flush(batch_texts, index)

    return index


# Ensure the output directory exists, then build and persist the index.
os.makedirs(os.path.dirname(save_path), exist_ok=True)
vector_index = embed_wikipedia(ds, model, batch_size)
faiss.write_index(vector_index, save_path)
print(f"Index saved to {save_path}")