import os

# Environment setup: pin GPU selection BEFORE importing torch.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

# 1. Configuration
model_id = "Qwen/Qwen3-Embedding-4B"
lang_code = "en"  # Change to your desired language
save_path = f"/home/mshahidul/readctrl/data/vector_db/qwen_em/{lang_code}_wikipedia_qwen3_index.faiss"
batch_size = 8  # Adjust based on your GPU VRAM (4B model is heavy)

# 2. Load Model
# Note: Qwen3 might require trust_remote_code=True depending on the implementation.
model = SentenceTransformer(
    model_id,
    trust_remote_code=True,
    model_kwargs={"torch_dtype": torch.bfloat16},  # bfloat16 halves VRAM for the 4B model
)

# 3. Load Dataset (Streaming)
ds = load_dataset("wikimedia/wikipedia", f"20231101.{lang_code}", split="train", streaming=True)


def embed_wikipedia(dataset, model, batch_size):
    """Embed every document's text and collect the vectors in a FAISS L2 index.

    Parameters
    ----------
    dataset : iterable of dicts with a 'text' key (e.g. a streaming HF dataset).
    model : encoder exposing ``encode(list[str]) -> 2-D array`` of embeddings.
    batch_size : number of texts encoded per model call.

    Returns
    -------
    ``faiss.IndexFlatL2`` with one vector per document, or ``None`` if the
    dataset yielded no documents at all.
    """
    index = None
    batch_texts = []

    def _flush(texts):
        # Encode one batch and append it to the index; the index is created
        # lazily on the first batch, once the embedding dimension is known.
        nonlocal index
        embeddings = np.asarray(
            model.encode(texts, show_progress_bar=False), dtype="float32"
        )
        if index is None:
            index = faiss.IndexFlatL2(embeddings.shape[1])
        index.add(embeddings)

    # NOTE: storing per-document metadata (e.g. the raw text) in RAM alongside
    # the index would exhaust memory for full Wikipedia; persist document IDs
    # externally if you need to map vectors back to articles.
    print("Starting embedding process...")
    for i, item in enumerate(dataset):
        batch_texts.append(item["text"])
        if len(batch_texts) == batch_size:
            _flush(batch_texts)
            batch_texts = []
        if i % 100 == 0:
            print(f"Processed {i} documents...")

    # BUG FIX: the original dropped the final partial batch — any leftover
    # documents when the stream ends mid-batch were never embedded.
    if batch_texts:
        _flush(batch_texts)

    return index


# 4. Run and Save
vector_index = embed_wikipedia(ds, model, batch_size)
if vector_index is None:
    # Guard against faiss.write_index(None, ...) crashing on an empty dataset.
    print("No documents were embedded; nothing to save.")
else:
    os.makedirs(os.path.dirname(save_path), exist_ok=True)  # ensure target dir exists
    faiss.write_index(vector_index, save_path)
    print(f"Index saved to {save_path}")