# readctrl/code/vectordb_build/qwen_embed.py
# (uploaded by shahidul034 via the upload-large-folder tool, commit 1db7196)
import os
# Environment Setup.
# NOTE: these must be set BEFORE torch is imported, or CUDA will ignore them.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"  # enumerate GPUs by PCI bus id, not speed
os.environ["CUDA_VISIBLE_DEVICES"] = "2"  # pin this job to physical GPU #2
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
# 1. Configuration
model_id = "Qwen/Qwen3-Embedding-4B"  # 4B-parameter embedding model (VRAM-heavy)
lang_code = "en" # Change to your desired language
save_path = f"/home/mshahidul/readctrl/data/vector_db/qwen_em/{lang_code}_wikipedia_qwen3_index.faiss"
batch_size = 8 # Adjust based on your GPU VRAM (4B model is heavy)
# 2. Load Model
# Note: Qwen3 might require trust_remote_code=True depending on the implementation
model = SentenceTransformer(model_id, trust_remote_code=True, model_kwargs={"torch_dtype": torch.bfloat16}) # Use bfloat16 for Qwen3
# 3. Load Dataset (Streaming)
# streaming=True iterates the Wikipedia dump lazily instead of downloading it all.
ds = load_dataset("wikimedia/wikipedia", f"20231101.{lang_code}", split='train', streaming=True)
def embed_wikipedia(dataset, model, batch_size):
    """Embed every document in *dataset* into a flat-L2 FAISS index.

    Args:
        dataset: iterable of dicts with a 'text' key (e.g. a streaming HF dataset).
        model: object exposing ``encode(list_of_str, show_progress_bar=...)``
            and returning one embedding per input text.
        batch_size: number of texts to encode per model call.

    Returns:
        A ``faiss.IndexFlatL2`` with one vector per document, or ``None`` when
        the dataset yielded no documents — callers must handle that case.
    """
    index = None
    batch_texts = []
    processed = 0

    def _flush(texts, idx):
        """Encode one batch and add it to *idx*, creating the index lazily
        (the embedding dimension is only known after the first encode)."""
        vectors = np.asarray(
            model.encode(texts, show_progress_bar=False), dtype='float32')
        if idx is None:
            idx = faiss.IndexFlatL2(vectors.shape[1])
        idx.add(vectors)
        return idx

    print("Starting embedding process...")
    for item in dataset:
        batch_texts.append(item['text'])
        if len(batch_texts) == batch_size:
            index = _flush(batch_texts, index)
            batch_texts = []
        processed += 1
        if processed % 100 == 0:
            print(f"Processed {processed} documents...")

    # BUG FIX: the original dropped the trailing partial batch, losing up to
    # batch_size - 1 documents at the end of the stream. Embed it too.
    if batch_texts:
        index = _flush(batch_texts, index)

    return index
# 4. Run and Save
# NOTE(review): embed_wikipedia returns None when the stream yields nothing,
# in which case faiss.write_index below will fail — assumes a non-empty dataset.
vector_index = embed_wikipedia(ds, model, batch_size)
faiss.write_index(vector_index, save_path)  # persist the flat L2 index to disk
print(f"Index saved to {save_path}")