import os import sqlite3 import torch import numpy as np from tqdm import tqdm from transformers import AutoTokenizer, AutoModel def find_db_path(): """Tự động tìm file .db trong thư mục input của Kaggle""" for dirname, _, filenames in os.walk('/kaggle/input'): for filename in filenames: if filename.endswith('.db'): path = os.path.join(dirname, filename) print(f"[*] Found Database at: {path}") return path return None def embed_on_kaggle(): # 1. Tự động tìm đường dẫn DB db_path = find_db_path() if not db_path: print("[ERROR] No .db file found in /kaggle/input. Please 'Add Data' to your notebook.") return output_dir = "/kaggle/working/raw_vectors" os.makedirs(output_dir, exist_ok=True) # 2. Cấu hình Model device = "cuda" model_name = "intfloat/multilingual-e5-base" print(f"[*] Loading model {model_name} on {device.upper()}...") tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModel.from_pretrained(model_name).to(device) # Ép kiểu sang FP16 để tối ưu bộ nhớ model = model.half() model.eval() # 3. Kết nối Database ở chế độ Read-Only (Rất quan trọng trên Kaggle) # Sử dụng URI mode=ro để tránh lỗi 'unable to open database file' try: conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True) cursor = conn.cursor() cursor.execute("SELECT count(*) FROM chunks") total_chunks = cursor.fetchone()[0] print(f"[*] Total chunks to process: {total_chunks:,}") except Exception as e: print(f"[ERROR] Could not open database: {e}") return # 4. Thực hiện nhúng (Embedding) batch_size = 1024 cursor.execute("SELECT text FROM chunks ORDER BY id ASC") pbar = tqdm(total=total_chunks) all_embeddings = [] count = 0 file_idx = 0 while True: rows = cursor.fetchmany(batch_size) if not rows: break texts = [f"passage: {row[0]}" for row in rows] with torch.no_grad(): inputs = tokenizer(texts, padding=True, truncation=True, max_length=256, return_tensors="pt").to(device) outputs = model(**inputs) mask = inputs.attention_mask.unsqueeze(-1).expand(outputs.last_hidden_state.size()).to(outputs.last_hidden_state.dtype) embeddings = torch.sum(outputs.last_hidden_state * mask, 1) / torch.clamp(mask.sum(1), min=1e-9) embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1) all_embeddings.append(embeddings.cpu().numpy()) count += len(rows) pbar.update(len(rows)) # Lưu block mỗi 500k vector if count >= 500000: raw_file = os.path.join(output_dir, f"system_raw_{file_idx}.npy") np.save(raw_file, np.vstack(all_embeddings)) print(f"\n[+] Saved block {file_idx}") all_embeddings = [] count = 0 file_idx += 1 if all_embeddings: raw_file = os.path.join(output_dir, f"system_raw_{file_idx}.npy") np.save(raw_file, np.vstack(all_embeddings)) print(f"\n[SUCCESS] Embedding Complete! Files saved in /kaggle/working/raw_vectors") conn.close() if __name__ == "__main__": embed_on_kaggle()