import os
import sqlite3
import torch
import numpy as np
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel

def find_db_path(input_root='/kaggle/input'):
    """Locate a SQLite ``.db`` file under a directory tree.

    Walks ``input_root`` top-down and returns the first file whose name ends
    with ``.db``. Directories and file names are visited in sorted order so
    the same file is selected on every run when several candidates exist.

    Args:
        input_root: Directory to search. Defaults to the Kaggle input mount,
            ``/kaggle/input``.

    Returns:
        The path of the first matching file, or ``None`` when no ``.db`` file
        is found (including when ``input_root`` does not exist).
    """
    for dirname, dirnames, filenames in os.walk(input_root):
        # os.walk yields entries in arbitrary filesystem order; sorting the
        # sub-directory list in place also fixes the order of descent.
        dirnames.sort()
        for filename in sorted(filenames):
            if filename.endswith('.db'):
                path = os.path.join(dirname, filename)
                print(f"[*] Found Database at: {path}")
                return path
    return None

def _mean_pool_normalize(last_hidden_state, attention_mask):
    """Mean-pool token states over unmasked positions, then L2-normalize."""
    # Pool in float32: when the model runs in FP16 the 1e-9 floor below cannot
    # be represented in half precision, and summing over the sequence axis
    # loses precision.
    hidden = last_hidden_state.float()
    mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
    summed = (hidden * mask).sum(dim=1)
    token_counts = torch.clamp(mask.sum(dim=1), min=1e-9)
    return torch.nn.functional.normalize(summed / token_counts, p=2, dim=1)


def _save_block(output_dir, file_idx, batches):
    """Stack per-batch arrays and write them to ``system_raw_<file_idx>.npy``."""
    raw_file = os.path.join(output_dir, f"system_raw_{file_idx}.npy")
    np.save(raw_file, np.vstack(batches))


def embed_on_kaggle():
    """Embed every row of the ``chunks`` table and save the vectors as .npy blocks.

    Steps:
      1. Locate the database with ``find_db_path()``.
      2. Load ``intfloat/multilingual-e5-base`` (FP16 on CUDA; float32 on CPU
         when no GPU is available).
      3. Open the database read-only and stream ``chunks.text`` ordered by
         ``id`` in batches of 1024 rows, prefixing each text with
         ``"passage: "``.
      4. Mean-pool and L2-normalize the token states, storing the result as
         float16.
      5. Write ``system_raw_<n>.npy`` files under
         ``/kaggle/working/raw_vectors``. A block is flushed once at least
         500,000 rows have accumulated; the check runs per batch, so a block
         can slightly exceed that size. Any remainder is written as a final
         block.

    Returns:
        None. Problems locating or opening the database are reported with a
        printed ``[ERROR]`` message and an early return.
    """
    from urllib.parse import quote

    # 1. Locate the database automatically.
    db_path = find_db_path()
    if not db_path:
        print("[ERROR] No .db file found in /kaggle/input. Please 'Add Data' to your notebook.")
        return

    output_dir = "/kaggle/working/raw_vectors"
    os.makedirs(output_dir, exist_ok=True)

    # 2. Model configuration. Fall back to CPU instead of crashing when the
    # notebook has no GPU attached.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_name = "intfloat/multilingual-e5-base"
    print(f"[*] Loading model {model_name} on {device.upper()}...")

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name).to(device)

    # FP16 halves GPU memory use; on CPU keep float32, where half precision
    # is slow or unsupported for some ops.
    if device == "cuda":
        model = model.half()
    model.eval()

    # 3. Open the database read-only (the Kaggle input mount is not writable).
    # The path is percent-encoded so characters such as '?', '#' or '%' are
    # not misread as URI syntax.
    try:
        conn = sqlite3.connect(f"file:{quote(db_path)}?mode=ro", uri=True)
    except sqlite3.Error as e:
        print(f"[ERROR] Could not open database: {e}")
        return

    try:
        cursor = conn.cursor()
        try:
            cursor.execute("SELECT count(*) FROM chunks")
            total_chunks = cursor.fetchone()[0]
        except sqlite3.Error as e:
            print(f"[ERROR] Could not open database: {e}")
            return
        print(f"[*] Total chunks to process: {total_chunks:,}")

        # 4. Embedding loop.
        batch_size = 1024
        block_size = 500000
        cursor.execute("SELECT text FROM chunks ORDER BY id ASC")

        block_batches = []
        block_count = 0
        file_idx = 0

        with tqdm(total=total_chunks) as pbar, torch.no_grad():
            # fetchmany returns an empty list once the cursor is exhausted.
            for rows in iter(lambda: cursor.fetchmany(batch_size), []):
                texts = [f"passage: {row[0]}" for row in rows]

                inputs = tokenizer(
                    texts,
                    padding=True,
                    truncation=True,
                    max_length=256,
                    return_tensors="pt",
                ).to(device)
                outputs = model(**inputs)

                embeddings = _mean_pool_normalize(
                    outputs.last_hidden_state, inputs.attention_mask
                )
                # Store as float16 so the saved files keep the dtype the FP16
                # pipeline has always produced.
                block_batches.append(embeddings.to(torch.float16).cpu().numpy())

                block_count += len(rows)
                pbar.update(len(rows))

                # Flush a block roughly every 500k vectors to bound host memory.
                if block_count >= block_size:
                    _save_block(output_dir, file_idx, block_batches)
                    print(f"\n[+] Saved block {file_idx}")
                    block_batches = []
                    block_count = 0
                    file_idx += 1

        # Write whatever is left after the last full block.
        if block_batches:
            _save_block(output_dir, file_idx, block_batches)

        print(f"\n[SUCCESS] Embedding Complete! Files saved in {output_dir}")
    finally:
        # Always release the connection, including when embedding fails midway.
        conn.close()

# Run the embedding job only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    embed_on_kaggle()