Spaces:

Neshaki
/

ARQ-RAG-Turboquant

Sleeping

neshaki091

Deploy TurboQuant Backend (Cleaned history & optimized for HF Spaces)

ba86059 12 days ago

3.48 kB

	import os
	import sqlite3
	import torch
	import numpy as np
	from tqdm import tqdm
	from transformers import AutoTokenizer, AutoModel

	def find_db_path():
	"""Tự động tìm file .db trong thư mục input của Kaggle"""
	for dirname, _, filenames in os.walk('/kaggle/input'):
	for filename in filenames:
	if filename.endswith('.db'):
	path = os.path.join(dirname, filename)
	print(f"[*] Found Database at: {path}")
	return path
	return None

	def embed_on_kaggle():
	# 1. Tự động tìm đường dẫn DB
	db_path = find_db_path()
	if not db_path:
	print("[ERROR] No .db file found in /kaggle/input. Please 'Add Data' to your notebook.")
	return

	output_dir = "/kaggle/working/raw_vectors"
	os.makedirs(output_dir, exist_ok=True)

	# 2. Cấu hình Model
	device = "cuda"
	model_name = "intfloat/multilingual-e5-base"
	print(f"[*] Loading model {model_name} on {device.upper()}...")

	tokenizer = AutoTokenizer.from_pretrained(model_name)
	model = AutoModel.from_pretrained(model_name).to(device)

	# Ép kiểu sang FP16 để tối ưu bộ nhớ
	model = model.half()
	model.eval()

	# 3. Kết nối Database ở chế độ Read-Only (Rất quan trọng trên Kaggle)
	# Sử dụng URI mode=ro để tránh lỗi 'unable to open database file'
	try:
	conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True)
	cursor = conn.cursor()
	cursor.execute("SELECT count(*) FROM chunks")
	total_chunks = cursor.fetchone()[0]
	print(f"[*] Total chunks to process: {total_chunks:,}")
	except Exception as e:
	print(f"[ERROR] Could not open database: {e}")
	return

	# 4. Thực hiện nhúng (Embedding)
	batch_size = 1024
	cursor.execute("SELECT text FROM chunks ORDER BY id ASC")

	pbar = tqdm(total=total_chunks)
	all_embeddings = []
	count = 0
	file_idx = 0

	while True:
	rows = cursor.fetchmany(batch_size)
	if not rows:
	break

	texts = [f"passage: {row[0]}" for row in rows]

	with torch.no_grad():
	inputs = tokenizer(texts, padding=True, truncation=True, max_length=256, return_tensors="pt").to(device)
	outputs = model(**inputs)

	mask = inputs.attention_mask.unsqueeze(-1).expand(outputs.last_hidden_state.size()).to(outputs.last_hidden_state.dtype)
	embeddings = torch.sum(outputs.last_hidden_state * mask, 1) / torch.clamp(mask.sum(1), min=1e-9)
	embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)

	all_embeddings.append(embeddings.cpu().numpy())

	count += len(rows)
	pbar.update(len(rows))

	# Lưu block mỗi 500k vector
	if count >= 500000:
	raw_file = os.path.join(output_dir, f"system_raw_{file_idx}.npy")
	np.save(raw_file, np.vstack(all_embeddings))
	print(f"\n[+] Saved block {file_idx}")
	all_embeddings = []
	count = 0
	file_idx += 1

	if all_embeddings:
	raw_file = os.path.join(output_dir, f"system_raw_{file_idx}.npy")
	np.save(raw_file, np.vstack(all_embeddings))

	print(f"\n[SUCCESS] Embedding Complete! Files saved in /kaggle/working/raw_vectors")
	conn.close()

	if __name__ == "__main__":
	embed_on_kaggle()