# ARQ-RAG-Turboquant / scripts /build_system_index.py
# neshaki091
# Deploy TurboQuant Backend (Cleaned history & optimized for HF Spaces)
# ba86059
import os
import sys
import torch
import numpy as np
import gc
# Put the backend root (the parent of this script's directory) at the front of
# sys.path so that the TQ_engine_lib package can be imported below.
_this_file = os.path.abspath(__file__)
backend_dir = os.path.dirname(os.path.dirname(_this_file))
sys.path.insert(0, backend_dir)
from TQ_engine_lib.quantizer import TQEngine
def _raw_block_order(filename):
    """Return the integer block number encoded in a raw block filename.

    The number is the text between the last underscore and the first dot
    that follows it (``system_raw_12.npy`` -> ``12``). Returns ``None`` when
    that text is not an integer, so the caller can skip the file instead of
    crashing while sorting.
    """
    token = filename.split("_")[-1].split(".")[0]
    try:
        return int(token)
    except ValueError:
        return None


def build_system_index():
    """Merge the raw ``system_raw_<n>.npy`` blocks and build the system index.

    Steps:
      1. Find ``system_raw_<n>.npy`` files under ``<backend_dir>/data/RAW``
         and order them by ``<n>``.
      2. Write all blocks, L2-normalised row by row, into one temporary
         float32 ``.npy`` memmap (``data/combined_raw_temp.npy``).
      3. Hand that memmap to ``TQEngine.index`` with
         ``save_path=data/tq_index_4bit_np4096_system``.
      4. Delete the temporary memmap file, whether or not indexing succeeded.

    Problems with the input (missing directory, no blocks, inconsistent
    shapes, zero vectors) are reported on stdout and the function returns
    without building anything. Exceptions raised while merging or indexing
    propagate to the caller after the temporary file has been cleaned up.

    Returns:
        None.
    """
    data_dir = os.path.join(backend_dir, "data")
    raw_dir = os.path.join(data_dir, "RAW")
    output_path = os.path.join(data_dir, "tq_index_4bit_np4096_system")

    if not os.path.exists(raw_dir):
        print(f"[ERROR] RAW directory not found at {raw_dir}")
        return

    # 1. Find the RAW block files, ordered by their numeric suffix.
    numbered = []
    for name in os.listdir(raw_dir):
        if not (name.startswith("system_raw_") and name.endswith(".npy")):
            continue
        order = _raw_block_order(name)
        if order is None:
            print(f"[WARN] Skipping {name}: no numeric block index in its name")
            continue
        numbered.append((order, name))
    raw_files = [name for _, name in sorted(numbered)]

    if not raw_files:
        print(f"[ERROR] No raw files found in {raw_dir}")
        return

    print(f"[*] Found {len(raw_files)} raw blocks. Creating a unified memmap...")

    # 2. Size the combined memmap. Blocks are opened memory-mapped here so
    #    that only their headers (shape) are needed, not their data.
    total_vectors = 0
    dim = None
    for name in raw_files:
        header = np.load(os.path.join(raw_dir, name), mmap_mode='r')
        if header.ndim != 2:
            print(f"[ERROR] {name} has shape {header.shape}; expected a 2-D array")
            return
        if dim is None:
            dim = header.shape[1]
        elif header.shape[1] != dim:
            print(f"[ERROR] {name} has dimension {header.shape[1]}, expected {dim}")
            return
        total_vectors += header.shape[0]
        del header

    if total_vectors == 0:
        print(f"[ERROR] Raw files in {raw_dir} contain no vectors")
        return

    combined_raw_path = os.path.join(data_dir, "combined_raw_temp.npy")
    from numpy.lib.format import open_memmap
    combined_mm = open_memmap(
        combined_raw_path, mode='w+', dtype=np.float32, shape=(total_vectors, dim)
    )

    try:
        offset = 0
        for name in raw_files:
            print(f" Merging {name} into combined memmap...")
            block = np.load(os.path.join(raw_dir, name))
            # Normalise while merging so no separate pass is needed later.
            # The epsilon keeps all-zero rows from dividing by zero.
            norms = np.linalg.norm(block, axis=1, keepdims=True) + 1e-10
            combined_mm[offset:offset + len(block)] = block / norms
            offset += len(block)
            # Drop the in-RAM copy before loading the next block.
            del block, norms
            gc.collect()
        combined_mm.flush()
        print(f"[+] Combined memmap created at: {combined_raw_path} ({total_vectors:,} vectors)")

        # 3. Run the engine indexing directly on the combined memmap.
        print("[*] Phase 2: Starting TurboQuant Indexing (4-bit, IVF 4096)...")
        engine = TQEngine(dim=dim, bits=4, use_ivf=True, ivf_nlist=4096)
        # The memmap is passed as-is; per the original author's note,
        # engine.index is expected to process it chunk by chunk so it does
        # not exhaust RAM (not verifiable from this file).
        engine.index(combined_mm, save_path=output_path)
    finally:
        # 4. Clean up. Release our reference to the memmap first: an open
        #    mapping can prevent the file from being deleted on some
        #    platforms.
        del combined_mm
        gc.collect()
        if os.path.exists(combined_raw_path):
            try:
                os.remove(combined_raw_path)
            except OSError:
                print(f"⚠️ Cảnh báo: Không thể xóa file tạm {combined_raw_path}, hãy xóa thủ công sau.")

    print("\n[SUCCESS] System Index built successfully!")
    print(f"Location: {output_path}")
# Build the index only when this file is executed as a script, not on import.
if __name__ == "__main__":
    build_system_index()