Spaces:
Running
Running
| import os | |
| import sys | |
| import torch | |
| import numpy as np | |
| import gc | |
| # Thêm đường dẫn backend vào sys.path để load TQ_engine_lib | |
| backend_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) | |
| sys.path.insert(0, backend_dir) | |
| from TQ_engine_lib.quantizer import TQEngine | |
| def build_system_index(): | |
| data_dir = os.path.join(backend_dir, "data") | |
| raw_dir = os.path.join(data_dir, "RAW") | |
| output_path = os.path.join(data_dir, "tq_index_4bit_np4096_system") | |
| if not os.path.exists(raw_dir): | |
| print(f"[ERROR] RAW directory not found at {raw_dir}") | |
| return | |
| # 1. Tìm các file RAW | |
| raw_files = sorted([f for f in os.listdir(raw_dir) if f.startswith("system_raw_") and f.endswith(".npy")], | |
| key=lambda x: int(x.split("_")[-1].split(".")[0])) | |
| if not raw_files: | |
| print(f"[ERROR] No raw files found in {raw_dir}") | |
| return | |
| print(f"[*] Found {len(raw_files)} raw blocks. Creating a unified memmap...") | |
| # 2. Tạo một file memmap tổng hợp để engine.index xử lý hiệu quả | |
| total_vectors = 0 | |
| dim = 0 | |
| for f in raw_files: | |
| temp = np.load(os.path.join(raw_dir, f), mmap_mode='r') | |
| total_vectors += temp.shape[0] | |
| dim = temp.shape[1] | |
| combined_raw_path = os.path.join(data_dir, "combined_raw_temp.npy") | |
| from numpy.lib.format import open_memmap | |
| combined_mm = open_memmap(combined_raw_path, mode='w+', dtype=np.float32, shape=(total_vectors, dim)) | |
| curr = 0 | |
| for f in raw_files: | |
| print(f" Merging {f} into combined memmap...") | |
| block = np.load(os.path.join(raw_dir, f)) | |
| # Chuẩn hóa ngay khi merge để tiết kiệm bước sau | |
| norms = np.linalg.norm(block, axis=1, keepdims=True) + 1e-10 | |
| combined_mm[curr:curr+len(block)] = block / norms | |
| curr += len(block) | |
| del block | |
| gc.collect() | |
| combined_mm.flush() | |
| print(f"[+] Combined memmap created at: {combined_raw_path} ({total_vectors:,} vectors)") | |
| # 3. Chạy Engine Index (Sử dụng chính file combined_mm) | |
| print(f"[*] Phase 2: Starting TurboQuant Indexing (4-bit, IVF 4096)...") | |
| engine = TQEngine(dim=dim, bits=4, use_ivf=True, ivf_nlist=4096) | |
| # Truyền memmap vào, engine.index sẽ xử lý theo từng chunk để không tràn RAM | |
| engine.index(combined_mm, save_path=output_path) | |
| # 4. Dọn dẹp | |
| del combined_mm | |
| gc.collect() | |
| if os.path.exists(combined_raw_path): | |
| try: | |
| os.remove(combined_raw_path) | |
| except: | |
| print(f"⚠️ Cảnh báo: Không thể xóa file tạm {combined_raw_path}, hãy xóa thủ công sau.") | |
| print("\n[SUCCESS] System Index built successfully!") | |
| print(f"Location: {output_path}") | |
| if __name__ == "__main__": | |
| build_system_index() | |