add test
Browse files- exp.txt +5 -3
- test_res_id_to_vec.py +146 -0
exp.txt
CHANGED
|
@@ -1,7 +1,9 @@
|
|
|
|
|
| 1 |
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
|
|
|
| 5 |
|
| 6 |
13G+内存
|
| 7 |
压缩数据12G
|
|
|
|
| 1 |
+
./apps/build_disk_index --data_type int8 --dist_fn l2 --data_path data/bigann_learn.bin --index_path_prefix data/disk_sift_bigann_learn_R64_L200 -R 64 -L200 -B 100.0 -M 30 --PQ_disk_bytes 16 --build_PQ_bytes 32
|
| 2 |
|
| 3 |
+
|
| 4 |
+
./apps/search_disk_index --data_type uint8 --dist_fn l2 --index_path_prefix data/disk_sift_bigann_learn_R64_L200 --query_file data/bigann_query.bin --gt_file data/bigann_query_learn_gt100 -K 10 -L 20 40 60 80 --num_nodes_to_cache 500000 --result_path data/res
|
| 5 |
+
./apps/search_disk_index --data_type uint8 --dist_fn l2 --index_path_prefix data_backup/disk_sift_bigann_learn_R64_L200 --query_file data_backup/bigann_query.bin --gt_file data_backup/bigann_query_learn_gt100 -K 10 -L 20 40 60 80 --num_nodes_to_cache 500000 --result_path data_backup/res
|
| 6 |
+
./apps/search_disk_index --data_type uint8 --dist_fn l2 --index_path_prefix data_backup_clean_test/disk_sift_bigann_learn_R64_L200 --query_file data_backup_clean_test/bigann_query.bin --gt_file data_backup_clean_test/bigann_query_learn_gt100 -K 10 -L 20 40 60 80 --num_nodes_to_cache 500000 --result_path data_backup_clean_test/res
|
| 7 |
|
| 8 |
13G+内存
|
| 9 |
压缩数据12G
|
test_res_id_to_vec.py
ADDED
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
def load_aligned_bin(file_path, dtype=np.int8, alignment=8):
    """Load a DiskANN-style binary vector file into a zero-padded array.

    The layout read here is two int32 values (number of vectors, then
    dimension) followed by ``npts * dim`` values of ``dtype`` stored row by
    row, e.g. ``bigann_query.bin``.

    Args:
        file_path (str): Path of the binary file.
        dtype (np.dtype): Element type of the stored vectors. Defaults to
            ``np.int8``.
        alignment (int): Every row is padded with zeros up to the next
            multiple of this value. Defaults to 8. Dataset files are read
            with alignment; search-result files are saved without any
            alignment requirement, so they do not need this loader.

    Returns:
        tuple: ``(data, npts, dim, rounded_dim)`` where ``data`` is an
        ``(npts, rounded_dim)`` array of ``dtype`` whose first ``dim``
        columns hold the vectors and whose remaining columns are zero.

    Raises:
        ValueError: If the header is incomplete or negative, if the file
            size does not match the header, or if the payload is truncated.
    """
    dtype = np.dtype(dtype)
    header_bytes = 2 * np.dtype(np.int32).itemsize
    actual_file_size = os.path.getsize(file_path)

    with open(file_path, 'rb') as f:
        # Header: number of vectors followed by the vector dimension.
        header = np.fromfile(f, dtype=np.int32, count=2)
        if header.size != 2:
            raise ValueError(
                f"incomplete header in {file_path!r}: expected 2 int32 values"
            )
        # Plain Python ints so the size arithmetic below cannot overflow.
        npts, dim = int(header[0]), int(header[1])
        if npts < 0 or dim < 0:
            raise ValueError(
                f"invalid header in {file_path!r}: npts={npts}, dim={dim}"
            )

        # Reject files whose size disagrees with the header before allocating.
        expected_file_size = header_bytes + npts * dim * dtype.itemsize
        if actual_file_size != expected_file_size:
            raise ValueError(f"文件大小不匹配。实际大小: {actual_file_size} 字节,"
                             f"预期大小: {expected_file_size} 字节")

        # Round the dimension up to the next multiple of `alignment`.
        rounded_dim = ((dim + alignment - 1) // alignment) * alignment

        print(f"元数据: #向量 = {npts}, #维度 = {dim}, 对齐维度 = {rounded_dim}")
        print(f"分配内存: {npts * rounded_dim * dtype.itemsize} 字节")

        # Zero-initialised so the padding columns stay zero.
        data = np.zeros((npts, rounded_dim), dtype=dtype)

        if dim > 0:
            # Read many rows per call instead of one row per call: the
            # per-row Python loop is prohibitively slow for ~1e8 vectors,
            # while reading everything at once would double peak memory.
            chunk_bytes = 64 * 1024 * 1024
            rows_per_chunk = max(1, chunk_bytes // (dim * dtype.itemsize))
            for start in range(0, npts, rows_per_chunk):
                stop = min(start + rows_per_chunk, npts)
                wanted = (stop - start) * dim
                chunk = np.fromfile(f, dtype=dtype, count=wanted)
                if chunk.size != wanted:
                    raise ValueError(
                        f"truncated data in {file_path!r}: rows {start}-{stop}"
                    )
                data[start:stop, :dim] = chunk.reshape(stop - start, dim)

    print("数据读取完成")

    return data, npts, dim, rounded_dim
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def load_bin(filename, dtype=np.float32):
    """Load a binary matrix file: an int32 header followed by the payload.

    The layout read here is two int32 values (``npts`` then ``ndims``)
    followed by ``npts * ndims`` values of ``dtype`` stored row by row, with
    no padding. This is the layout of the search-result files read in this
    script.

    Args:
        filename (str): Path of the binary file.
        dtype (np.dtype): Element type of the payload; choose according to
            the file content, e.g. ``np.uint32`` for ids or ``np.float32``
            for distances. Defaults to ``np.float32``.

    Returns:
        tuple: ``(data, npts, ndims)`` where ``data`` is an
        ``(npts, ndims)`` array of ``dtype`` and the counts are Python ints.

    Raises:
        ValueError: If the header is incomplete or negative, or if the file
            holds fewer values than the header announces.
    """
    with open(filename, 'rb') as f:
        # Header: number of rows followed by the number of columns.
        header = np.fromfile(f, dtype=np.int32, count=2)
        if header.size != 2:
            raise ValueError(
                f"incomplete header in {filename!r}: expected 2 int32 values"
            )
        # Convert to Python ints: multiplying two np.int32 scalars can wrap
        # around for large files and produce a bogus element count.
        npts, ndims = int(header[0]), int(header[1])
        if npts < 0 or ndims < 0:
            raise ValueError(
                f"invalid header in {filename!r}: npts={npts}, ndims={ndims}"
            )

        print(f"读取元数据: #向量 = {npts}, #维度 = {ndims}")

        total = npts * ndims
        if total:
            data = np.fromfile(f, dtype=dtype, count=total)
        else:
            data = np.empty(0, dtype=dtype)
        if data.size != total:
            raise ValueError(
                f"truncated data in {filename!r}: expected {total} values, "
                f"got {data.size}"
            )
        data = data.reshape(npts, ndims)

    print(f"成功读取 {data.shape[0]} 个向量,每个向量维度为 {data.shape[1]}")
    print(f"数据类型: {data.dtype}")

    return data, npts, ndims
|
| 84 |
+
|
| 85 |
+
# 使用示例
|
| 86 |
+
# Usage example: load DiskANN search results and check that every returned
# id maps to a base vector whose squared L2 distance to the query equals the
# distance stored in the result file.
if __name__ == "__main__":
    import sys

    # Directory holding the result files and the datasets. Pass another
    # directory as the first command-line argument to override the default.
    default_data_dir = (
        "/home/myw/wuchangli/yk/diskann_demo/DiskANN/build/"
        "data_backup_clean_test"
    )
    data_dir = sys.argv[1] if len(sys.argv) > 1 else default_data_dir

    # Neighbour ids returned by the search.
    idx_filename = os.path.join(data_dir, "res_20_idx_uint32.bin")
    idx_data, npts, ndims = load_bin(idx_filename, dtype=np.uint32)

    # Distances returned by the search.
    dist_filename = os.path.join(data_dir, "res_20_dists_float.bin")
    dist_data, _, _ = load_bin(dist_filename, dtype=np.float32)

    # Query vectors.
    file_path = os.path.join(data_dir, "bigann_query.bin")
    query_data, query_npts, query_dim, query_rounded_dim = load_aligned_bin(file_path)

    # Show the results of the first five queries.
    num_shown = min(5, npts)
    print("\n示例结果:")
    for i in range(num_shown):
        print(f"查询 {i}:")
        print(f"  最近邻索引: {idx_data[i, :]}")
        print(f"  最近邻距离: {dist_data[i, :]}")

    print('load learn vectors')
    base_path = os.path.join(data_dir, "bigann_learn.bin")
    base_vectors, base_npts, base_dim, base_rounded_dim = load_aligned_bin(base_path)

    # Recompute the squared L2 distances for the displayed queries instead
    # of stopping in the debugger and checking them by hand.
    mismatches = 0
    for i in range(num_shown):
        neighbours = base_vectors[idx_data[i]].astype(np.float32)
        query = query_data[i].astype(np.float32)
        recomputed = np.sum((neighbours - query) ** 2, axis=1)
        if not np.allclose(recomputed, dist_data[i]):
            mismatches += 1
            print(f"query {i}: distance mismatch")
            print(f"  stored:     {dist_data[i]}")
            print(f"  recomputed: {recomputed}")

    if mismatches:
        raise SystemExit(f"{mismatches} of {num_shown} checked queries mismatch")
    print(f"all {num_shown} checked queries match the stored distances")
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
# 查询 0:
|
| 115 |
+
# 最近邻索引: [98840598 23883997 96907701 96907734 96908621 3762989 53147362 88013120
|
| 116 |
+
# 53147127 96908488]
|
| 117 |
+
# 最近邻距离: [68103. 68524. 70149. 71367. 71500. 71746. 71859. 73190. 73243. 73430.]
|
| 118 |
+
# 查询 1:
|
| 119 |
+
# 最近邻索引: [26755808 34080910 3236088 15610118 66524798 72823982 27594517 37556745
|
| 120 |
+
# 26786213 39265851]
|
| 121 |
+
# 最近邻距离: [28587. 30958. 31851. 32047. 32234. 33895. 34205. 34521. 34699. 34823.]
|
| 122 |
+
# 查询 2:
|
| 123 |
+
# 最近邻索引: [71015134 21282510 71010508 6626649 96440577 38613912 71106199 76697998
|
| 124 |
+
# 53211679 52141940]
|
| 125 |
+
# 最近邻距离: [36378. 41223. 41241. 41245. 41867. 42108. 42120. 42183. 42383. 42662.]
|
| 126 |
+
# 查询 3:
|
| 127 |
+
# 最近邻索引: [21759610 34536818 21549958 84252617 2083654 30364765 15796826 11393460
|
| 128 |
+
# 52313741 10230325]
|
| 129 |
+
# 最近邻距离: [2474. 2684. 2701. 2725. 2768. 2869. 2918. 2937. 2967. 2996.]
|
| 130 |
+
# 查询 4:
|
| 131 |
+
# 最近邻索引: [44759921 57423326 30803610 85912171 93322848 38777531 64765128 50769267
|
| 132 |
+
# 33429914 98348143]
|
| 133 |
+
# 最近邻距离: [48352. 64203. 65424. 66350. 69264. 69772. 70490. 70679. 71253. 72685.]
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
# (Pdb) np.sum((base_vectors[98840598].astype('float32') - query_data[0].astype('float32'))**2)
|
| 137 |
+
# 68103.0
|
| 138 |
+
# (Pdb) np.sum((base_vectors[23883997].astype('float32') - query_data[0].astype('float32'))**2)
|
| 139 |
+
# 68524.0
|
| 140 |
+
# (Pdb) np.sum((base_vectors[96907701].astype('float32') - query_data[0].astype('float32'))**2)
|
| 141 |
+
# 70149.0
|
| 142 |
+
|
| 143 |
+
# (Pdb) np.sum((base_vectors[26755808].astype('float32') - query_data[1].astype('float32'))**2)
|
| 144 |
+
# 28587.0
|
| 145 |
+
# (Pdb) np.sum((base_vectors[44759921].astype('float32') - query_data[4].astype('float32'))**2)
|
| 146 |
+
# 48352.0
|