cocoshe committed on
Commit
f4aa3fa
·
1 Parent(s): 103f07d
Files changed (2) hide show
  1. exp.txt +5 -3
  2. test_res_id_to_vec.py +146 -0
exp.txt CHANGED
@@ -1,7 +1,9 @@
 
1
 
2
- ./apps/search_disk_index --data_type uint8 --dist_fn l2 --index_path_prefix data/disk_sift_bigann_learn_R64_L200 --query_file data/bigann_query.bin --gt_file data/bigann_query_learn_gt100 -K 10 -L 20 40 60 80 --num_nodes_to_cache 500000 --result_path data/res
3
- ./apps/search_disk_index --data_type uint8 --dist_fn l2 --index_path_prefix data_backup/disk_sift_bigann_learn_R64_L200 --query_file data_backup/bigann_query.bin --gt_file data_backup/bigann_query_learn_gt100 -K 10 -L 20 40 60 80 --num_nodes_to_cache 500000 --result_path data_backup/res
4
- ./apps/search_disk_index --data_type uint8 --dist_fn l2 --index_path_prefix data_backup_clean_test/disk_sift_bigann_learn_R64_L200 --query_file data_backup_clean_test/bigann_query.bin --gt_file data_backup_clean_test/bigann_query_learn_gt100 -K 10 -L 20 40 60 80 --num_nodes_to_cache 500000 --result_path data_backup_clean_test/res
 
5
 
6
  13G+内存
7
  压缩数据12G
 
1
+ ./apps/build_disk_index --data_type int8 --dist_fn l2 --data_path data/bigann_learn.bin --index_path_prefix data/disk_sift_bigann_learn_R64_L200 -R 64 -L200 -B 100.0 -M 30 --PQ_disk_bytes 16 --build_PQ_bytes 32
2
 
3
+
4
+ ./apps/search_disk_index --data_type uint8 --dist_fn l2 --index_path_prefix data/disk_sift_bigann_learn_R64_L200 --query_file data/bigann_query.bin --gt_file data/bigann_query_learn_gt100 -K 10 -L 20 40 60 80 --num_nodes_to_cache 500000 --result_path data/res
5
+ ./apps/search_disk_index --data_type uint8 --dist_fn l2 --index_path_prefix data_backup/disk_sift_bigann_learn_R64_L200 --query_file data_backup/bigann_query.bin --gt_file data_backup/bigann_query_learn_gt100 -K 10 -L 20 40 60 80 --num_nodes_to_cache 500000 --result_path data_backup/res
6
+ ./apps/search_disk_index --data_type uint8 --dist_fn l2 --index_path_prefix data_backup_clean_test/disk_sift_bigann_learn_R64_L200 --query_file data_backup_clean_test/bigann_query.bin --gt_file data_backup_clean_test/bigann_query_learn_gt100 -K 10 -L 20 40 60 80 --num_nodes_to_cache 500000 --result_path data_backup_clean_test/res
7
 
8
  13G+内存
9
  压缩数据12G
test_res_id_to_vec.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+
4
def load_aligned_bin(file_path, dtype=np.int8, alignment=8):
    """
    Load a DiskANN-style binary vector file and zero-pad every row.

    The layout read here is two int32 values (number of vectors, then the
    dimension) followed by ``npts * dim`` elements of ``dtype`` stored row
    by row, e.g. ``bigann_query.bin``.

    Args:
        file_path (str): Path to the binary file.
        dtype (np.dtype): Element type of the stored vectors. Defaults to
            ``np.int8``.
        alignment (int): The row width is rounded up to a multiple of this
            value. Defaults to 8. Per the original author's note, dataset
            files are read aligned, whereas saved result files carry no
            alignment requirement and do not need this padding.

    Returns:
        tuple: ``(data, npts, dim, rounded_dim)`` where ``data`` is an
        ``(npts, rounded_dim)`` array of ``dtype`` whose columns past
        ``dim`` are zero, ``npts`` is the number of vectors, ``dim`` the
        stored dimension and ``rounded_dim`` the padded dimension.

    Raises:
        ValueError: If ``alignment`` is not positive, the header is
            incomplete or negative, or the file size does not equal
            ``8 + npts * dim * itemsize``.
    """
    if alignment < 1:
        # Guards the rounding below against division by zero.
        raise ValueError(f"alignment 必须为正整数,实际为 {alignment}")

    item_size = np.dtype(dtype).itemsize
    actual_file_size = os.path.getsize(file_path)

    with open(file_path, 'rb') as f:
        # Header: vector count followed by dimension, both int32.
        header = np.fromfile(f, dtype=np.int32, count=2)
        if header.size != 2:
            raise ValueError("文件头不完整,无法读取向量数量和维度")
        # Convert to Python ints so the size arithmetic cannot overflow int32.
        npts, dim = int(header[0]), int(header[1])
        if npts < 0 or dim < 0:
            raise ValueError(f"文件头无效: #向量 = {npts}, #维度 = {dim}")

        expected_file_size = 2 * 4 + npts * dim * item_size
        if actual_file_size != expected_file_size:
            raise ValueError(f"文件大小不匹配。实际大小: {actual_file_size} 字节,"
                             f"预期大小: {expected_file_size} 字节")

        # Round the dimension up to the next multiple of `alignment`.
        rounded_dim = ((dim + alignment - 1) // alignment) * alignment

        print(f"元数据: #向量 = {npts}, #维度 = {dim}, 对齐维度 = {rounded_dim}")
        print(f"分配内存: {npts * rounded_dim * np.dtype(dtype).itemsize} 字节")

        # Zero-initialised so the padding columns stay zero.
        data = np.zeros((npts, rounded_dim), dtype=dtype)

        if npts and dim:
            # Read many rows per call instead of one: a per-row Python loop
            # is very slow for large files, while reading everything at once
            # would hold a second full copy in memory. ~64 MiB per chunk
            # bounds the temporary buffer.
            rows_per_chunk = max(1, (64 << 20) // (dim * item_size))
            for start in range(0, npts, rows_per_chunk):
                rows = min(rows_per_chunk, npts - start)
                chunk = np.fromfile(f, dtype=dtype, count=rows * dim)
                # A short read makes reshape raise ValueError.
                data[start:start + rows, :dim] = chunk.reshape(rows, dim)

    print("数据读取完成")

    return data, npts, dim, rounded_dim
52
+
53
+
54
+
55
def load_bin(filename, dtype=np.float32):
    """
    Load a binary file written in the ``save_bin`` layout.

    The layout read here is two int32 values (``npts`` then ``ndims``)
    followed by ``npts * ndims`` elements of ``dtype``.

    Args:
        filename (str): Path to the binary file.
        dtype (np.dtype): Element type of the payload; choose it to match
            the file contents (the original note suggests ``np.uint32`` or
            ``np.float32``). Defaults to ``np.float32``.

    Returns:
        tuple: ``(data, npts, ndims)`` where ``data`` is an
        ``(npts, ndims)`` array of ``dtype`` and ``npts`` / ``ndims`` are
        Python ints.

    Raises:
        ValueError: If the header is incomplete or negative, or the payload
            holds fewer than ``npts * ndims`` elements.
    """
    with open(filename, 'rb') as f:
        # Header: the two leading int32 values (npts and ndims).
        header = np.fromfile(f, dtype=np.int32, count=2)
        if header.size != 2:
            raise ValueError("文件头不完整,无法读取向量数量和维度")
        # Convert to Python ints: multiplying two np.int32 scalars can
        # overflow and would hand np.fromfile a wrong element count.
        npts, ndims = int(header[0]), int(header[1])
        if npts < 0 or ndims < 0:
            raise ValueError(f"文件头无效: #向量 = {npts}, #维度 = {ndims}")

        print(f"读取元数据: #向量 = {npts}, #维度 = {ndims}")

        expected_count = npts * ndims
        data = np.fromfile(f, dtype=dtype, count=expected_count)

    # Report truncation explicitly instead of as an opaque reshape error.
    if data.size != expected_count:
        raise ValueError(f"数据不完整。实际元素数: {data.size},"
                         f"预期元素数: {expected_count}")

    # View the flat payload as one row per vector.
    data = data.reshape(npts, ndims)

    print(f"成功读取 {data.shape[0]} 个向量,每个向量维度为 {data.shape[1]}")
    print(f"数据类型: {data.dtype}")

    return data, npts, ndims
84
+
85
+ # 使用示例
86
if __name__ == "__main__":
    # Manual check script: loads the search result files and the query /
    # learn vector files, prints the first few results, then stops in pdb so
    # the vectors can be inspected interactively. The variable names below
    # are the ones typed at the pdb prompt, so keep them stable.

    # Load the result file holding the neighbour IDs.
    idx_filename = "/home/myw/wuchangli/yk/diskann_demo/DiskANN/build/data_backup_clean_test/res_20_idx_uint32.bin"  # replace with the actual file path
    idx_data, npts, ndims = load_bin(idx_filename, dtype=np.uint32)

    # Load the result file holding the neighbour distances.
    dist_filename = "/home/myw/wuchangli/yk/diskann_demo/DiskANN/build/data_backup_clean_test/res_20_dists_float.bin"  # replace with the actual file path
    dist_data, _, _ = load_bin(dist_filename, dtype=np.float32)

    # Load the query vectors with load_aligned_bin's default dtype (np.int8).
    # NOTE(review): the search commands in exp.txt pass --data_type uint8 —
    # confirm that int8 is the intended interpretation here.
    file_path = "/home/myw/wuchangli/yk/diskann_demo/DiskANN/build/data_backup_clean_test/bigann_query.bin"
    query_data, query_npts, query_dim, query_rounded_dim = load_aligned_bin(file_path)

    # Print the results of the first 5 queries (fewer if npts < 5).
    print("\n示例结果:")
    for i in range(min(5, npts)):
        print(f"查询 {i}:")
        print(f"  最近邻索引: {idx_data[i, :]}")
        print(f"  最近邻距离: {dist_data[i, :]}")

    # Breakpoint: inspect the results before the learn vectors are loaded.
    import pdb; pdb.set_trace()

    print('load learn vectors')
    # Load the learn (base) vectors, also with the default dtype (np.int8).
    base_path = "/home/myw/wuchangli/yk/diskann_demo/DiskANN/build/data_backup_clean_test/bigann_learn.bin"
    base_vectors, base_npts, base_dim, base_rounded_dim = load_aligned_bin(base_path)
    # Breakpoint: base_vectors and query_data are now available for manually
    # recomputing distances at the prompt.
    import pdb; pdb.set_trace()
111
+
112
+
113
+
114
+ # 查询 0:
115
+ # 最近邻索引: [98840598 23883997 96907701 96907734 96908621 3762989 53147362 88013120
116
+ # 53147127 96908488]
117
+ # 最近邻距离: [68103. 68524. 70149. 71367. 71500. 71746. 71859. 73190. 73243. 73430.]
118
+ # 查询 1:
119
+ # 最近邻索引: [26755808 34080910 3236088 15610118 66524798 72823982 27594517 37556745
120
+ # 26786213 39265851]
121
+ # 最近邻距离: [28587. 30958. 31851. 32047. 32234. 33895. 34205. 34521. 34699. 34823.]
122
+ # 查询 2:
123
+ # 最近邻索引: [71015134 21282510 71010508 6626649 96440577 38613912 71106199 76697998
124
+ # 53211679 52141940]
125
+ # 最近邻距离: [36378. 41223. 41241. 41245. 41867. 42108. 42120. 42183. 42383. 42662.]
126
+ # 查询 3:
127
+ # 最近邻索引: [21759610 34536818 21549958 84252617 2083654 30364765 15796826 11393460
128
+ # 52313741 10230325]
129
+ # 最近邻距离: [2474. 2684. 2701. 2725. 2768. 2869. 2918. 2937. 2967. 2996.]
130
+ # 查询 4:
131
+ # 最近邻索引: [44759921 57423326 30803610 85912171 93322848 38777531 64765128 50769267
132
+ # 33429914 98348143]
133
+ # 最近邻距离: [48352. 64203. 65424. 66350. 69264. 69772. 70490. 70679. 71253. 72685.]
134
+
135
+
136
+ # (Pdb) np.sum((base_vectors[98840598].astype('float32') - query_data[0].astype('float32'))**2)
137
+ # 68103.0
138
+ # (Pdb) np.sum((base_vectors[23883997].astype('float32') - query_data[0].astype('float32'))**2)
139
+ # 68524.0
140
+ # (Pdb) np.sum((base_vectors[96907701].astype('float32') - query_data[0].astype('float32'))**2)
141
+ # 70149.0
142
+
143
+ # (Pdb) np.sum((base_vectors[26755808].astype('float32') - query_data[1].astype('float32'))**2)
144
+ # 28587.0
145
+ # (Pdb) np.sum((base_vectors[44759921].astype('float32') - query_data[4].astype('float32'))**2)
146
+ # 48352.0