# ndjadjafbagk / filter_active_voxels.py
# udbbdh's picture
# Upload folder using huggingface_hub
# 7340df2 verified
import os
import numpy as np
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
# === Configuration ===
# Directory that is scanned (non-recursively) for cached ``.npz`` files.
cache_dir = "/gemini/user/private/zhaotianhao/dataset_cache/MERGED_DATASET_count_200_2000_100000_128to1024_819200"
# 1. Filter thresholds for edge voxels (resolution 512).
#    Both bounds are inclusive.
target_res_edge = 512
min_edge_voxels = 2000
max_edge_voxels = 75000
# 2. Filter thresholds for active voxels.
#    NOTE: the original comment said resolution 64, but the value configured
#    here is 128; the value below is what the filter actually uses.
# Adjust these two bounds to suit your needs (both are inclusive).
target_res_active = 128
min_active_voxels = 2000 # lower bound on the coarse (active) voxel count; the original note's example of 100 was stale
max_active_voxels = 326780 # upper bound on the coarse (active) voxel count; the original note's example of 8000 was stale
# Output text file (one retained sample name per line); the thresholds are
# embedded in the file name.
save_txt_path = f"/gemini/user/private/zhaotianhao/Triposf/MERGED_DATASET_filtered_{min_edge_voxels}-{max_edge_voxels}edge_{min_active_voxels}-{max_active_voxels}active.txt"
# === Per-file counting function ===
def check_voxel_counts(npz_path):
    """Check whether one cached ``.npz`` archive passes both voxel-count filters.

    The resolutions and the inclusive min/max thresholds are read from the
    module-level configuration (``target_res_edge``, ``target_res_active``,
    ``min_edge_voxels``, ``max_edge_voxels``, ``min_active_voxels``,
    ``max_active_voxels``).

    Args:
        npz_path: Path to a cached ``.npz`` archive.

    Returns:
        ``(original_name, count_edge, count_active)`` when both counts fall
        inside their ranges, where ``original_name`` is the file name with a
        trailing ``_precombined.npz`` (or, failing that, its extension)
        removed. ``None`` when a required key is missing, a count is out of
        range, or the archive cannot be read.
    """
    # Build the keys outside the ``try`` so that a configuration mistake
    # (e.g. a missing constant) surfaces as an error instead of silently
    # rejecting every file.
    key_edge = f"combined_voxels_{target_res_edge}"
    key_active = f"active_voxels_{target_res_active}"

    try:
        with np.load(npz_path) as data:
            if key_edge not in data or key_active not in data:
                return None
            # The count is the length of the first axis of each array.
            count_edge = len(data[key_edge])
            count_active = len(data[key_active])
    except Exception:
        # Best effort: an unreadable or corrupt archive is skipped rather
        # than aborting the whole scan.
        return None

    # Core filter: both ranges must be satisfied (bounds inclusive).
    if not (min_edge_voxels <= count_edge <= max_edge_voxels):
        return None
    if not (min_active_voxels <= count_active <= max_active_voxels):
        return None

    base_name = os.path.basename(npz_path)
    suffix = "_precombined.npz"
    if base_name.endswith(suffix):
        # Strip only the trailing suffix; ``str.replace`` would also remove
        # any earlier occurrence of the same text inside the name.
        original_name = base_name[: -len(suffix)]
    else:
        original_name = os.path.splitext(base_name)[0]
    return (original_name, count_edge, count_active)
# === Driver: collect NPZ files, filter them in parallel, save and report ===
def _print_count_stats(title, counts):
    """Print min / max / mean of ``counts`` under ``title``; no-op when empty."""
    if not counts:
        return
    print(f"\n[统计 - {title}]")
    print(f" 最小值: {min(counts)}")
    print(f" 最大值: {max(counts)}")
    print(f" 平均值: {np.mean(counts):.2f}")


def main():
    """Scan ``cache_dir``, keep the files that pass the filters, and save their names.

    Every ``.npz`` file directly inside ``cache_dir`` is checked with
    ``check_voxel_counts`` in a process pool. The names of the retained
    samples are written to ``save_txt_path`` (one per line) and summary
    statistics are printed.

    Raises:
        SystemExit: With status 1 when ``cache_dir`` is not a directory.
    """
    if not os.path.isdir(cache_dir):
        print(f"错误: 缓存目录不存在 {cache_dir}")
        # Signal failure to the caller; the ``exit()`` builtin is meant for
        # the interactive shell and reported success (status 0).
        raise SystemExit(1)

    npz_files = [
        os.path.join(cache_dir, f)
        for f in os.listdir(cache_dir)
        if f.endswith(".npz")
    ]
    print(f"共发现 {len(npz_files)} 个缓存文件。开始并行过滤...")
    print("筛选条件:")
    # Labels use the configured resolutions instead of hard-coded numbers so
    # they cannot drift from the values actually used by the filter.
    print(f" - Edge ({target_res_edge}): {min_edge_voxels} ~ {max_edge_voxels}")
    print(f" - Active ({target_res_active}): {min_active_voxels} ~ {max_active_voxels}")

    # === Parallel filtering ===
    results = []
    with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
        futures = [executor.submit(check_voxel_counts, path) for path in npz_files]
        for future in tqdm(as_completed(futures), total=len(futures), desc="Filtering"):
            result = future.result()
            if result is not None:
                results.append(result)

    # Completion order is arbitrary; sort so the saved list is reproducible.
    results.sort()
    filtered_files = [name for name, _, _ in results]
    counts_edge = [c_edge for _, c_edge, _ in results]
    counts_active = [c_active for _, _, c_active in results]

    # === Save the result ===
    out_dir = os.path.dirname(save_txt_path)
    if out_dir:  # ``os.makedirs("")`` raises for a bare file name
        os.makedirs(out_dir, exist_ok=True)
    with open(save_txt_path, "w", encoding="utf-8") as f:
        f.writelines(f"{fname}\n" for fname in filtered_files)

    # === Print statistics ===
    total = len(npz_files)
    # Avoid ZeroDivisionError when the cache directory holds no .npz files.
    keep_rate = len(filtered_files) / total * 100 if total else 0.0
    print(f"\n✅ 筛选完成:")
    print(f" 符合条件的文件数: {len(filtered_files)} / {total} (保留率: {keep_rate:.2f}%)")
    _print_count_stats(f"Edge Voxels ({target_res_edge})", counts_edge)
    _print_count_stats(f"Active Voxels ({target_res_active})", counts_active)
    print(f"\n 结果已保存到: {save_txt_path}")


# The guard is required for process pools: under the "spawn" start method the
# worker processes re-import this module and must not re-run the driver.
if __name__ == "__main__":
    main()