udbbdh
/

ndjadjafbagk

Model card Files Files and versions

ndjadjafbagk / filter_active_voxels.py

udbbdh's picture

Upload folder using huggingface_hub

7340df2 verified 4 months ago

history blame contribute delete

4.02 kB

	import os
	import sys
	from concurrent.futures import ProcessPoolExecutor, as_completed

	import numpy as np
	from tqdm import tqdm

	# === Configuration ===
	# Directory that holds the cached ``.npz`` files to scan.
	cache_dir = (
	    "/gemini/user/private/zhaotianhao/dataset_cache/"
	    "MERGED_DATASET_count_200_2000_100000_128to1024_819200"
	)

	# Resolutions used to build the npz key names that are inspected:
	# ``combined_voxels_<target_res_edge>`` and ``active_voxels_<target_res_active>``.
	target_res_edge, target_res_active = 512, 128

	# Inclusive (min, max) bounds on the number of entries under each key.
	# Tune these to your needs; a file is kept only if BOTH counts are in range.
	min_edge_voxels, max_edge_voxels = 2000, 75000
	min_active_voxels, max_active_voxels = 2000, 326780

	# Output list of accepted sample names; the thresholds are embedded in the
	# file name so that differently-filtered lists do not overwrite each other.
	save_txt_path = (
	    "/gemini/user/private/zhaotianhao/Triposf/"
	    f"MERGED_DATASET_filtered_{min_edge_voxels}-{max_edge_voxels}edge"
	    f"_{min_active_voxels}-{max_active_voxels}active.txt"
	)

	# === Per-file counting function ===
	def check_voxel_counts(
	    npz_path,
	    *,
	    res_edge=None,
	    res_active=None,
	    edge_range=None,
	    active_range=None,
	):
	    """Check one cached ``.npz`` file against the edge/active voxel filters.

	    Args:
	        npz_path: Path of the ``.npz`` file to inspect.
	        res_edge: Resolution used in the ``combined_voxels_<res>`` key.
	            Defaults to the module-level ``target_res_edge``.
	        res_active: Resolution used in the ``active_voxels_<res>`` key.
	            Defaults to the module-level ``target_res_active``.
	        edge_range: Inclusive ``(min, max)`` bound on the edge count.
	            Defaults to ``(min_edge_voxels, max_edge_voxels)``.
	        active_range: Inclusive ``(min, max)`` bound on the active count.
	            Defaults to ``(min_active_voxels, max_active_voxels)``.

	    Returns:
	        ``(original_name, count_edge, count_active)`` when both counts are
	        within range, where each count is ``len()`` of the stored array (the
	        size of its first axis) and ``original_name`` is the file name with a
	        trailing ``_precombined.npz`` (or, failing that, its extension)
	        removed. ``None`` when a key is missing, a count is out of range, or
	        the file cannot be read.
	    """
	    # Resolve the module configuration at call time so the defaults always
	    # reflect the current settings and the function works with explicit
	    # arguments even where those globals are not defined.
	    if res_edge is None:
	        res_edge = target_res_edge
	    if res_active is None:
	        res_active = target_res_active
	    if edge_range is None:
	        edge_range = (min_edge_voxels, max_edge_voxels)
	    if active_range is None:
	        active_range = (min_active_voxels, max_active_voxels)

	    key_edge = f"combined_voxels_{res_edge}"
	    key_active = f"active_voxels_{res_active}"

	    try:
	        with np.load(npz_path) as data:
	            if key_edge not in data or key_active not in data:
	                return None

	            count_edge = len(data[key_edge])
	            # Reject early so the second array is not decompressed for
	            # files that already fail the edge criterion.
	            if not edge_range[0] <= count_edge <= edge_range[1]:
	                return None

	            count_active = len(data[key_active])
	            if not active_range[0] <= count_active <= active_range[1]:
	                return None
	    except Exception as exc:
	        # Worker boundary: one unreadable/corrupt file must not abort the
	        # whole scan, but it should be visible rather than silently dropped.
	        print(f"[skip] {npz_path}: {exc!r}", file=sys.stderr)
	        return None

	    base_name = os.path.basename(npz_path)
	    suffix = "_precombined.npz"
	    if base_name.endswith(suffix):
	        # Slice instead of str.replace so only the trailing suffix is removed.
	        original_name = base_name[: -len(suffix)]
	    else:
	        original_name = os.path.splitext(base_name)[0]

	    return (original_name, count_edge, count_active)

	# === Driver: list cache files, filter in parallel, save and report ===
	def main():
	    """Scan ``cache_dir``, keep files passing both filters, and write their names.

	    Every ``.npz`` file directly inside ``cache_dir`` is submitted to
	    ``check_voxel_counts`` in a process pool. Accepted sample names are
	    written one per line to ``save_txt_path`` (sorted, so repeated runs give
	    identical files), and min/max/mean statistics of the accepted counts are
	    printed. Exits with status 1 if ``cache_dir`` is not a directory.
	    """
	    if not os.path.isdir(cache_dir):
	        print(f"错误: 缓存目录不存在 {cache_dir}", file=sys.stderr)
	        sys.exit(1)

	    # === Collect all NPZ files ===
	    npz_files = [
	        os.path.join(cache_dir, f) for f in os.listdir(cache_dir) if f.endswith(".npz")
	    ]
	    print(f"共发现 {len(npz_files)} 个缓存文件。开始并行过滤...")
	    print("筛选条件:")
	    # Labels are derived from the configured resolutions so they cannot
	    # drift from the keys that are actually checked.
	    print(f" - Edge ({target_res_edge}): {min_edge_voxels} ~ {max_edge_voxels}")
	    print(f" - Active ({target_res_active}): {min_active_voxels} ~ {max_active_voxels}")

	    # === Parallel filtering ===
	    kept = []
	    with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor:
	        futures = [executor.submit(check_voxel_counts, path) for path in npz_files]

	        for future in tqdm(as_completed(futures), total=len(futures), desc="Filtering"):
	            result = future.result()
	            if result is not None:
	                kept.append(result)

	    # as_completed yields in completion order, which varies between runs;
	    # sort by name so the output list is reproducible.
	    kept.sort()
	    filtered_files = [name for name, _, _ in kept]
	    counts_edge = [c_edge for _, c_edge, _ in kept]
	    counts_active = [c_active for _, _, c_active in kept]

	    # === Save results ===
	    out_dir = os.path.dirname(save_txt_path)
	    if out_dir:  # os.makedirs("") raises when the path has no directory part
	        os.makedirs(out_dir, exist_ok=True)
	    with open(save_txt_path, "w", encoding="utf-8") as f:
	        f.writelines(f"{fname}\n" for fname in filtered_files)

	    # === Print statistics ===
	    # Guard against an empty cache directory (would otherwise divide by zero).
	    keep_rate = len(filtered_files) / len(npz_files) * 100 if npz_files else 0.0
	    print("\n✅ 筛选完成：")
	    print(f" 符合条件的文件数: {len(filtered_files)} / {len(npz_files)} (保留率: {keep_rate:.2f}%)")

	    if counts_edge:
	        print(f"\n[统计 - Edge Voxels ({target_res_edge})]")
	        print(f" 最小值: {min(counts_edge)}")
	        print(f" 最大值: {max(counts_edge)}")
	        print(f" 平均值: {np.mean(counts_edge):.2f}")

	    if counts_active:
	        print(f"\n[统计 - Active Voxels ({target_res_active})]")
	        print(f" 最小值: {min(counts_active)}")
	        print(f" 最大值: {max(counts_active)}")
	        print(f" 平均值: {np.mean(counts_active):.2f}")

	    print(f"\n 结果已保存到: {save_txt_path}")


	# The guard is required for process pools: under the "spawn" start method
	# each worker re-imports this module, and unguarded top-level code would
	# re-run the whole scan inside every worker.
	if __name__ == "__main__":
	    main()