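# Residue from the Hugging Face file viewer (not part of the script itself):
# uploaded by siddharthdhara17 with huggingface_hub ("Upload folder using
# huggingface_hub"), commit 457db56 (verified).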
import os
import re
from collections import defaultdict
def parse_test_list_line(line):
    """Extract the identifying fields from one line of the test list.

    A line is expected to contain a path such as
    ``LIDC-IDRI-slices/LIDC-IDRI-0004/nodule-0/images/slice-0.png``.

    Returns:
        A ``(patient_id, nodule_id, slice_id)`` tuple of digit strings
        (leading zeros preserved), or ``None`` when the line does not
        contain a matching path.
    """
    pattern = r'LIDC-IDRI-(\d+)/nodule-(\d+)/images/slice-(\d+)\.png'
    found = re.search(pattern, line.strip())
    if found is None:
        return None
    # The three capture groups are, in order: patient, nodule, slice.
    return found.groups()
def parse_sample_filename(filename):
    """Split a generated sample file name into its identifying fields.

    A file name is expected to look like
    ``LIDC-IDRI-0004_nodule-0_slice-0_sample_00.png``.

    Returns:
        A ``(patient_id, nodule_id, slice_id, file_type)`` tuple of strings,
        where ``file_type`` is whatever sits between the slice number and the
        ``.png`` suffix (e.g. ``sample_00``), or ``None`` when the name does
        not match.
    """
    pattern = r'LIDC-IDRI-(\d+)_nodule-(\d+)_slice-(\d+)_(.+)\.png'
    found = re.search(pattern, filename)
    if found is None:
        return None
    # The four capture groups are, in order: patient, nodule, slice, type.
    return found.groups()
def main(test_list_path='test_list.txt', samples_dir='results/samples/samples',
         num_samples=16):
    """Print a progress report for sample generation over the test set.

    Reads the expected ``(patient_id, nodule_id, slice_id)`` keys from the
    test list, groups the files found in the samples directory by the same
    key, and prints how many expected images already have every required
    sample, followed by a few summary counts.

    Args:
        test_list_path: Text file with one image path per line; lines that
            ``parse_test_list_line`` cannot parse are ignored.
        samples_dir: Directory that holds the generated sample files; names
            that ``parse_sample_filename`` cannot parse are ignored.
        num_samples: Number of samples (``sample_00`` onwards) an image must
            have to count as completed.

    Raises:
        OSError: If the test list cannot be opened or the samples directory
            cannot be listed.
    """
    # Collect the expected image keys from the test list.
    expected = set()
    with open(test_list_path, 'r') as f:
        for line in f:
            parsed = parse_test_list_line(line)
            if parsed:
                expected.add(parsed)
    print(f"Total expected images: {len(expected)}")

    # Group the file types present on disk by (patient_id, nodule_id, slice_id).
    sample_groups = defaultdict(set)
    for filename in os.listdir(samples_dir):
        parsed = parse_sample_filename(filename)
        if parsed:
            patient_id, nodule_id, slice_id, file_type = parsed
            sample_groups[(patient_id, nodule_id, slice_id)].add(file_type)

    # An image is complete once every required sample name is present.
    # `.get` is used so that missing keys are not inserted into the
    # defaultdict, which would skew the summary counts below.
    required = {f'sample_{i:02d}' for i in range(num_samples)}
    completed = sum(
        1 for key in expected if required <= sample_groups.get(key, set())
    )
    # Guard the percentage: an empty test list must not raise
    # ZeroDivisionError.
    percent = completed / len(expected) * 100 if expected else 0.0
    print(f"Completed images (with all {num_samples} samples): {completed}")
    print(f"Progress: {completed}/{len(expected)} ({percent:.2f}%)")

    # Additional analysis over everything found in the samples directory.
    total_samples = sum(len(types) for types in sample_groups.values())
    unique_patients = len({key[0] for key in sample_groups})
    print(f"Total sample files: {total_samples}")
    print(f"Unique patients in samples: {unique_patients}")

    # Count how many image keys have each file type.
    type_counts = defaultdict(int)
    for types in sample_groups.values():
        for t in types:
            type_counts[t] += 1
    print("File type counts:")
    for t, count in sorted(type_counts.items()):
        print(f" {t}: {count}")
# Run the progress report only when executed as a script, not on import.
if __name__ == "__main__":
    main()