azure-scripts / merge_v3.py
vivekvar's picture
azure home scripts: data gen, training, misc
a70eb3d verified
#!/usr/bin/env python3
# v3: use clean_merged_data (v2) as base, add Khadatkar + Learning to train
import os, shutil, glob
from pathlib import Path
# Dataset locations: BASE is the existing v2 merge, OUT is where v3 is built.
HOME = os.path.expanduser('~')
OUT = f'{HOME}/merged_v3'
BASE = f'{HOME}/clean_merged_data'

# Fresh output: drop any previous merged_v3 tree, then recreate the empty
# images/ and labels/ folders for each split.
if os.path.exists(OUT):
    shutil.rmtree(OUT)
for split_name in ('train', 'valid', 'test'):
    for kind in ('images', 'labels'):
        os.makedirs(f'{OUT}/{split_name}/{kind}', exist_ok=True)

# Running totals, filled in by copy_split():
#   stats[split][class_id] -> number of label rows written for that class
#   imgcount[split]        -> number of image/label pairs written
stats = {}
imgcount = {}
for split_name in ('train', 'valid', 'test'):
    stats[split_name] = dict.fromkeys((0, 1, 2), 0)
    imgcount[split_name] = 0
def copy_split(src_img_dir, src_lbl_dir, target_split, cmap, name_suffix):
    """Copy one labelled image folder into ``OUT/<target_split>``, remapping class ids.

    For every ``*.txt`` file in *src_lbl_dir* the image with the same stem
    (extension .jpg/.jpeg/.png/.JPG/.PNG, first match wins) is looked up in
    *src_img_dir*. Label rows whose leading class id is a key of *cmap* are
    rewritten with the mapped id and their remaining fields kept verbatim;
    rows with any other class id are dropped. A label file is skipped
    entirely when it has no matching image or no surviving rows.

    Args:
        src_img_dir: Directory holding the source images.
        src_lbl_dir: Directory holding the source label files.
        target_split: Output split name; ``OUT/<target_split>/images`` and
            ``OUT/<target_split>/labels`` must already exist.
        cmap: Mapping of source class id -> output class id. Every output id
            must be a key of ``stats[target_split]``.
        name_suffix: Text appended to each file stem in the output names.

    Returns:
        The number of image/label pairs written.

    Raises:
        ValueError: If a non-empty label row does not start with an integer.

    Side effects:
        Increments the module-level ``stats`` and ``imgcount`` counters.
    """
    image_exts = ('.jpg', '.jpeg', '.png', '.JPG', '.PNG')
    copied = 0

    # glob returns files in arbitrary order; sort so every run processes
    # (and reports) the same sequence.
    for lbl_path in sorted(glob.glob(f'{src_lbl_dir}/*.txt')):
        stem = Path(lbl_path).stem

        # Find the image that belongs to this label file.
        candidates = (f'{src_img_dir}/{stem}{ext}' for ext in image_exts)
        img_path = next((p for p in candidates if os.path.exists(p)), None)
        if img_path is None:
            continue

        # Keep only rows whose class is in cmap, rewriting the class id.
        lines = []
        with open(lbl_path) as f:
            for line in f:
                parts = line.split()
                if not parts:
                    continue
                cid = int(parts[0])
                if cid in cmap:
                    lines.append(' '.join([str(cmap[cid]), *parts[1:]]))
        if not lines:
            continue

        new_stem = f'{stem}{name_suffix}'
        ext = Path(img_path).suffix
        dst_img = f'{OUT}/{target_split}/images/{new_stem}{ext}'
        dst_lbl = f'{OUT}/{target_split}/labels/{new_stem}.txt'

        if not os.path.exists(dst_img):
            # Try a hard link first; fall back to a real copy when the link
            # cannot be created. Only OSError is caught so that
            # KeyboardInterrupt and programming errors are not swallowed.
            try:
                os.link(img_path, dst_img)
            except OSError:
                shutil.copy(img_path, dst_img)

        with open(dst_lbl, 'w') as f:
            f.write('\n'.join(lines) + '\n')

        # Count the rows that were actually written, by output class id.
        for ln in lines:
            stats[target_split][int(ln.split()[0])] += 1
        imgcount[target_split] += 1
        copied += 1

    return copied
# 1) Bring over clean_merged_data unchanged: classes 0, 1, 2 map to themselves
#    and no suffix is appended (per the original note, the images already carry
#    _cctv_dataset / _helmet_dataset / _yolo_project suffixes).
print('--- base v2 data ---')
identity_map = {cls: cls for cls in (0, 1, 2)}
for split in ('train', 'valid', 'test'):
    images_dir = f'{BASE}/{split}/images'
    labels_dir = f'{BASE}/{split}/labels'
    copied = copy_split(images_dir, labels_dir, split, identity_map, '')
    print(f' base -> {split}: {copied}')
# 2) Khadatkar + Learning: every source split they ship is folded into the
#    train split only. Each entry is (name, root directory, class-id map).
#    Per the original notes both sets use 0=With Helmet and 1=Without Helmet,
#    so the ids are swapped; ids absent from the map (Khadatkar's 2=licence)
#    are dropped by copy_split.
EXTRAS = [
    ('khadatkar', f'{HOME}/extra_khadatkar', {0: 1, 1: 0}),
    ('learning', f'{HOME}/extra_learning', {0: 1, 1: 0}),
]
print('--- extras -> train ---')
for dataset, dataset_root, class_map in EXTRAS:
    for origin in ('train', 'valid', 'test'):
        labels_dir = f'{dataset_root}/{origin}/labels'
        # Not every extra dataset provides all three splits.
        if os.path.isdir(labels_dir):
            images_dir = f'{dataset_root}/{origin}/images'
            added = copy_split(images_dir, labels_dir, 'train', class_map, f'_{dataset}_{origin}')
            print(f' {dataset} {origin} -> train: {added}')
# Write the dataset config into the output root. The class names must be
# nested (indented) under `names:`; at column 0 YAML reads them as separate
# top-level keys and `names` itself ends up empty.
yaml = f'''path: {OUT}
train: train/images
val: valid/images
test: test/images
nc: 3
names:
  0: no-helmet
  1: with-helmet
  2: triple-riding
'''
with open(f'{OUT}/data.yaml', 'w') as f:
    f.write(yaml)
# Final report: per split, the number of images written and the number of
# label rows per class, taken from the counters copy_split() maintained.
print('\n=== V3 MERGE COMPLETE ===')
for split in ('train', 'valid', 'test'):
    per_class = stats[split]
    total = sum(per_class.values())
    print(f' {split:6s} images={imgcount[split]:5d} | no-helmet={per_class[0]:5d} with-helmet={per_class[1]:5d} triple={per_class[2]:4d} | instances={total}')