File size: 3,341 Bytes

a70eb3d

#!/usr/bin/env python3
# v3: use clean_merged_data (v2) as base, add Khadatkar + Learning to train
import os, shutil, glob
from pathlib import Path

# Locations: everything lives under the invoking user's home directory.
HOME = os.path.expanduser('~')
BASE = f'{HOME}/clean_merged_data'  # input dataset the merge starts from
OUT = f'{HOME}/merged_v3'           # output dataset, rebuilt on every run

# Discard any earlier output, then lay out <split>/{images,labels} afresh.
if os.path.exists(OUT):
    shutil.rmtree(OUT)
for split in ('train', 'valid', 'test'):
    for kind in ('images', 'labels'):
        os.makedirs(f'{OUT}/{split}/{kind}', exist_ok=True)

# Running tallies updated by copy_split, both keyed by split name:
# stats counts label instances per class id, imgcount counts copied images.
stats = {}
imgcount = {}
for split in ('train', 'valid', 'test'):
    stats[split] = dict.fromkeys((0, 1, 2), 0)
    imgcount[split] = 0

def copy_split(src_img_dir, src_lbl_dir, target_split, cmap, name_suffix):
    """Copy one image/label directory pair into ``OUT/<target_split>``.

    Every ``*.txt`` file in *src_lbl_dir* is paired with the image of the
    same stem in *src_img_dir*. The first whitespace-separated token of each
    label line is read as an integer class id and remapped through *cmap*;
    lines whose class id is not a key of *cmap* are dropped. A label file is
    skipped entirely when no matching image exists or when none of its lines
    survive the remapping.

    Args:
        src_img_dir: Directory containing the source images.
        src_lbl_dir: Directory containing the source ``.txt`` label files.
        target_split: Destination split name under ``OUT``; also the key
            updated in the module-level ``stats`` and ``imgcount`` tallies.
        cmap: Mapping of source class id -> destination class id.
        name_suffix: Text appended to each file stem in the destination.

    Returns:
        The number of image/label pairs written.

    Raises:
        ValueError: If a non-empty label line does not start with an integer.
    """
    # Lower-case variants first; '.JPEG' is last so the precedence among the
    # previously supported extensions is unchanged.
    image_exts = ('.jpg', '.jpeg', '.png', '.JPG', '.PNG', '.JPEG')

    n = 0
    # glob returns entries in arbitrary order; sort for reproducible runs.
    for lbl_path in sorted(glob.glob(f'{src_lbl_dir}/*.txt')):
        stem = Path(lbl_path).stem

        img_path = None
        for ext in image_exts:
            candidate = f'{src_img_dir}/{stem}{ext}'
            if os.path.exists(candidate):
                img_path = candidate
                break
        if img_path is None:
            continue

        lines = []
        class_ids = []
        with open(lbl_path) as f:
            for line in f:
                parts = line.strip().split()
                if not parts:
                    continue
                cid = int(parts[0])
                if cid not in cmap:
                    continue
                class_ids.append(cmap[cid])
                lines.append(' '.join([str(cmap[cid])] + parts[1:]))
        if not lines:
            continue

        new_stem = f'{stem}{name_suffix}'
        ext = Path(img_path).suffix
        dst_img = f'{OUT}/{target_split}/images/{new_stem}{ext}'
        dst_lbl = f'{OUT}/{target_split}/labels/{new_stem}.txt'

        # An image already present at the destination is left untouched.
        if not os.path.exists(dst_img):
            try:
                # Hard-link to avoid duplicating image data on disk.
                os.link(img_path, dst_img)
            except OSError:
                # Linking failed (e.g. cross-device or unsupported by the
                # filesystem); fall back to a real copy.
                shutil.copy(img_path, dst_img)

        with open(dst_lbl, 'w') as f:
            f.write('\n'.join(lines) + '\n')

        for new_id in class_ids:
            stats[target_split][new_id] += 1
        imgcount[target_split] += 1
        n += 1
    return n

# 1) Bring clean_merged_data over unchanged: classes 0/1/2 map to themselves
#    and no suffix is added, because those images already carry
#    _cctv_dataset / _helmet_dataset / _yolo_project suffixes.
print('--- base v2 data ---')
for split in ('train', 'valid', 'test'):
    copied = copy_split(
        f'{BASE}/{split}/images',
        f'{BASE}/{split}/labels',
        split,
        {0: 0, 1: 1, 2: 2},
        '',
    )
    print(f'  base -> {split}: {copied}')

# 2) Khadatkar + Learning go into the train split ONLY, whichever of their
#    own splits the files come from.
#    Source ids: 0=With Helmet -> 1, 1=Without Helmet -> 0.
EXTRAS = [
    ('khadatkar', f'{HOME}/extra_khadatkar', {0: 1, 1: 0}),  # class 2 (licence) is dropped
    ('learning', f'{HOME}/extra_learning', {0: 1, 1: 0}),
]
print('--- extras -> train ---')
for name, root, cmap in EXTRAS:
    for src_split in ('train', 'valid', 'test'):
        lbl_dir = f'{root}/{src_split}/labels'
        # A source split that has no labels directory is simply skipped.
        if os.path.isdir(lbl_dir):
            copied = copy_split(
                f'{root}/{src_split}/images',
                lbl_dir,
                'train',
                cmap,
                f'_{name}_{src_split}',
            )
            print(f'  {name} {src_split} -> train: {copied}')

# Dataset description written next to the split directories.
yaml = '\n'.join([
    f'path: {OUT}',
    'train: train/images',
    'val: valid/images',
    'test: test/images',
    'nc: 3',
    'names:',
    '  0: no-helmet',
    '  1: with-helmet',
    '  2: triple-riding',
]) + '\n'
with open(f'{OUT}/data.yaml', 'w') as fh:
    fh.write(yaml)

# Per-split summary of copied images and label instances per class.
print('\n=== V3 MERGE COMPLETE ===')
for split in ('train', 'valid', 'test'):
    per_class = stats[split]
    total = sum(per_class.values())
    print(f'  {split:6s} images={imgcount[split]:5d} | no-helmet={per_class[0]:5d} with-helmet={per_class[1]:5d} triple={per_class[2]:4d} | instances={total}')