"""Flatten a Label Studio CSV export into one row per labelled video segment.

Reads ``INPUT_CSV`` (columns ``filename`` and ``videoLabels``), expands the
JSON in ``videoLabels`` into one row per (label, range) pair and writes the
result to ``OUTPUT_CSV``.
"""

import json
import re
from typing import Any, Dict, List, Optional

import pandas as pd

# =========================
# EDIT THESE TWO VARIABLES
# =========================
INPUT_CSV = "raw_data.csv"
OUTPUT_CSV = "cleaned_data.csv"

# Column order of the cleaned CSV.
OUTPUT_COLUMNS = ["filename", "video_id", "video_path", "label", "start", "end"]

# Compiled once; `.match` anchors at the start of the string.
_LEADING_DIGITS = re.compile(r"(\d+)")


def extract_video_id(filename: str) -> Optional[str]:
    """Return the leading run of digits of *filename*, or ``None`` if absent.

    '0111.mp4' -> '0111' (kept as a string, so leading zeros survive).
    Surrounding whitespace is ignored; a name that does not start with a
    digit yields ``None``.
    """
    found = _LEADING_DIGITS.match(str(filename).strip())
    return found.group(1) if found else None


def parse_video_labels(field: Any) -> List[Dict[str, Any]]:
    """Parse a Label Studio ``videoLabels`` JSON field.

    Args:
        field: The raw cell value. Only ``str``/``bytes`` values are parsed;
            anything else (NaN, ``None``, numbers, containers) is treated as
            "no labels".

    Returns:
        A list of dicts ``[{"label": str, "start": int, "end": int}, ...]``,
        one per (label, range) combination, in document order. ``start`` is
        always <= ``end``. Malformed JSON, non-object entries and ranges whose
        bounds cannot be converted to ``int`` are skipped silently
        (best-effort parsing), never raised.
    """
    # Missing cells arrive from pandas as float NaN; reject every non-text
    # value up front instead of calling pd.isna(), which is ambiguous on
    # list-like inputs.
    if not isinstance(field, (str, bytes, bytearray)):
        return []

    try:
        items = json.loads(field)
    except (TypeError, ValueError):  # JSONDecodeError is a ValueError
        return []

    entries = items if isinstance(items, list) else [items]

    out: List[Dict[str, Any]] = []
    for entry in entries:
        # `null`, numbers or strings inside the JSON carry no labels.
        if not isinstance(entry, dict):
            continue

        labels = entry.get("timelinelabels") or entry.get("timelineLabels") or []
        ranges = entry.get("ranges") or []

        # Normalize labels: a bare string means a single label.
        if isinstance(labels, str):
            labels = [labels]
        if not isinstance(labels, list) or not isinstance(ranges, list):
            continue
        labels = [str(x).strip() for x in labels if str(x).strip()]

        for rr in ranges:
            if not isinstance(rr, dict):
                continue
            try:
                start = int(rr.get("start"))
                end = int(rr.get("end"))
            except (TypeError, ValueError, OverflowError):
                # Missing, non-numeric, NaN or infinite bound: skip the range.
                continue
            if start > end:
                start, end = end, start
            for lab in labels:
                out.append({"label": lab, "start": start, "end": end})
    return out


def build_records(df: "pd.DataFrame") -> List[Dict[str, Any]]:
    """Expand each input row into one output record per labelled segment.

    Rows whose ``filename`` is missing (NaN/``None``) or blank are dropped.
    A video without any parsable label yields a single placeholder record
    whose ``label``, ``start`` and ``end`` are ``None``.
    """
    records: List[Dict[str, Any]] = []
    for _, row in df.iterrows():
        raw_name = row.get("filename")
        # str(NaN) is the truthy string "nan", so test for missing first.
        if raw_name is None or pd.isna(raw_name):
            continue
        filename = str(raw_name).strip()
        if not filename:
            continue

        base = {
            "filename": filename,
            "video_id": extract_video_id(filename),
            "video_path": f"videos/{filename}",
        }
        labels = parse_video_labels(row.get("videoLabels", "[]"))

        # If you want to DROP videos with no labels, replace this block with:
        # `if not labels: continue`
        if not labels:
            records.append(dict(base, label=None, start=None, end=None))
            continue

        for lab in labels:
            records.append(
                dict(base, label=lab["label"], start=lab["start"], end=lab["end"])
            )
    return records


def main(input_csv: Optional[str] = None, output_csv: Optional[str] = None) -> None:
    """Read the raw export, flatten it and write the cleaned CSV.

    Args:
        input_csv: Path of the CSV to read; defaults to ``INPUT_CSV``.
        output_csv: Path of the CSV to write; defaults to ``OUTPUT_CSV``.
    """
    # Resolve defaults at call time so edits to the module constants apply.
    input_csv = INPUT_CSV if input_csv is None else input_csv
    output_csv = OUTPUT_CSV if output_csv is None else output_csv

    df = pd.read_csv(input_csv)
    cleaned = pd.DataFrame(build_records(df), columns=OUTPUT_COLUMNS)

    # Unlabelled videos put None into start/end; a plain numeric column would
    # then become float64 and be written as "3.0". The nullable integer dtype
    # keeps whole numbers and writes missing values as empty cells.
    for col in ("start", "end"):
        cleaned[col] = cleaned[col].astype("Int64")

    cleaned.to_csv(output_csv, index=False)
    print(f"✅ Wrote {len(cleaned)} rows -> {output_csv}")


if __name__ == "__main__":
    main()