Spaces:

therarelab
/

codebook

Running

File size: 2,907 Bytes
import pandas as pd
import json
import re

# =========================
# EDIT THESE TWO VARIABLES
# =========================
# Path of the CSV that main() reads; main() looks up the `filename` and
# `videoLabels` columns of each row.
INPUT_CSV = "raw_data.csv"
# Path that main() writes the flattened table to (one row per label range).
OUTPUT_CSV = "cleaned_data.csv"


def extract_video_id(filename: str):
    """Return the leading run of digits of *filename* as a string.

    Example: '0111.mp4' -> '0111'. The id stays a string so that leading
    zeros are preserved. Surrounding whitespace is ignored, and None is
    returned when the name does not start with a digit.
    """
    cleaned = str(filename).strip()
    leading_digits = re.match(r"(\d+)", cleaned)
    if leading_digits is None:
        return None
    return leading_digits.group(1)


def parse_video_labels(field):
    """
    Parse Label Studio `videoLabels` JSON field.

    Returns list of dicts: [{"label": str, "start": int, "end": int}, ...]
    with one dict per (range, label) pair, in the order they appear.

    Parsing is best-effort and never raises on bad data:
      * a missing value (None / NaN) or any non-text value yields [];
      * text that is not valid JSON yields [];
      * entries that are not JSON objects are skipped;
      * a labels value that is neither a string nor a list is ignored;
      * ranges whose start/end cannot be converted to int are skipped.
    Labels are stripped of surrounding whitespace and blank labels dropped.
    If a range has start > end, the two values are swapped.
    """
    # Only text can hold JSON. This also covers NaN/None from an empty CSV
    # cell without calling pd.isna(), which returns an array (with an
    # ambiguous truth value) when handed a list.
    if not isinstance(field, (str, bytes, bytearray)):
        return []

    try:
        items = json.loads(field)
    except (ValueError, RecursionError):
        # JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        return []

    entries = items if isinstance(items, list) else [items]

    out = []
    for entry in entries:
        # Valid JSON such as `null`, `5` or `[1, 2]` has no .get(); skip it
        # instead of crashing with AttributeError.
        if not isinstance(entry, dict):
            continue

        labels = entry.get("timelinelabels") or entry.get("timelineLabels") or []
        ranges = entry.get("ranges") or []

        # normalize labels
        if isinstance(labels, str):
            labels = [labels]
        elif not isinstance(labels, list):
            labels = []
        labels = [text for text in (str(x).strip() for x in labels) if text]

        if not isinstance(ranges, list):
            continue

        for rr in ranges:
            if not isinstance(rr, dict):
                continue
            try:
                start = int(rr.get("start"))
                end = int(rr.get("end"))
            except (TypeError, ValueError, OverflowError):
                # Missing, non-numeric, NaN or infinite boundary.
                continue

            if start > end:
                start, end = end, start

            out.extend({"label": lab, "start": start, "end": end} for lab in labels)

    return out


def main():
    """Flatten the CSV at INPUT_CSV into one row per labelled range.

    For every input row with a usable `filename`, the `videoLabels` cell is
    parsed with parse_video_labels() and one output row is produced per
    parsed label range. A video with no parsed labels still gets a single
    row whose label/start/end are empty. The result is written to
    OUTPUT_CSV with the columns
    filename, video_id, video_path, label, start, end.
    """
    df = pd.read_csv(INPUT_CSV)

    records = []
    for _, row in df.iterrows():
        raw_name = row.get("filename")
        # An empty CSV cell is read as NaN, and str(NaN) is the truthy string
        # "nan". Check for a missing value before stringifying, otherwise a
        # bogus "videos/nan" row would be emitted.
        if raw_name is None or pd.isna(raw_name):
            continue
        filename = str(raw_name).strip()
        if not filename:
            continue

        # Fields shared by every output row of this video.
        shared = {
            "filename": filename,
            "video_id": extract_video_id(filename),
            "video_path": f"videos/{filename}",
        }

        labels = parse_video_labels(row.get("videoLabels", "[]"))

        # If you want to DROP videos with no labels, delete the `append`
        # below and keep only the `continue`.
        if not labels:
            records.append({**shared, "label": None, "start": None, "end": None})
            continue

        records.extend(
            {
                **shared,
                "label": lab["label"],
                "start": lab["start"],
                "end": lab["end"],
            }
            for lab in labels
        )

    cleaned = pd.DataFrame(
        records,
        columns=["filename", "video_id", "video_path", "label", "start", "end"],
    )

    cleaned.to_csv(OUTPUT_CSV, index=False)
    print(f"✅ Wrote {len(cleaned)} rows -> {OUTPUT_CSV}")


# Run the conversion only when this file is executed as a script, so that
# importing the module (e.g. to reuse the parsing helpers) has no side effects.
if __name__ == "__main__":
    main()