codebook / clean.py
vidhimudaliar's picture
Upload 8 files
5233510 verified
import pandas as pd
import json
import re
# =========================
# EDIT THESE TWO VARIABLES
# =========================
# Path of the raw CSV that main() reads; main() looks up the "filename" and
# "videoLabels" columns in it.
INPUT_CSV = "raw_data.csv"
# Path where main() writes the cleaned CSV (one row per label/range pair).
OUTPUT_CSV = "cleaned_data.csv"
def extract_video_id(filename: str):
    """Return the run of digits that starts *filename*, or None if there is none.

    The value is coerced to ``str`` and stripped of surrounding whitespace
    first. The id is returned as a string so leading zeros survive, e.g.
    ``'0111.mp4' -> '0111'``.
    """
    cleaned_name = str(filename).strip()
    leading_digits = re.match(r"(\d+)", cleaned_name)
    if not leading_digits:
        return None
    return leading_digits.group(1)
def parse_video_labels(field):
    """
    Parse a Label Studio `videoLabels` JSON field.

    Args:
        field: Raw cell value. Normally a JSON string; an already-decoded
            list or dict is used as-is. Any other value (None, NaN, numbers,
            ...) is treated as "no labels".

    Returns:
        List of dicts: [{"label": str, "start": int, "end": int}, ...], one
        per (range, label) pair, in range order and then label order.
        `start` is always <= `end`. Malformed JSON, entries, or ranges are
        skipped instead of raising, so the result may be an empty list.
    """
    if isinstance(field, (list, dict)):
        items = field
    elif isinstance(field, (str, bytes, bytearray)):
        try:
            items = json.loads(field)
        except (ValueError, RecursionError):
            # JSONDecodeError and UnicodeDecodeError are both ValueErrors;
            # RecursionError covers pathologically nested input.
            return []
    else:
        # Missing cells (None / NaN / pd.NA) and other non-JSON values.
        return []

    if not isinstance(items, list):
        items = [items]

    out = []
    for entry in items:
        # Valid JSON can still decode to scalars or null (e.g. "5", "[null]").
        if not isinstance(entry, dict):
            continue

        labels = entry.get("timelinelabels") or entry.get("timelineLabels") or []
        # normalize labels: a bare string is one label; other non-sequences
        # carry no usable labels
        if isinstance(labels, str):
            labels = [labels]
        elif not isinstance(labels, (list, tuple)):
            labels = []
        stripped = (str(x).strip() for x in labels)
        labels = [name for name in stripped if name]

        ranges = entry.get("ranges") or []
        if not isinstance(ranges, (list, tuple)):
            continue

        for rr in ranges:
            if not isinstance(rr, dict):
                continue
            try:
                start = int(rr.get("start"))
                end = int(rr.get("end"))
            except (TypeError, ValueError, OverflowError):
                # Missing, non-numeric, NaN or infinite bounds: skip the range.
                continue
            # Keep ranges ordered even if they were recorded backwards.
            if start > end:
                start, end = end, start
            for lab in labels:
                out.append({"label": lab, "start": start, "end": end})
    return out
def main():
    """Flatten the raw CSV into one row per (video, label, range).

    Reads INPUT_CSV, parses each row's `videoLabels` cell, and writes
    OUTPUT_CSV with the columns filename, video_id, video_path, label,
    start, end. Rows whose `filename` cell is missing or blank are skipped.
    Videos without any parsed label are kept as a single row with empty
    label/start/end.
    """
    df = pd.read_csv(INPUT_CSV)
    records = []
    for _, row in df.iterrows():
        raw_name = row.get("filename")
        # Empty CSV cells arrive as NaN; str(NaN) would become the bogus,
        # truthy filename "nan", so test for missing values first.
        if pd.isna(raw_name):
            continue
        filename = str(raw_name).strip()
        if not filename:
            continue

        base = {
            "filename": filename,
            "video_id": extract_video_id(filename),
            "video_path": f"videos/{filename}",
        }
        labels = parse_video_labels(row.get("videoLabels", "[]"))
        # If you want to DROP videos with no labels, replace this block with: `if not labels: continue`
        if not labels:
            labels = [{"label": None, "start": None, "end": None}]

        for lab in labels:
            records.append(
                {
                    **base,
                    "label": lab["label"],
                    "start": lab["start"],
                    "end": lab["end"],
                }
            )

    cleaned = pd.DataFrame(
        records,
        columns=["filename", "video_id", "video_path", "label", "start", "end"],
    )
    # Mixing ints with None makes pandas fall back to float64, which would be
    # written as "10.0"; the nullable Int64 dtype keeps ints and leaves
    # missing values as empty cells.
    cleaned = cleaned.astype({"start": "Int64", "end": "Int64"})
    cleaned.to_csv(OUTPUT_CSV, index=False)
    print(f"✅ Wrote {len(cleaned)} rows -> {OUTPUT_CSV}")
# Run the cleaning step only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()