import json
import os

import pandas as pd


# Root folder under which the metadata, the videos and the output file live.
BASE_DIR = "datasets"
# Folder from which clips.csv and descriptions.csv are read.
METADATA_DIR = os.path.join(BASE_DIR, "CondensedMovies_Metadata")
# Folder that is scanned for local .mp4 files.
VIDEO_DIR = os.path.join(BASE_DIR, "CondensedMovies_Videos")
# Path of the annotation JSON file that gets written.
OUTPUT_JSON = os.path.join(BASE_DIR, "cmd_annotations.json")


def main():
    """Generate the CMD annotation JSON for the videos available locally.

    Reads ``clips.csv`` and ``descriptions.csv`` from ``METADATA_DIR``,
    inner-joins them on ``videoid``, keeps the rows whose id equals the stem
    of an ``.mp4`` file found in ``VIDEO_DIR``, and writes a list of
    ``{"image_id": ..., "caption": ...}`` records to ``OUTPUT_JSON``.

    Rows without a description are skipped so the output stays valid JSON.
    Progress is reported on stdout.
    """
    print("🚀 生成标准 CMD JSON...")

    # Read ids as text: file stems are strings, so an id that pandas would
    # otherwise infer as a number (e.g. "007" -> 7) could never match a file.
    id_as_text = {"videoid": str}
    df_clips = pd.read_csv(
        os.path.join(METADATA_DIR, "clips.csv"), dtype=id_as_text
    )
    df_desc = pd.read_csv(
        os.path.join(METADATA_DIR, "descriptions.csv"), dtype=id_as_text
    )
    df_merged = pd.merge(df_clips, df_desc, on="videoid", how="inner")

    # Ids of the videos present on disk (file name without the extension).
    existing_ids = {
        os.path.splitext(name)[0]
        for name in os.listdir(VIDEO_DIR)
        if name.endswith(".mp4")
    }
    print(f"✅ 本地找到 {len(existing_ids)} 个视频")

    # Vectorised membership test instead of a row-by-row iterrows() loop.
    available = df_merged[df_merged["videoid"].isin(existing_ids)]
    # A missing description would be serialised as the bare token NaN, which
    # is not valid JSON, so such rows are left out.
    available = available.dropna(subset=["description"])

    annotations = [
        {"image_id": vid, "caption": caption}
        for vid, caption in zip(available["videoid"], available["description"])
    ]

    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(annotations, f)
    print(f"🎉 JSON 生成完毕: {len(annotations)} 条数据")


# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()