File size: 1,473 Bytes
2bcab80
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
import pandas as pd
import json
import os

# ================= Configuration =================
# Root folder for every path below; it is relative, so it is resolved against
# the current working directory when the script runs.
BASE_DIR = "datasets"
# Folder that main() reads "clips.csv" and "descriptions.csv" from.
METADATA_DIR = os.path.join(BASE_DIR, "CondensedMovies_Metadata")
# Folder that main() scans for "*.mp4" files.
VIDEO_DIR = os.path.join(BASE_DIR, "CondensedMovies_Videos")
# Path of the JSON annotation file that main() writes.
OUTPUT_JSON = os.path.join(BASE_DIR, "cmd_annotations.json")
# ========================================

def main(metadata_dir=None, video_dir=None, output_json=None):
    """Write the CMD annotation JSON for the clips that exist on disk.

    Reads ``clips.csv`` and ``descriptions.csv`` from the metadata directory,
    inner-joins them on ``videoid``, keeps the rows whose ``videoid`` equals
    the stem of an ``.mp4`` file found directly inside the video directory,
    and writes a JSON list of ``{"image_id": ..., "caption": ...}`` objects.

    Rows whose description is missing are skipped (and counted in a warning):
    a missing value would otherwise be serialised as the bare token ``NaN``,
    which is not valid JSON and is not a usable caption string.

    Args:
        metadata_dir: Directory containing the two CSV files. Defaults to the
            module-level ``METADATA_DIR``.
        video_dir: Directory scanned (non-recursively) for ``.mp4`` files.
            Defaults to the module-level ``VIDEO_DIR``.
        output_json: Path of the JSON file to write. Defaults to the
            module-level ``OUTPUT_JSON``.

    Raises:
        FileNotFoundError: If a CSV file or the video directory is missing.
        KeyError: If the CSV files lack the ``videoid`` or ``description``
            column.
    """
    # Fall back to the module configuration only for arguments not supplied,
    # so a plain ``main()`` call behaves as before.
    if metadata_dir is None:
        metadata_dir = METADATA_DIR
    if video_dir is None:
        video_dir = VIDEO_DIR
    if output_json is None:
        output_json = OUTPUT_JSON

    print("🚀 生成标准 CMD JSON...")

    # 1. Load both CSV tables and join them on the shared clip id.
    clips = pd.read_csv(os.path.join(metadata_dir, "clips.csv"))
    descriptions = pd.read_csv(os.path.join(metadata_dir, "descriptions.csv"))
    merged = pd.merge(clips, descriptions, on="videoid", how="inner")

    # 2. Collect the ids of local videos: the file name minus ".mp4".
    existing_ids = {
        os.path.splitext(name)[0]
        for name in os.listdir(video_dir)
        if name.endswith(".mp4")
    }
    print(f"✅ 本地找到 {len(existing_ids)} 个视频")

    # 3. Keep only rows whose video is on disk (row order is preserved),
    #    then drop rows that have no caption text.
    on_disk = merged[merged["videoid"].isin(existing_ids)]
    has_caption = on_disk["description"].notna()
    skipped = int((~has_caption).sum())
    if skipped:
        print(f"⚠️ 跳过 {skipped} 条缺少描述的记录")
    usable = on_disk[has_caption]

    # Only "image_id" and "caption" are emitted for each entry.
    annotations = [
        {"image_id": vid, "caption": caption}
        for vid, caption in zip(usable["videoid"], usable["description"])
    ]

    # 4. Save. json.dump escapes non-ASCII by default, so the explicit
    #    encoding only pins the result to be the same on every platform.
    with open(output_json, "w", encoding="utf-8") as f:
        json.dump(annotations, f)
    print(f"🎉 JSON 生成完毕: {len(annotations)} 条数据")

# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()