| import json | |
| import os | |
# Directory that is listed to discover which video files exist locally.
VIDEO_DIR = "datasets/stage3/videos"
# Input: the full instruction JSON that gets filtered.
JSON_PATH = "datasets/stage3/video_instruct_data.json"
# Output: where the filtered JSON is written.
OUTPUT_JSON = "datasets/stage3/video_instruct_data_clean.json"
def main(video_dir=None, json_path=None, output_json=None):
    """Keep only the JSON entries whose video file exists locally.

    The video directory is listed, every file ending in ``.mp4``, ``.mkv`` or
    ``.webm`` contributes its name without extension as a known ID, and the
    entries of the input JSON whose ID is among those are written to the
    output JSON. An entry's ID is the first truthy value among its
    ``"video_id"``, ``"video_name"`` and ``"image_id"`` keys.

    Args:
        video_dir: Directory to scan for video files. Defaults to the
            module-level ``VIDEO_DIR``.
        json_path: Path of the JSON file to filter. Defaults to the
            module-level ``JSON_PATH``.
        output_json: Path the filtered JSON is written to. Defaults to the
            module-level ``OUTPUT_JSON``.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be
            opened.
        json.JSONDecodeError: If the input file is not valid JSON.
    """
    # Resolve the defaults at call time so a plain ``main()`` keeps using the
    # module constants.
    if video_dir is None:
        video_dir = VIDEO_DIR
    if json_path is None:
        json_path = JSON_PATH
    if output_json is None:
        output_json = OUTPUT_JSON

    print("🚀 开始清洗 Stage 3 JSON...")

    # 1. Scan the local video IDs (file name without its extension).
    existing_ids = {
        os.path.splitext(name)[0]
        for name in os.listdir(video_dir)
        if name.endswith(('.mp4', '.mkv', '.webm'))
    }
    print(f"✅ 本地视频数: {len(existing_ids)}")

    # 2. Load the full JSON. The encoding is explicit because the platform
    #    default is locale-dependent and may not be UTF-8.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # 3. Filter: keep only entries whose video is present locally.
    #    Several key names are tried because entries may use any of them.
    clean_data = [
        item
        for item in data
        if (
            item.get("video_id")
            or item.get("video_name")
            or item.get("image_id")
        ) in existing_ids
    ]

    # 4. Save the filtered entries.
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(clean_data, f)
    print(f"🎉 清洗完毕!有效数据: {len(clean_data)} 条。已保存至 {OUTPUT_JSON if output_json is None else output_json}")
# Run the cleaning step only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()