# Minigpt4 / clean_stage3_json.py
# ddy0126's picture
# Upload folder using huggingface_hub
# 2bcab80 verified
import json
import os
# Stage 3 dataset locations; all are relative paths sharing one root prefix.
_STAGE3_ROOT = "datasets/stage3"
VIDEO_DIR = f"{_STAGE3_ROOT}/videos"  # directory scanned for video files
JSON_PATH = f"{_STAGE3_ROOT}/video_instruct_data.json"  # full instruction JSON (input)
OUTPUT_JSON = f"{_STAGE3_ROOT}/video_instruct_data_clean.json"  # filtered JSON (output)
def main(video_dir=None, json_path=None, output_json=None):
    """Keep only the JSON entries whose video file exists locally.

    Lists ``video_dir``, collects the files ending in ``.mp4``, ``.mkv`` or
    ``.webm``, loads the list stored in ``json_path``, keeps every item whose
    identifier matches a local video, and writes the kept items to
    ``output_json``.

    An item's identifier is the first truthy value among its ``video_id``,
    ``video_name`` and ``image_id`` keys. It is compared as a string, so a
    numeric ID matches a file stem, and it may be either a bare stem
    (``"abc"``) or a full file name (``"abc.mp4"``).

    Args:
        video_dir: Directory to scan; defaults to ``VIDEO_DIR``.
        json_path: Input JSON file; defaults to ``JSON_PATH``.
        output_json: Destination JSON file; defaults to ``OUTPUT_JSON``.

    Raises:
        OSError: If ``video_dir`` cannot be listed or a file cannot be opened.
        json.JSONDecodeError: If ``json_path`` does not contain valid JSON.
    """
    # Resolve defaults at call time so the module-level constants stay the
    # single source of truth for the default paths.
    if video_dir is None:
        video_dir = VIDEO_DIR
    if json_path is None:
        json_path = JSON_PATH
    if output_json is None:
        output_json = OUTPUT_JSON

    print("🚀 开始清洗 Stage 3 JSON...")

    # 1. Scan the local video files.
    video_exts = ('.mp4', '.mkv', '.webm')
    existing_ids = set()    # file stems, e.g. "abc"
    existing_files = set()  # full file names, e.g. "abc.mp4"
    for name in os.listdir(video_dir):
        if name.endswith(video_exts):
            existing_ids.add(os.path.splitext(name)[0])
            existing_files.add(name)
    print(f"✅ 本地视频数: {len(existing_ids)}")

    # 2. Load the full JSON. The encoding is explicit so the result does not
    #    depend on the platform's locale default.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # 3. Filter: keep only items that have a local video.
    clean_data = []
    for item in data:
        # Different records may store the identifier under different keys.
        vid = item.get("video_id") or item.get("video_name") or item.get("image_id")
        if vid is None:
            continue
        # JSON numbers arrive as int/float while file stems are always str.
        key = str(vid)
        if key in existing_ids or key in existing_files:
            clean_data.append(item)

    # 4. Save the filtered list.
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(clean_data, f)
    print(f"🎉 清洗完毕!有效数据: {len(clean_data)} 条。已保存至 {output_json}")
# Run the cleaning pass only when this file is executed as a script.
if __name__ == "__main__":
    main()