K-Sort-Arena

Running on Zero

K-Sort-Arena / get_webvid_prompt.py

Add Video

b6dc501 over 1 year ago

940 Bytes

	from datasets import load_dataset
	import pandas as pd
	import re
	# # Load the WebVid dataset
	# dataset = load_dataset('webvid', 'webvid-10m', split='train')
	# from datasets import load_dataset

	# Load the TempoFunk/webvid-10M dataset, using the given cache directory.
	ds = load_dataset("TempoFunk/webvid-10M", cache_dir="/mnt/data/lizhikai/webvid/")
	# Only the 'name' field of the 'validation' split is used below.
	v = ds['validation']['name']

	# Accepted string length in characters (both bounds inclusive).
	MIN_LENGTH = 30
	MAX_LENGTH = 300
	# A string qualifies only if it is made up of ASCII letters and whitespace.
	pattern = re.compile(r'^[a-zA-Z\s]+$')

	# Keep the strings whose length is within bounds and that match the pattern;
	# this also discards empty strings.
	v = [
	    text
	    for text in v
	    if MIN_LENGTH <= len(text) <= MAX_LENGTH and pattern.match(text)
	]

	# Path of the output file.
	file_path = 'webvid_prompt.txt'

	# Write one string per line. The pattern's `\s` also admits newlines, so any
	# string with an embedded '\n' is skipped to keep the one-per-line layout.
	with open(file_path, 'w', encoding='utf-8') as out:
	    out.writelines(text + '\n' for text in v if '\n' not in text)

	# Printed message: "The list of strings has been saved to the file successfully."
	print("字符串列表已成功保存到文件中。")