Spaces:
Running
on
Zero
Running
on
Zero
"""Export filtered WebVid validation captions to a one-prompt-per-line text file.

The script loads the ``TempoFunk/webvid-10M`` dataset, keeps the entries of the
validation split's ``name`` column that look like clean prompts (see
``_is_valid_prompt``), and writes them to ``webvid_prompt.txt``.
"""

import re

import pandas as pd  # NOTE(review): not used by the code visible here; kept in case other code relies on it.
from datasets import load_dataset

# Load the dataset, caching downloaded files under the given local directory.
ds = load_dataset("TempoFunk/webvid-10M", cache_dir="/mnt/data/lizhikai/webvid/")

# Inclusive bounds on the accepted string length, in characters.
MIN_LENGTH = 30
MAX_LENGTH = 300

# Accept only strings made up entirely of ASCII letters and whitespace.
pattern = re.compile(r'^[a-zA-Z\s]+$')


def _is_valid_prompt(s):
    """Return True if *s* can be stored as a single line of the output file.

    A valid prompt is a ``str`` whose length lies in
    ``[MIN_LENGTH, MAX_LENGTH]``, that consists only of ASCII letters and
    whitespace, and that contains no line break.

    Non-string entries (e.g. ``None``) are rejected instead of raising
    ``TypeError`` from ``len``.
    """
    if not isinstance(s, str):
        return False
    if not MIN_LENGTH <= len(s) <= MAX_LENGTH:
        return False
    if pattern.match(s) is None:
        return False
    # ``\s`` in the pattern admits both "\n" and "\r". Either one would split
    # the prompt across lines when the file is read back in text mode
    # (universal newlines), so both are rejected here.
    return '\n' not in s and '\r' not in s


# Presumably the ``name`` column holds the video captions -- confirm against
# the dataset card. Drop empty, too short, too long, and malformed entries.
v = [s for s in ds['validation']['name'] if _is_valid_prompt(s)]

# Destination file, relative to the current working directory.
file_path = 'webvid_prompt.txt'

# Write one prompt per line, overwriting any existing file.
with open(file_path, 'w', encoding='utf-8') as file:
    file.writelines(item + '\n' for item in v)

print("字符串列表已成功保存到文件中。")