Spaces:
Configuration error
Configuration error
| import torch | |
| import pandas as pd | |
| import os | |
| from tqdm import tqdm | |
| df = pd.read_parquet("../../Datasets/SakugaDataset/parquet/train_aesthetic/sakugadataset_train_aesthetic.parquet") | |
| df = df[['identifier', 'scene_start_time', 'scene_end_time', 'fps',"text_description","aesthetic_score","dynamic_score"]] | |
| # drop rows with nan | |
| df = df.dropna(subset=['scene_start_time', 'scene_end_time', 'fps',"text_description","aesthetic_score","dynamic_score"]) | |
| df['identifier_video'] = df['identifier'].apply(lambda x: int(x.split(':')[0])) | |
| base_path = '/home/cn/Datasets/SakugaDataset/split/train_aesthetic' | |
| rows_to_delete = [] | |
| print(df.shape) | |
| # 遍历数据框的每一行 | |
| for index, row in df.iterrows(): | |
| folder_path = os.path.join(base_path, str(row['identifier_video'])) | |
| #print(str(row['identifier_video'])) | |
| # 检查文件夹是否存在 | |
| #print(folder_path) | |
| if not os.path.exists(folder_path): | |
| print(folder_path) | |
| rows_to_delete.append(index) | |
| if os.path.exists(folder_path) and os.path.isdir(folder_path): | |
| # 检查文件夹中的文件数量 | |
| if len(os.listdir(folder_path)) == 0: | |
| rows_to_delete.append(index) | |
| #print(index) | |
| # 删除满足条件的行 | |
| df.drop(rows_to_delete, inplace=True) | |
| # 重置索引 | |
| df.reset_index(drop=True, inplace=True) | |
| print(df.shape) | |
| output_parquet_path = '/home/cn/Datasets/SakugaDataset/parquet/fliter_aesthetic.parquet' | |
| df.to_parquet(output_parquet_path, index=False) | |
| #132337 | |
| # 132102 | |
| # 删完无法读取的部分之后剩下了132067个 | |
| ''' | |
| print(df) | |
| i=10 | |
| df.iloc[i]['identifier_video'] | |
| print(df.columns) | |
| # 查看前几行数据 | |
| print(df.head()) | |
| print(df.iloc[1]) | |
| for index,row in df.iterrows(): | |
| print(f"Row {index}: {row.to_dict()}") | |
| break | |
| ''' |