| import pandas as pd | |
| import json | |
| with open("data/ccs_synthetic_filtered_large.json") as f: | |
| d = json.load(f) | |
| df = pd.DataFrame(d) | |
| df["index"] = df.index + 1 | |
| df["nr_words"] = df["caption"].apply(lambda x: len(x.split())) | |
| df.to_feather("data/ccs_synthetic.feather") | |