"""Load the awesome-chatgpt-prompts train split and pass `convo` to filterdata.

The script prints every dataset record while iterating, reports how many
items are in `convo`, and then calls `filterdata(convo)`.
"""
import json
import re

from datasets import load_dataset
from tqdm import tqdm

from filter import filterdata

# Train split of the Hugging Face dataset named below.
ds = load_dataset("fka/awesome-chatgpt-prompts", split="train")

# `convo` is the list handed to filterdata() at the end of the script.
# `buffer` is only referenced by the disabled append further down.
convo, buffer = [], {}

print("getting data...")
for record in tqdm(ds):
    # Dumps the raw record on every iteration, alongside the tqdm progress bar.
    print(record)
    # NOTE(review): nothing is appended to `convo` in this loop; the only
    # append is the disabled line below, so `convo` is still empty when it
    # reaches filterdata(). Confirm whether that is intended.
    # convo.append({"text": f"^User: {buffer['user']}\nMiniGPT:{buffer['assistant']} <END>"})

print(f"Got {len(convo)} pairs/amount of q&a")

print("Filtering data...")
filterdata(convo)