import json
import os
import re

from datasets import load_dataset
from tqdm import tqdm

from filter import filterdata  # Custom filtering logic
| # Load 110k samples from OpenWebText | |
| print("๐ฆ Loading dataset (110k samples)...") | |
| ds = load_dataset("OpenAssistant/oasst1",split="train") | |
| convo = [] | |
| print("โ๏ธ Processing dataset into Q&A pairs...") | |
| for entry in tqdm(ds, unit='samples'): | |
| if entry.get("role") == "assistant" and entry.get("text") and entry.get("parent_id"): | |
| parent = next((x for x in ds if x["message_id"] == entry["parent_id"]), None) | |
| if parent and parent.get("role") == "user": | |
| convo.append({ | |
| "input": parent["text"], | |
| "output": entry["text"] | |
| }) | |
| #convo.append({ | |
| # "instruction": instruction, | |
| # "input": user_input, | |
| # "output": bot_response, | |
| # "text": full_instruction + "\n" + bot_response | |
| #}) | |
| print(f"โ Got {len(convo)} usable Q&A pairs.") | |
| # Save unfiltered data | |
| unfiltered_path = "./data/unfiltered_data.jsonl" | |
| with open(unfiltered_path, "w", encoding="utf-8") as f: | |
| for line in convo: | |
| f.write(json.dumps(line, ensure_ascii=False) + "\n") | |
| print(f"๐ Saved unfiltered data to {unfiltered_path}") | |
| # Run filtering | |
| print("๐ฟ Starting filtering...") | |
| filterdata(convo) | |