import json
import re

from tqdm import tqdm

from dataset import SimpleTokenizr

tokenizer = SimpleTokenizr()
def filterdata(data):
    filtered = []     # records kept for training
    unused = []       # records rejected for containing digits
    low_quality = []  # placeholder: low-quality filtering is not implemented yet
    short = []        # records rejected for being too short
    filtered_lines = 0
    unused_lines = 0
    low_quality_lines = 0
    short_lines = 0
    for record in tqdm(data, unit="line"):
        # Each record is expected to be a dict already parsed from one JSONL line.
        text = record.get("text", "")
        encoded = tokenizer.tokenize(text)
        if re.search(r"\d", text):
            # Reject any record whose text contains a digit.
            unused_lines += 1
            unused.append(record)
        elif len(encoded) >= 64:
            # Keep records that tokenize to at least 64 tokens.
            filtered_lines += 1
            filtered.append(record)
        else:
            # Reject records that tokenize to fewer than 64 tokens.
            short_lines += 1
            short.append(text)
| print(f"Filtered {filtered_lines} successfully!") | |
| print(f"Removed {unused_lines} from data.") | |
| print(f"Removed {long_lines} from data (too short).") | |
| #print(f"Removed {low_quality} from data (low quality).") | |
    with open("./data/filtered_data.jsonl", "w", encoding="utf-8") as f:
        for record in filtered:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
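
# A minimal usage sketch. The input path ./data/raw_data.jsonl and its layout
# (one JSON object per line, each with a "text" field) are assumptions, not
# something the script above defines.
if __name__ == "__main__":
    with open("./data/raw_data.jsonl", "r", encoding="utf-8") as f:
        records = [json.loads(line) for line in f]
    filterdata(records)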