gpt-2-german / src /dataset_prep.py
cakiki's picture
Add dataset script
8a3d76c
import tarfile
from ast import literal_eval
from rich.progress import track
from pathlib import Path
import pandas as pd
def tar_file_to_string(filename):
    """Return the serialized records stored in a gzip-compressed tar archive.

    For every regular-file member, the first line is read, decoded as UTF-8
    and split on the literal marker ``{'url'``, which is taken to start each
    record. The marker is put back on every piece so that each returned
    string is one record's text again.

    Args:
        filename: Path of the ``.tar.gz`` archive to read.

    Returns:
        A list of record strings, each starting with ``{'url'``, collected
        from all file members in archive order. Empty if the archive has no
        file members or no member contains the marker.

    Raises:
        tarfile.ReadError: If ``filename`` is not a readable gzip tar archive.
        UnicodeDecodeError: If a member's first line is not valid UTF-8.
    """
    marker = "{'url'"
    records = []
    with tarfile.open(filename, "r:gz") as tar:
        for member in tar.getmembers():
            f = tar.extractfile(member)
            # extractfile() returns None for members that are not regular
            # files (e.g. directories); there is nothing to read from them.
            if f is None:
                continue
            # NOTE(review): only the first line of each member is read;
            # presumably the dump is stored as a single line -- confirm.
            line = f.readline().decode("utf-8")
            pieces = line.split(marker)
            # pieces[0] is whatever precedes the first marker, not a record.
            records.extend(marker + piece for piece in pieces[1:])
    return records
if __name__ == "__main__":
    # Filter every archive in ../HEAD down to the records whose
    # 'language_score' exceeds 0.98 and write them, one JSON object per
    # line, to ../HEAD_CLEAN/<archive name>.jsonl.
    data = Path('../HEAD')
    # Create the output directory up front so to_json() cannot fail on a
    # missing folder after an archive has already been processed.
    Path('../HEAD_CLEAN').mkdir(parents=True, exist_ok=True)
    for tar_gz in data.iterdir():
        filename = tar_gz.name.split('.tar.gz')[0]
        print(f"Now extracting {filename}")
        text = tar_file_to_string(tar_gz)
        filtered = []
        skipped = 0
        for item in track(text):
            try:
                # Parse once; literal_eval is the expensive step here.
                record = literal_eval(item)
                keep = record['language_score'] > 0.98
            except (ValueError, SyntaxError, TypeError, KeyError,
                    MemoryError, RecursionError):
                # Best effort: a malformed or incomplete record is dropped
                # rather than aborting the whole archive. Unlike a bare
                # except, Ctrl-C still interrupts the run.
                skipped += 1
                continue
            if keep:
                filtered.append(record)
        if skipped:
            print(f"Skipped {skipped} unparsable records in {filename}")
        filtered = pd.DataFrame(filtered)
        filtered.to_json(f'../HEAD_CLEAN/{filename}.jsonl', orient='records', lines=True)