| import tarfile |
| from ast import literal_eval |
| from rich.progress import track |
| from pathlib import Path |
| import pandas as pd |
|
|
def tar_file_to_string(filename):
    """Return the ``{'url'``-prefixed record strings stored in a tar.gz archive.

    For every member of the archive that can be opened as a file, the first
    line is read, decoded as UTF-8 and split on the literal marker ``{'url'``.
    The marker is re-attached to each piece so every returned string starts
    with ``{'url'``; any text in front of the first marker is discarded.

    Args:
        filename: Path (``str`` or path-like) of a gzip-compressed tar archive.

    Returns:
        A list of record strings collected from all readable members, in
        archive order. The list is empty when the archive has no readable
        members or the marker never occurs.

    Raises:
        UnicodeDecodeError: If a member's first line is not valid UTF-8.
        Errors raised by ``tarfile.open`` (missing file, not a gzip'd tar, ...)
        propagate unchanged.
    """
    marker = "{'url'"
    records = []
    with tarfile.open(filename, "r:gz") as tar:
        for member in tar.getmembers():
            extracted = tar.extractfile(member)
            # extractfile() yields None for members that carry no data
            # (directories, devices, ...); there is nothing to parse there.
            if extracted is None:
                continue
            with extracted:
                # NOTE(review): only the first line of each member is read,
                # as in the original code -- presumably every member is one
                # long line of concatenated records; confirm against the data.
                first_line = extracted.readline().decode("utf-8")
            # split() leaves the text before the first marker at index 0;
            # drop it and restore the marker that split() consumed.
            pieces = first_line.split(marker)[1:]
            records.extend(marker + piece for piece in pieces)
    return records
|
|
if __name__ == "__main__":
    # Script entry point: for every archive in ../HEAD, keep the records whose
    # 'language_score' is strictly above the threshold and write them as JSON
    # Lines to ../HEAD_CLEAN/<archive name without .tar.gz>.jsonl.
    min_language_score = 0.98
    source_dir = Path('../HEAD')
    output_dir = Path('../HEAD_CLEAN')
    # to_json() does not create missing directories; make sure the target exists.
    output_dir.mkdir(parents=True, exist_ok=True)

    for tar_gz in source_dir.iterdir():
        filename = tar_gz.name.split('.tar.gz')[0]
        print(f"Now extracting {filename}")
        text = tar_file_to_string(tar_gz)

        filtered = []
        for item in track(text):
            try:
                # Parse once and reuse the result for both the test and the append.
                record = literal_eval(item)
                keep = record['language_score'] > min_language_score
            except (ValueError, SyntaxError, TypeError, KeyError,
                    MemoryError, RecursionError):
                # Best effort: skip fragments that are not a well-formed record
                # (unparsable text, missing or non-numeric score). Unlike a
                # bare ``except``, this lets Ctrl-C and SystemExit through.
                continue
            if keep:
                filtered.append(record)

        frame = pd.DataFrame(filtered)
        frame.to_json(output_dir / f'{filename}.jsonl', orient='records', lines=True)
|
|
|
|