"""Convert the arXiv metadata snapshot into a compact JSON Lines file.

Each input record is reduced to two fields:

* ``bibkey`` -- the record's ``id`` prefixed with ``arxivid``.
* ``text``   -- the record's ``title``, ``abstract`` and ``authors`` joined
  into one newline-separated string.

Records are written as they are read, so memory use does not grow with the
size of the input file.
"""

# Dataset download:
#   curl -L -o ~/Downloads/arxiv.zip \
#     https://www.kaggle.com/api/v1/datasets/download/Cornell-University/arxiv
import jsonlines

input_path = './data/arxiv-metadata-oai-snapshot.json'
output_path = './data/arxiv.jsonl'

# The reader is opened first so that a missing or unreadable input raises
# before the output file is created or truncated.
with jsonlines.open(input_path, 'r') as reader:
    with jsonlines.open(output_path, 'w') as writer:
        # Stream record by record instead of collecting everything in a list:
        # the output is identical, but only one record is held at a time.
        # NOTE: if reading fails part-way through, the output file will
        # contain the records converted up to that point.
        for item in reader:
            writer.write({
                'bibkey': f"arxivid{item['id']}",
                'text': (
                    f"Title: {item['title']}\n"
                    f"Abstract: {item['abstract']}\n"
                    f"Authors: {item['authors']}"
                ),
            })