Spaces:
Sleeping
Sleeping
| import os | |
| import subprocess | |
| import zipfile | |
| import glob | |
| import pandas as pd | |
def download_and_extract_dataset(dataset="Cornell-University/arxiv", dest_dir="arxiv_dataset"):
    """Download a Kaggle dataset archive into ``dest_dir`` and unpack it there.

    Runs ``kaggle datasets download -d <dataset> -p <dest_dir>`` and then
    extracts the downloaded zip archive into the same directory. Download
    problems are reported on stdout instead of being raised, so the function
    always returns ``None``.

    Args:
        dataset: Kaggle dataset identifier handed to the CLI's ``-d`` option,
            normally in ``owner/slug`` form.
        dest_dir: Directory that receives the archive and its extracted
            contents. It is created when it does not exist yet.

    Raises:
        zipfile.BadZipFile: If the downloaded archive is not a valid zip file.
    """
    os.makedirs(dest_dir, exist_ok=True)

    print("Downloading dataset...")
    try:
        subprocess.run(
            ["kaggle", "datasets", "download", "-d", dataset, "-p", dest_dir],
            check=True,
        )
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        # CalledProcessError: the CLI ran but exited with a non-zero status.
        # FileNotFoundError: the ``kaggle`` executable is not on PATH.
        print("Downloading error: ", e)
        return
    print("Download finished")

    # The archive is expected to be named after the dataset slug (the part
    # after the owner), e.g. "Cornell-University/arxiv" -> "arxiv.zip".
    # Deriving it keeps non-default datasets working instead of always
    # looking for "arxiv.zip".
    slug = dataset.rstrip("/").rsplit("/", 1)[-1]
    zip_filename = os.path.join(dest_dir, slug + ".zip")
    if not os.path.exists(zip_filename):
        print("Zip-file is not found")
        return

    print("Unpacking dataset...")
    with zipfile.ZipFile(zip_filename, "r") as zip_ref:
        zip_ref.extractall(dest_dir)
    print("Unpacking finished")
def filter_csv(dest_dir="arxiv_dataset"):
    """Write a reduced copy of the dataset CSV found in ``dest_dir``.

    Picks the first ``*.csv`` file in ``dest_dir`` (in sorted order, ignoring
    this function's own output), keeps only the ``title``, ``authors``,
    ``abstract`` and ``categories`` columns, and saves the result as
    ``filtered_arxiv.csv`` in the same directory without the index column.

    NOTE(review): the default Kaggle arXiv dataset may ship a JSON snapshot
    rather than a CSV file -- confirm that a CSV is actually present after
    extraction.

    Args:
        dest_dir: Directory searched for the source CSV and used for output.

    Raises:
        FileNotFoundError: If ``dest_dir`` holds no source CSV file.
        KeyError: If the source CSV lacks one of the required columns.
    """
    output_name = "filtered_arxiv.csv"

    # Skip our own output so a re-run never filters the already-filtered
    # file; sort so the choice does not depend on glob's arbitrary ordering.
    candidates = sorted(
        path
        for path in glob.glob(os.path.join(dest_dir, "*.csv"))
        if os.path.basename(path) != output_name
    )
    if not candidates:
        raise FileNotFoundError(
            "No source CSV file found in directory: {!r}".format(dest_dir)
        )

    df = pd.read_csv(candidates[0])
    df = df[["title", "authors", "abstract", "categories"]]
    df.to_csv(os.path.join(dest_dir, output_name), index=False)
if __name__ == "__main__":
    # Script entry point: run the pipeline stages in order with their
    # default arguments -- first fetch and unpack the dataset, then write
    # the reduced CSV. An exception in one stage stops the later ones.
    for stage in (download_and_extract_dataset, filter_csv):
        stage()