Spaces:
Runtime error
Runtime error
| import os | |
| import requests | |
| from humanfriendly import format_size | |
| import pandas as pd | |
| import glob | |
| from urllib.parse import unquote | |
| ## Reads parquet files in a folder into a pandas dataframe | |
| def read_parquet_files_as_df (parquet_dir): | |
| parquet_files = glob.glob(f'{parquet_dir}/*.parquet') | |
| # read each parquet file into a DataFrame and store in a list | |
| dfs = [pd.read_parquet (f) for f in parquet_files] | |
| # Concatenate all DataFrames into a single DataFrame | |
| data_df = pd.concat(dfs, ignore_index=True) | |
| return data_df | |
| def download_file(url, local_file, chunk_size=1024*1024): | |
| """ | |
| Downloads a remote URL to a local file. | |
| Args: | |
| url (str): The remote URL. | |
| local_filename (str): The name of the local file to save the downloaded content. | |
| chunk_size (int): The size in bytes of each chunk. Defaults to 1024. | |
| Returns: | |
| None | |
| Example usage: | |
| download_file('http://example.com/file.txt', 'file.txt', chunk_size=1024*1024) # Download in chunks of 1MB | |
| """ | |
| # Check if the local file already exists | |
| if os.path.exists(local_file): | |
| file_size = format_size(os.path.getsize(local_file)) | |
| print(f"Local file '{local_file}' ({file_size}) already exists. Skipping download.") | |
| return | |
| # Create the directory if it doesn't exist | |
| os.makedirs(os.path.dirname(local_file), exist_ok=True) | |
| # Stream the file download | |
| with requests.get(url, stream=True) as r: | |
| r.raise_for_status() | |
| with open(local_file, 'wb') as f: | |
| for chunk in r.iter_content(chunk_size=chunk_size): | |
| if chunk: # filter out keep-alive new chunks | |
| f.write(chunk) | |
| print() | |
| file_size = format_size(os.path.getsize(local_file)) | |
| print(f"{local_file} ({file_size}) downloaded successfully.") | |
| ## --- end: download_file ------ | |