| import os |
| import requests |
| import tarfile |
| from tqdm import tqdm |
|
|
# Remote location of the MOCHEG (with tweets, 2023-03) dataset archive.
DATA_URL: str = "http://nlplab1.cs.vt.edu/~menglong/project/multimodal/fact_checking/MOCHEG/dataset/latest_dataset/mocheg_with_tweet_2023_03.tar.gz"

# Local directory that receives both the archive and its extracted contents.
RAW_DATA_DIR: str = "data/raw"

# File name under which the downloaded archive is stored.
ARCHIVE_NAME: str = "mocheg_with_tweet_2023_03.tar.gz"

# Number of bytes requested per streamed chunk (16 MiB).
CHUNK_SIZE: int = 16 * 1024**2

# Import-time setup: the target directory is created as soon as the module
# loads, so the functions below can rely on it existing.
os.makedirs(RAW_DATA_DIR, exist_ok=True)

# Full path of the archive inside RAW_DATA_DIR.
archive_path: str = os.path.join(RAW_DATA_DIR, ARCHIVE_NAME)
|
|
|
|
def check_disk_space(required_space_gb: int) -> bool:
    """Check whether the volume holding RAW_DATA_DIR has enough free space.

    Args:
        required_space_gb: Amount of free space needed, in GiB (1024**3 bytes).

    Returns:
        True if the free space available to the current user is strictly
        greater than ``required_space_gb``; False otherwise.
    """
    # Imported locally so this function is self-contained; shutil.disk_usage
    # works on both POSIX and Windows, whereas os.statvfs is Unix-only.
    import shutil

    free_space_gb: float = shutil.disk_usage(RAW_DATA_DIR).free / (1024**3)
    return free_space_gb > required_space_gb
|
|
|
|
def download_data() -> None:
    """Download the dataset archive if it is not already present, then extract it.

    Nothing is done when ``archive_path`` already exists (neither download nor
    extraction), or when ``check_disk_space`` reports less than the required
    free space. Otherwise the archive is streamed from ``DATA_URL`` in
    ``CHUNK_SIZE`` pieces and handed to ``extract_data``.

    Raises:
        requests.RequestException: If the connection fails, times out, or the
            server answers with an error status.
        OSError: If the archive cannot be written or renamed.
    """
    if os.path.exists(archive_path):
        print(f"Data already downloaded at {archive_path}. Skipping download.")
        return

    required_space_gb: int = 80
    if not check_disk_space(required_space_gb):
        print(f"Not enough disk space. At least {required_space_gb} GB required.")
        return

    print(f"Downloading data from {DATA_URL}...")

    # Stream into a temporary ".part" file and rename it only on success.
    # Otherwise an interrupted transfer would leave a truncated file at
    # archive_path, and the existence check above would treat it as a
    # finished download on the next run.
    partial_path: str = archive_path + ".part"
    try:
        # (connect, read) timeouts keep a stalled server from hanging forever;
        # the context manager releases the connection when done.
        with requests.get(DATA_URL, stream=True, timeout=(10, 60)) as response:
            response.raise_for_status()

            # Without a Content-Length header the size is unknown; passing
            # None lets tqdm show a plain byte counter instead of a 0-byte bar.
            content_length = response.headers.get("content-length")
            total_size = int(content_length) if content_length else None

            with open(partial_path, "wb") as file, tqdm(
                desc=ARCHIVE_NAME,
                total=total_size,
                unit="B",
                unit_scale=True,
                unit_divisor=1024,
            ) as progress_bar:
                for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                    if chunk:  # skip keep-alive chunks
                        file.write(chunk)
                        progress_bar.update(len(chunk))
    except BaseException:
        # Also covers Ctrl-C: drop the incomplete file, then re-raise unchanged.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

    # Atomic on the same filesystem: the archive appears only when complete.
    os.replace(partial_path, archive_path)
    print(f"Download completed: {archive_path}")

    extract_data(archive_path)
|
|
|
|
def extract_data(archive_path: str) -> None:
    """Extract a gzip-compressed tar archive into RAW_DATA_DIR.

    Args:
        archive_path: Path of the ``.tar.gz`` file to unpack.

    Raises:
        tarfile.TarError: If the archive is unreadable, or (when extraction
            filters are available) contains a member the "data" filter
            rejects, such as a path that escapes the destination.
        OSError: If the archive cannot be opened or files cannot be written.
    """
    print(f"Extracting data from {archive_path}...")
    with tarfile.open(archive_path, "r:gz") as tar:
        if hasattr(tarfile, "data_filter"):
            # The archive is fetched over the network, so treat it as
            # untrusted: the "data" filter blocks absolute paths, ".."
            # traversal, links pointing outside the destination and
            # device files.
            tar.extractall(path=RAW_DATA_DIR, filter="data")
        else:
            # NOTE(review): this interpreter predates extraction filters, so
            # members are extracted unchecked. Upgrade Python or validate
            # member paths before trusting archives from other sources.
            tar.extractall(path=RAW_DATA_DIR)
    print(f"Data extracted to {RAW_DATA_DIR}")
|
|
|
|
# Script entry point: run the download only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    download_data()
|
|