|
|
from pathlib import Path |
|
|
import os |
|
|
|
|
|
def download_data():
    """Download the project dataset from the HuggingFace Hub into ``data/``.

    The dataset repository id is read from the ``HF_DATA_REPO`` environment
    variable and falls back to ``hungnha/do_an_tot_nghiep``. If ``data/``
    (relative to the current working directory) already exists and contains
    at least one entry, nothing is downloaded.

    ``huggingface_hub`` is imported lazily, and only when a download is
    actually needed. If it is missing, it is installed with pip for the
    interpreter that is running this script.

    Raises:
        ImportError: If ``huggingface_hub`` is missing and the automatic
            installation fails.
        Exception: Any error raised by ``snapshot_download`` is re-raised
            unchanged after troubleshooting tips have been printed.
    """
    HF_REPO_ID = os.getenv("HF_DATA_REPO", "hungnha/do_an_tot_nghiep")
    data_path = Path("data")

    # Check for existing data before touching the optional dependency, so a
    # skipped download never triggers a package installation. ``is_dir``
    # (rather than ``exists``) keeps ``iterdir`` from raising when ``data``
    # happens to be a regular file.
    if data_path.is_dir() and any(data_path.iterdir()):
        print("Data folder already exists. Skipping download.")
        print("To re-download, delete the 'data/' folder first.")
        return

    try:
        from huggingface_hub import snapshot_download
    except ImportError:
        import importlib
        import subprocess
        import sys

        print("huggingface_hub not installed. Installing...")
        try:
            # Run pip through the current interpreter so the package lands in
            # the environment this script is executing in, not whichever
            # ``pip`` is first on PATH.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "huggingface_hub"]
            )
        except (OSError, subprocess.CalledProcessError) as err:
            raise ImportError(
                "huggingface_hub is required and could not be installed "
                "automatically"
            ) from err
        # Make the import system notice the freshly installed package.
        importlib.invalidate_caches()
        from huggingface_hub import snapshot_download

    print(f"Downloading data from HuggingFace: {HF_REPO_ID}")
    print("This may take a few minutes...")

    try:
        snapshot_download(
            repo_id=HF_REPO_ID,
            repo_type="dataset",
            local_dir=str(data_path),
            # NOTE(review): this flag is reported as deprecated in recent
            # huggingface_hub releases; kept so behavior on older versions
            # is unchanged. Confirm against the pinned version.
            local_dir_use_symlinks=False,
        )
        print("Download complete!")
        print(f"Data saved to: {data_path.absolute()}")
    except Exception as e:
        # Print troubleshooting hints, then let the original error propagate.
        print(f"Error downloading data: {e}")
        print("\nTips:")
        print(" 1. Make sure the HF_DATA_REPO environment variable is set correctly")
        print(" 2. Or update HF_REPO_ID in this script")
        print(" 3. If repo is private, run: huggingface-cli login")
        raise
|
|
|
|
|
# Script entry point: fetch the dataset only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    download_data()
|
|
|