from pathlib import Path
import importlib
import os
import subprocess
import sys


def _load_snapshot_download():
    """Return ``huggingface_hub.snapshot_download``, installing the package if missing.

    Raises:
        ImportError: if the package is missing and the pip install fails, or
            if it still cannot be imported after installation.
    """
    try:
        from huggingface_hub import snapshot_download
    except ImportError:
        print("huggingface_hub not installed. Installing...")
        try:
            # Use this interpreter's own pip (not whatever "pip" is on PATH)
            # and pass an argument list so no shell is involved.
            subprocess.check_call(
                [sys.executable, "-m", "pip", "install", "huggingface_hub"]
            )
        except (OSError, subprocess.CalledProcessError) as err:
            raise ImportError(
                "huggingface_hub is required and automatic installation failed"
            ) from err
        # Packages installed after interpreter start may be hidden by the
        # import system's cached directory listings.
        importlib.invalidate_caches()
        from huggingface_hub import snapshot_download
    return snapshot_download


def download_data() -> None:
    """Download the dataset repository from HuggingFace into ``data/``.

    The repository id is read from the ``HF_DATA_REPO`` environment variable,
    falling back to a built-in default. If ``data/`` already exists and
    contains at least one entry, nothing is downloaded (and huggingface_hub
    is neither imported nor installed).

    Raises:
        ImportError: if huggingface_hub is unavailable and cannot be installed.
        Exception: any error raised by ``snapshot_download`` is re-raised
            after troubleshooting tips are printed.
    """
    # Configuration - CHANGE THIS to your HuggingFace repo
    HF_REPO_ID = os.getenv("HF_DATA_REPO", "hungnha/do_an_tot_nghiep")

    data_path = Path("data")

    # Check for existing data first so we never install a package needlessly.
    if data_path.exists() and any(data_path.iterdir()):
        print("Data folder already exists. Skipping download.")
        print("To re-download, delete the 'data/' folder first.")
        return

    snapshot_download = _load_snapshot_download()

    print(f"Downloading data from HuggingFace: {HF_REPO_ID}")
    print("This may take a few minutes...")

    try:
        snapshot_download(
            repo_id=HF_REPO_ID,
            repo_type="dataset",
            local_dir=str(data_path),
            # Download actual files, not symlinks.
            # NOTE(review): newer huggingface_hub releases appear to deprecate
            # this argument - confirm against the installed version.
            local_dir_use_symlinks=False,
        )
    except Exception as e:
        print(f"Error downloading data: {e}")
        print("\nTips:")
        print(" 1. Make sure the HF_DATA_REPO environment variable is set correctly")
        print(" 2. Or update HF_REPO_ID in this script")
        print(" 3. If repo is private, run: huggingface-cli login")
        raise
    else:
        print("Download complete!")
        print(f"Data saved to: {data_path.absolute()}")


if __name__ == "__main__":
    download_data()