Spaces:

nice-bill
/

deepshelf-api

Running

App Files Files Community

deepshelf-api / scripts /download_data.py

nice-bill

initial commit

cdb73a8 3 months ago

raw

history blame contribute delete

2.85 kB

	import logging
	import os
	import sys
	from pathlib import Path

	from huggingface_hub import snapshot_download

	# Root logging setup: timestamp, level and message on every record.
	_LOG_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
	logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Paths are spelled out here instead of being imported from src, because src
	# is not yet copied into the image when this script runs in the Docker build.
	PROCESSED_DATA_DIR = Path("data/processed")
	PROCESSED_DATA_PATH = PROCESSED_DATA_DIR.joinpath("books_with_embeddings.parquet")
	EMBEDDINGS_PATH = PROCESSED_DATA_DIR.joinpath("embeddings.npy")
	CLUSTERS_CACHE_PATH = PROCESSED_DATA_DIR.joinpath("clusters_cache.pkl")

	def download_processed_data(repo_id: str) -> None:
	    """
	    Download processed data files from a Hugging Face dataset repository.

	    Files matching ``*.parquet``, ``*.npy``, ``*.pkl`` and ``*.json`` are
	    fetched into ``PROCESSED_DATA_DIR``. Afterwards the three core files
	    (parquet, embeddings, clusters cache) are checked for existence; a
	    missing file is logged but does not abort the process.

	    Args:
	        repo_id (str): The Hugging Face dataset ID (e.g., 'username/dataset-name').

	    Raises:
	        SystemExit: With status 1 when the target directory cannot be
	            created or the download itself fails.
	    """
	    # Treat a set-but-empty HF_TOKEN (e.g. an unset Docker build arg) the same
	    # as a missing one, so an empty string is never forwarded as a token.
	    hf_token = os.getenv("HF_TOKEN") or None
	    if not hf_token:
	        logger.warning("HF_TOKEN environment variable not found. If the dataset is private, download will fail.")

	    logger.info("Starting download from Hugging Face Dataset: %s", repo_id)
	    logger.info("Target directory: %s", PROCESSED_DATA_DIR)

	    # Download only specific files to avoid clutter
	    allow_patterns = ["*.parquet", "*.npy", "*.pkl", "*.json"]

	    # Only the steps that can legitimately fail (filesystem + network) live
	    # inside the try; verification below runs outside the catch-all handler.
	    try:
	        PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)
	        snapshot_download(
	            repo_id=repo_id,
	            repo_type="dataset",
	            local_dir=PROCESSED_DATA_DIR,
	            # Important for Docker/Deployment.
	            # NOTE(review): newer huggingface_hub releases appear to deprecate
	            # this argument - confirm against the pinned version before removing.
	            local_dir_use_symlinks=False,
	            allow_patterns=allow_patterns,
	            token=hf_token,
	        )
	    except Exception as e:
	        # Top-level boundary of a build script: log with traceback, then fail the build.
	        logger.exception("Failed to download data from Hugging Face: %s", e)
	        sys.exit(1)

	    logger.info("Successfully downloaded all data files.")

	    # Verify that the core files the application needs are now on disk.
	    expected_files = (PROCESSED_DATA_PATH, EMBEDDINGS_PATH, CLUSTERS_CACHE_PATH)
	    missing = [f.name for f in expected_files if not f.exists()]
	    if missing:
	        # Reported at ERROR level but intentionally non-fatal, as before.
	        logger.error("The following expected files are still missing after download: %s", missing)
	    else:
	        logger.info("Verification successful: All core data files are present.")

	if __name__ == "__main__":
	    # Default repo ID - WILL BE OVERRIDDEN by environment variable in production
	    DEFAULT_REPO_ID = "nice-bill/book-recommender-data"

	    # Fall back to the default when HF_DATASET_ID is unset, empty or only
	    # whitespace: os.getenv's own default applies only when the variable is
	    # absent, so an empty value would otherwise reach the download as "".
	    repo_id = os.getenv("HF_DATASET_ID", "").strip() or DEFAULT_REPO_ID

	    # Guard against a template placeholder being left in the environment.
	    if repo_id == "PLACEHOLDER_USERNAME/PLACEHOLDER_DATASET":
	        logger.error("Please set the HF_DATASET_ID environment variable or update the script.")
	        sys.exit(1)

	    download_processed_data(repo_id)