# classquota / data_loader.py
# muhalwan's picture
# Revised version
# 6a0a429
import logging
import os
from pathlib import Path
logger = logging.getLogger(__name__)
def load_data_file() -> str:
    """Return a filesystem path to the ``optimized_data.xlsx`` data file.

    Two sources are tried in order:

    1. If the ``HF_TOKEN`` environment variable is set to a non-empty
       value, the file is fetched with ``huggingface_hub.hf_hub_download``
       from the dataset repo ``muhalwan/optimized_data_mhs``, using
       ``./hf_cache`` as the cache directory.
    2. Otherwise, or when step 1 fails for any reason, the local file
       ``data/optimized_data.xlsx`` (relative to the current working
       directory) is used if it exists.

    Returns:
        The path returned by ``hf_hub_download``, or the relative local
        path ``"data/optimized_data.xlsx"``.

    Raises:
        FileNotFoundError: If neither source yields a file.  When a
            download was attempted, its error is appended to the message
            and attached as ``__cause__`` so the real failure is not lost.
    """
    repo_id = "muhalwan/optimized_data_mhs"
    local_path = "data/optimized_data.xlsx"
    hf_token = os.getenv("HF_TOKEN")
    download_error = None

    if hf_token:
        try:
            # Imported inside the try block: a missing huggingface_hub
            # package is handled like any other download failure.
            from huggingface_hub import hf_hub_download

            logger.info("Dataset: muhalwan/optimized_data_mhs")
            file_path = hf_hub_download(
                repo_id=repo_id,
                filename="optimized_data.xlsx",
                repo_type="dataset",
                token=hf_token,
                cache_dir="./hf_cache",
            )
            logger.info("Data loaded successfully from HF dataset")
            return file_path
        except Exception as e:
            # Deliberately broad: any failure falls through to the local
            # file.  Keep the error so it can be reported if that fails too
            # (the name ``e`` is unbound once this handler exits).
            logger.error("Failed to download from HF dataset: %s", e)
            download_error = e

    if Path(local_path).exists():
        logger.info("Loading data from local file: %s", local_path)
        return local_path

    message = (
        "No data source available. Either set HF_TOKEN environment variable "
        "or place data file at 'data/optimized_data.xlsx'"
    )
    if download_error is not None:
        # HF_TOKEN was set, so say why the download did not work instead of
        # only telling the user to set the token.
        message += f" (HF dataset download failed: {download_error})"
    raise FileNotFoundError(message) from download_error
def get_data_source_info() -> dict:
    """Describe which data sources are available right now.

    Reads the ``HF_TOKEN`` environment variable and checks whether
    ``data/optimized_data.xlsx`` exists relative to the current working
    directory.  Nothing is downloaded or opened.

    Returns:
        A dict with these keys, in this order:

        * ``hf_token_available`` -- ``True`` when ``HF_TOKEN`` is non-empty.
        * ``local_file_available`` -- ``True`` when the local file exists.
        * ``will_use_hf_dataset`` -- same value as ``hf_token_available``.
        * ``will_use_local`` -- ``True`` only when there is no token and
          the local file exists.
        * ``dataset_repo`` -- the repo id when a token is present, else
          ``None``.
        * ``local_path`` -- the local path when the file exists, else
          ``None``.
    """
    token_present = bool(os.getenv("HF_TOKEN"))
    local_file = "data/optimized_data.xlsx"
    has_local = Path(local_file).exists()

    summary = {
        "hf_token_available": token_present,
        "local_file_available": has_local,
        "will_use_hf_dataset": token_present,
        "will_use_local": has_local and not token_present,
        "dataset_repo": None,
        "local_path": None,
    }
    # Fill in the descriptive entries only for sources that are present.
    if token_present:
        summary["dataset_repo"] = "muhalwan/optimized_data_mhs"
    if has_local:
        summary["local_path"] = local_file
    return summary
if __name__ == "__main__":
    # Manual smoke test: print the detected data sources, then try to
    # resolve the data file and report the outcome.
    logging.basicConfig(level=logging.INFO)
    print("Data Information")
    for name, state in get_data_source_info().items():
        print(f" {name}: {state}")
    try:
        print(f"\nSuccess! Data file: {load_data_file()}")
    except Exception as exc:
        # Any failure is reported on stdout rather than as a traceback.
        print(f"\nFailed: {exc}")