File size: 1,993 Bytes
48b5cb1
68c5852
48b5cb1
 
 
 
 
 
 
 
 
 
 
 
6a0a429
48b5cb1
 
 
 
 
 
68c5852
48b5cb1
 
6a0a429
48b5cb1
 
 
 
 
 
 
 
6a0a429
48b5cb1
 
6a0a429
 
 
48b5cb1
 
 
 
 
 
 
 
 
 
 
 
 
68c5852
48b5cb1
 
 
 
 
6a0a429
48b5cb1
 
 
 
 
 
 
6a0a429
48b5cb1
6a0a429
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import logging
import os
from pathlib import Path

logger = logging.getLogger(__name__)


def load_data_file() -> str:
    """Return a path to the optimized data workbook.

    If the ``HF_TOKEN`` environment variable is set, ``optimized_data.xlsx``
    is requested from the ``muhalwan/optimized_data_mhs`` dataset repo via
    ``hf_hub_download`` (with ``cache_dir="./hf_cache"``). If the token is
    unset, or the download raises for any reason, the function falls back
    to the relative local path ``data/optimized_data.xlsx``.

    Returns:
        The value returned by ``hf_hub_download``, or the local path string.

    Raises:
        FileNotFoundError: If no download succeeded and the local file does
            not exist. When a download was attempted and failed, that
            failure is attached as the exception's ``__cause__`` so the
            underlying reason is visible in the traceback.
    """
    hf_token = os.getenv("HF_TOKEN")
    # Remembered so a later FileNotFoundError can report why the download failed.
    download_error = None

    if hf_token:
        try:
            # Imported inside the try so a missing huggingface_hub package is
            # handled like any other download failure (fall back to local).
            from huggingface_hub import hf_hub_download

            logger.info("Dataset: muhalwan/optimized_data_mhs")

            file_path = hf_hub_download(
                repo_id="muhalwan/optimized_data_mhs",
                filename="optimized_data.xlsx",
                repo_type="dataset",
                token=hf_token,
                cache_dir="./hf_cache",
            )

            logger.info("Data loaded successfully from HF dataset")
            return file_path

        except Exception as e:
            # Deliberately broad: any failure here is non-fatal as long as
            # the local copy can be used instead.
            logger.error("Failed to download from HF dataset: %s", e)
            download_error = e

    local_path = "data/optimized_data.xlsx"

    if Path(local_path).exists():
        logger.info("Loading data from local file: %s", local_path)
        return local_path

    # `from None` (no download attempted) leaves __cause__ unset.
    raise FileNotFoundError(
        "No data source available. Either set HF_TOKEN environment variable "
        "or place data file at 'data/optimized_data.xlsx'"
    ) from download_error


def get_data_source_info() -> dict:
    """Describe which data sources are currently available.

    Checks whether the ``HF_TOKEN`` environment variable is set to a
    non-empty value and whether ``data/optimized_data.xlsx`` exists relative
    to the current working directory. Nothing is downloaded or opened.

    Returns:
        A dict with these keys, in this order:

        * ``hf_token_available`` / ``will_use_hf_dataset`` -- True when the
          token is set (download success is not checked).
        * ``local_file_available`` -- True when the local file exists.
        * ``will_use_local`` -- True only when there is no token and the
          local file exists.
        * ``dataset_repo`` -- the dataset repo id when the token is set,
          otherwise None.
        * ``local_path`` -- the local path string when the file exists,
          otherwise None.
    """
    has_token = bool(os.getenv("HF_TOKEN"))
    has_local = Path("data/optimized_data.xlsx").exists()

    if has_token:
        repo = "muhalwan/optimized_data_mhs"
    else:
        repo = None

    if has_local:
        path = "data/optimized_data.xlsx"
    else:
        path = None

    summary = {}
    summary["hf_token_available"] = has_token
    summary["local_file_available"] = has_local
    summary["will_use_hf_dataset"] = has_token
    # The local file is only the first choice when no token is configured.
    summary["will_use_local"] = has_local and not has_token
    summary["dataset_repo"] = repo
    summary["local_path"] = path
    return summary


if __name__ == "__main__":
    # Manual check when run as a script: print the source summary, then try
    # to resolve the data file and report the outcome.
    logging.basicConfig(level=logging.INFO)
    print("Data Information")

    for field, setting in get_data_source_info().items():
        print(f"  {field}: {setting}")

    try:
        resolved = load_data_file()
        print(f"\nSuccess! Data file: {resolved}")
    except Exception as exc:
        # Report the failure instead of letting the traceback escape.
        print(f"\nFailed: {exc}")