plant-msyn / path_config.py
Yoshigold's picture
Update webapp with Scripts files for HF Spaces deployment
f342936 verified
#!/usr/bin/env python3
"""
Centralized path configuration for Plant-mSyn (Multi-genome Synteny).
Hugging Face Spaces Edition.
This module provides configurable paths via environment variables with sensible
fallbacks for Hugging Face Spaces deployment. Use this module for all path
references to ensure cloud deployment compatibility.
Environment Variables (all optional - defaults are computed relative to this file's location):
PLANTMSYN_PROJECT_DIR - Root project directory
PLANTMSYN_DATA_DIR - Data directory (for HF: where dataset files are)
PLANTMSYN_GENOMES_DIR - Genomes directory
PLANTMSYN_ANNOTATIONS_DIR - Gene annotation TSV files
PLANTMSYN_MCSCAN_DIR - MCscan results
PLANTMSYN_LOG_DIR - Log files directory
PLANTMSYN_SQL_DIR - SQL catalogs directory
Derived paths (from PLANTMSYN_MCSCAN_DIR):
- bed_files_dir
- i1_blocks_dir
- last_filtered_dir
- pep_files_dir
- lifted_anchors_dir
- custom_meta_dir
- custom_synteny_meta_dir
Usage:
from path_config import (
PROJECT_DIR, GENOMES_DIR, ANNOTATIONS_DIR,
MCSCAN_RESULTS_DIR, LOG_DIR, get_all_paths
)
# Or use the function for compatibility:
paths = get_all_paths()
bed_dir = paths['bed_files_dir']
"""
import os
from pathlib import Path
from typing import Dict
# Optional .env support: only active when python-dotenv can be imported.
try:
    from dotenv import load_dotenv

    # The .env file is searched for one level above the directory that
    # contains this module (the project root when this file is in Scripts/).
    _script_dir = Path(__file__).resolve().parent
    _project_root = _script_dir.parent
    _env_file = _project_root.joinpath(".env")
    if _env_file.exists():
        load_dotenv(_env_file)
except ImportError:
    # python-dotenv is not installed; only the process environment is used.
    pass
# =============================================================================
# DEFAULT PATH COMPUTATION (Hugging Face Spaces Adapted)
# =============================================================================
def _get_default_project_dir() -> Path:
"""Get default project directory.
On HuggingFace Spaces, scripts are in /app/ directly (not Scripts/ subfolder).
So the project dir is the same as the script dir.
In local dev, scripts are in Scripts/ so parent is project dir.
"""
script_dir = Path(__file__).resolve().parent
# Check if we're in HF Spaces (scripts in /app/ directly)
# by checking if 'data' folder exists at same level
if (script_dir / "data").is_dir() or script_dir.name == "app":
return script_dir
# Local development: Scripts/ folder, parent is project dir
return script_dir.parent
def _get_path_from_env(env_var: str, default: Path) -> Path:
"""Get path from environment variable or use default."""
env_value = os.environ.get(env_var)
if env_value:
return Path(env_value).resolve()
return default
# =============================================================================
# CORE PATH CONFIGURATION (Hugging Face Spaces)
# =============================================================================
# Fallback project root, computed from where this module is located.
_DEFAULT_PROJECT_DIR = _get_default_project_dir()

# Primary locations. Each one uses its environment variable when that
# variable is set to a non-empty value, and the listed default otherwise.
PROJECT_DIR: Path = _get_path_from_env("PLANTMSYN_PROJECT_DIR", _DEFAULT_PROJECT_DIR)

# Data lives in a separate 'data' folder under the project root on HF Spaces.
DATA_DIR: Path = _get_path_from_env("PLANTMSYN_DATA_DIR", PROJECT_DIR.joinpath("data"))

# Genomes (may be unused on HF, where preprocessed data is shipped instead).
GENOMES_DIR: Path = _get_path_from_env("PLANTMSYN_GENOMES_DIR", DATA_DIR.joinpath("genomes"))

# Gene annotations, kept under the data directory.
ANNOTATIONS_DIR: Path = _get_path_from_env(
    "PLANTMSYN_ANNOTATIONS_DIR", DATA_DIR.joinpath("annotations")
)

# MCscan results, kept under the data directory (no protein_pairwise subfolder).
MCSCAN_RESULTS_DIR: Path = _get_path_from_env(
    "PLANTMSYN_MCSCAN_DIR", DATA_DIR.joinpath("mcscan_results")
)

# Log files, kept under the project root rather than the data directory.
LOG_DIR: Path = _get_path_from_env("PLANTMSYN_LOG_DIR", PROJECT_DIR.joinpath("logs"))
# =============================================================================
# DERIVED PATHS (from MCSCAN_RESULTS_DIR)
# =============================================================================
# Fixed sub-folders of the MCscan results directory. They are not individually
# configurable; they follow MCSCAN_RESULTS_DIR.
BED_FILES_DIR: Path = MCSCAN_RESULTS_DIR.joinpath("bed_files")
I1_BLOCKS_DIR: Path = MCSCAN_RESULTS_DIR.joinpath("i1_blocks")
LAST_FILTERED_DIR: Path = MCSCAN_RESULTS_DIR.joinpath("last_filtered")
PEP_FILES_DIR: Path = MCSCAN_RESULTS_DIR.joinpath("pep_files")
LIFTED_ANCHORS_DIR: Path = MCSCAN_RESULTS_DIR.joinpath("lifted_anchors")
CUSTOM_META_DIR: Path = MCSCAN_RESULTS_DIR.joinpath("custom_meta")
CUSTOM_SYNTENY_META_DIR: Path = MCSCAN_RESULTS_DIR.joinpath("custom_synteny_meta")
# SQL metadata location: configurable via PLANTMSYN_SQL_DIR, defaulting to a
# 'sql' folder under the data directory.
SQL_DIR: Path = _get_path_from_env("PLANTMSYN_SQL_DIR", DATA_DIR.joinpath("sql"))

# Fixed entries inside the SQL directory.
SEARCH_CATALOGS_DIR: Path = SQL_DIR.joinpath("search_catalogs")
METADATA_DB_PATH: Path = SQL_DIR.joinpath("plantmsyn_metadata.db")

# Directory containing this module (for reference).
SCRIPT_DIR: Path = Path(__file__).resolve().parent
# =============================================================================
# HELPER FUNCTIONS
# =============================================================================
def get_all_paths() -> Dict[str, Path]:
    """Return every configured path in a single dictionary.

    This function provides backward compatibility with get_project_paths()
    and allows accessing all paths from a single dict. Keys are the
    lower-case names of the module-level constants (e.g. ``'bed_files_dir'``
    for ``BED_FILES_DIR``).

    ``'data_dir'`` is included so that every primary, environment-configurable
    location is available from the dict, not just the paths derived from it.

    Returns:
        Dict mapping path names to the module-level ``Path`` values.
    """
    return {
        # Primary locations
        'script_dir': SCRIPT_DIR,
        'project_dir': PROJECT_DIR,
        'data_dir': DATA_DIR,
        'genomes_dir': GENOMES_DIR,
        'annotations_dir': ANNOTATIONS_DIR,
        'mcscan_results_dir': MCSCAN_RESULTS_DIR,
        'log_dir': LOG_DIR,
        # Sub-folders of the MCscan results directory
        'bed_files_dir': BED_FILES_DIR,
        'i1_blocks_dir': I1_BLOCKS_DIR,
        'last_filtered_dir': LAST_FILTERED_DIR,
        'pep_files_dir': PEP_FILES_DIR,
        'lifted_anchors_dir': LIFTED_ANCHORS_DIR,
        'custom_meta_dir': CUSTOM_META_DIR,
        'custom_synteny_meta_dir': CUSTOM_SYNTENY_META_DIR,
        # SQL metadata
        'sql_dir': SQL_DIR,
        'search_catalogs_dir': SEARCH_CATALOGS_DIR,
        'metadata_db_path': METADATA_DB_PATH,
    }
def print_config() -> None:
    """Log the current path configuration (useful for debugging).

    Writes a header followed by one line per entry of ``get_all_paths()``,
    each showing whether the path currently exists on disk ("✓" or "✗"),
    the path's name, and its value. Output goes to the
    "synteny.path_config" logger at INFO level.
    """
    # Imported here rather than at module level, so importing path_config
    # does not itself import the project's logger module.
    from logger import get_logger

    config_logger = get_logger("synteny.path_config")
    config_logger.info("Plant-mSyn Path Configuration")
    config_logger.info("=" * 50)
    for name, path in get_all_paths().items():
        exists = "✓" if path.exists() else "✗"
        # An earlier version also built an environment-variable label for each
        # entry but never emitted it; that unused computation was removed.
        config_logger.info(f" [{exists}] {name}: {path}")
# When executed as a script, log the resolved path configuration.
if __name__ == "__main__":
    print_config()