# XHS/service/config.py
# Trae Bot
# Upload Spider_XHS project
# c481f8a
from __future__ import annotations
import os
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class ServiceConfig:
    """Immutable bundle of settings for the service.

    The dataclass is frozen, so fields cannot be reassigned after
    construction. Within this module, ``load_config`` builds an instance
    from environment variables; the comments below name the variables and
    defaults it uses.
    """

    # From SERVICE_CONCURRENCY (default 4).
    concurrency: int
    # From SERVICE_PROXY; None when unset or empty.
    proxy: str | None
    # From CALLBACK_URL, falling back to SERVICE_CALLBACK_URL; None when
    # neither is set.
    callback_url: str | None
    # From STORAGE_ROOT / SERVICE_STORAGE_ROOT / SERVICE_STORAGE_DIR
    # (first non-empty wins); defaults to "storage" under the base dir.
    storage_dir: Path
    # From ORCHESTRATOR_DB_PATH; defaults to "orchestrator/data/mvp.db"
    # under the base dir.
    orchestrator_db_path: Path
    # From ENGINE_STRATEGY (default "auto"). The set of accepted values is
    # not defined in this module.
    engine_strategy: str
    # From ENGINE_FALLBACK_THRESHOLD (default 3).
    engine_fallback_threshold: int
    # From RAW_DATA_RETENTION_DAYS (default 7).
    raw_data_retention_days: int
    # Collected from MEDIACRAWLER_STORAGE_STATE_PATHS,
    # MEDIACRAWLER_STORAGE_STATE_PATH, SERVICE_STORAGE_STATE_PATHS and
    # SERVICE_STORAGE_STATE_PATH (comma-separated values allowed).
    mediacrawler_storage_state_paths: tuple[Path, ...]
    # From ENABLE_LEGACY_ROUTES (default False).
    enable_legacy_routes: bool
    # From ERROR_SUMMARY_SCAN_LIMIT (default 1000).
    error_summary_scan_limit: int
def _env_int(name: str, default: int) -> int:
    """Read environment variable *name* as an integer.

    Args:
        name: Environment variable to look up.
        default: Value returned when the variable is unset, empty, or
            contains only whitespace.

    Returns:
        The parsed integer, or *default* when no value is provided.

    Raises:
        ValueError: If the variable holds text that is not a valid integer.
            The message names the offending variable so that a bad
            deployment setting can be found quickly.
    """
    # Treat blank values (common in .env files) the same as "unset" rather
    # than letting int("   ") fail.
    raw = (os.getenv(name) or "").strip()
    if not raw:
        return default
    try:
        return int(raw)
    except ValueError:
        raise ValueError(
            f"environment variable {name} must be an integer, got {raw!r}"
        ) from None
def _env_str(name: str) -> str | None:
    """Read environment variable *name* as an optional string.

    Args:
        name: Environment variable to look up.

    Returns:
        The value with surrounding whitespace removed, or None when the
        variable is unset, empty, or contains only whitespace.
    """
    raw = os.getenv(name)
    if raw is None:
        return None
    # A blank value is treated as "not configured", matching how the other
    # env helpers in this module strip their input.
    return raw.strip() or None
def _env_bool(name: str, default: bool = False) -> bool:
    """Read environment variable *name* as a boolean flag.

    Matching is case-insensitive and ignores surrounding whitespace.
    "1", "true", "yes", "y" and "on" mean True; "0", "false", "no", "n"
    and "off" mean False.

    Args:
        name: Environment variable to look up.
        default: Result used when the variable is unset, empty, or holds
            text that is not one of the recognised words.

    Returns:
        The interpreted flag, or ``bool(default)`` as the fallback.
    """
    fallback = bool(default)
    raw = os.getenv(name)
    if not raw:
        return fallback

    token = str(raw).strip().lower()
    truthy = frozenset(("1", "true", "yes", "y", "on"))
    falsy = frozenset(("0", "false", "no", "n", "off"))
    if token in truthy:
        return True
    return False if token in falsy else fallback
def _env_path_list(*names: str) -> tuple[Path, ...]:
    """Collect filesystem paths from one or more environment variables.

    Each variable may hold a single path or several separated by commas.
    Entries are stripped of surrounding whitespace, blank entries are
    dropped, and ``~`` is expanded. Results keep the order of *names* and,
    within one variable, the order of its entries.

    Args:
        *names: Environment variables to read, in priority order.

    Returns:
        A tuple of paths; empty when no variable supplies a usable entry.
    """
    configured = (os.getenv(name) for name in names)
    # Splitting on "," also covers the single-path case: a value without a
    # comma yields exactly one piece.
    entries = (
        piece.strip()
        for raw in configured
        if raw
        for piece in raw.split(",")
    )
    return tuple(Path(entry).expanduser() for entry in entries if entry)
def _resolve_path(*, base_dir: Path, value: str | None, default_rel: str) -> Path:
    """Turn an optional path setting into a resolved path.

    Args:
        base_dir: Directory that relative paths are anchored to.
        value: Configured path text; None, empty, or whitespace-only means
            "use the default".
        default_rel: Path used when *value* provides nothing.

    Returns:
        The resolved path. ``~`` is expanded first; an absolute path is
        resolved as-is, while a relative one is joined onto *base_dir*
        before resolving.
    """
    text = str(value or "").strip() or default_rel
    candidate = Path(text).expanduser()
    anchored = candidate if candidate.is_absolute() else base_dir / candidate
    return anchored.resolve()
def load_config() -> ServiceConfig:
    """Build the service configuration from environment variables.

    Relative paths are anchored to the directory one level above the
    directory containing this file. Variables read:

    * ``SERVICE_CONCURRENCY`` (int, default 4)
    * ``SERVICE_PROXY`` (optional string)
    * ``CALLBACK_URL``, then ``SERVICE_CALLBACK_URL`` (optional string)
    * ``STORAGE_ROOT``, then ``SERVICE_STORAGE_ROOT``, then
      ``SERVICE_STORAGE_DIR`` (path, default ``storage``)
    * ``ORCHESTRATOR_DB_PATH`` (path, default ``orchestrator/data/mvp.db``)
    * ``ENGINE_STRATEGY`` (string, default ``auto``)
    * ``ENGINE_FALLBACK_THRESHOLD`` (int, default 3)
    * ``RAW_DATA_RETENTION_DAYS`` (int, default 7)
    * ``MEDIACRAWLER_STORAGE_STATE_PATHS``,
      ``MEDIACRAWLER_STORAGE_STATE_PATH``, ``SERVICE_STORAGE_STATE_PATHS``,
      ``SERVICE_STORAGE_STATE_PATH`` (comma-separated paths, all collected)
    * ``ENABLE_LEGACY_ROUTES`` (bool, default False)
    * ``ERROR_SUMMARY_SCAN_LIMIT`` (int, default 1000)

    Returns:
        A populated, frozen ``ServiceConfig``.

    Raises:
        ValueError: If an integer setting holds non-numeric text.
    """
    base_dir = Path(__file__).resolve().parents[1]

    # First non-empty variable wins; the shared helper applies the same
    # expanduser / anchor-to-base_dir / resolve rules as the DB path below.
    storage_root = (
        os.getenv("STORAGE_ROOT")
        or os.getenv("SERVICE_STORAGE_ROOT")
        or os.getenv("SERVICE_STORAGE_DIR")
    )
    storage_dir = _resolve_path(
        base_dir=base_dir,
        value=storage_root,
        default_rel="storage",
    )

    # Joining an absolute path onto base_dir yields the absolute path
    # itself, so this normalises absolute and relative entries alike.
    storage_state_paths = tuple(
        (base_dir / path).resolve()
        for path in _env_path_list(
            "MEDIACRAWLER_STORAGE_STATE_PATHS",
            "MEDIACRAWLER_STORAGE_STATE_PATH",
            "SERVICE_STORAGE_STATE_PATHS",
            "SERVICE_STORAGE_STATE_PATH",
        )
    )

    # An empty or blank ENGINE_STRATEGY counts as unset, like every other
    # setting here, instead of producing an empty strategy name.
    engine_strategy = (os.getenv("ENGINE_STRATEGY") or "").strip() or "auto"

    return ServiceConfig(
        concurrency=_env_int("SERVICE_CONCURRENCY", 4),
        proxy=_env_str("SERVICE_PROXY"),
        callback_url=_env_str("CALLBACK_URL") or _env_str("SERVICE_CALLBACK_URL"),
        storage_dir=storage_dir,
        orchestrator_db_path=_resolve_path(
            base_dir=base_dir,
            value=os.getenv("ORCHESTRATOR_DB_PATH"),
            default_rel="orchestrator/data/mvp.db",
        ),
        engine_strategy=engine_strategy,
        engine_fallback_threshold=_env_int("ENGINE_FALLBACK_THRESHOLD", 3),
        raw_data_retention_days=_env_int("RAW_DATA_RETENTION_DAYS", 7),
        mediacrawler_storage_state_paths=storage_state_paths,
        enable_legacy_routes=_env_bool("ENABLE_LEGACY_ROUTES", False),
        error_summary_scan_limit=_env_int("ERROR_SUMMARY_SCAN_LIMIT", 1000),
    )