Spaces:

crazycrazypete
/

pots-shutdown-tracker

Running

File size: 11,144 Bytes

611bfd9

from __future__ import annotations

import sys
from types import UnionType
from pathlib import Path
from typing import Any, Union, get_args, get_origin

# Resolve the parent of this script's directory and put it at the front of
# sys.path (once), before the project import that follows.
# NOTE(review): presumably this is what makes `pots_shutdown_tracker`
# importable when the script is run directly -- confirm the package really
# lives under this directory.
BACKEND_ROOT = Path(__file__).resolve().parents[1]
if str(BACKEND_ROOT) not in sys.path:
    sys.path.insert(0, str(BACKEND_ROOT))

from pots_shutdown_tracker.config import Settings


# Environment variable names that `_render_sensitivity` labels "secret";
# every other variable is labelled "non-secret".
SECRET_ENV_VARS = {
    "POTS_TRACKER_ADMIN_API_KEY",
    "POTS_TRACKER_DB_URL",
    "POTS_TRACKER_FCC_ECFS_API_KEY",
    "POTS_TRACKER_HF_STORAGE_TOKEN",
}

# Hand-written descriptions keyed by `Settings` field name (not env var name).
# `_render_description` returns these verbatim and only falls back to its
# suffix-based templates for fields that are not listed here.
FIELD_DESCRIPTION_OVERRIDES = {
    "active_window_post_target_grace_days": "Days a notice can remain active after its target date.",
    "admin_api_key": "Shared secret required to authorize admin endpoints.",
    "admin_api_key_header": "Header name expected on admin requests.",
    "api_prefix": "Base URL prefix mounted in FastAPI.",
    "area_risk_airport_promotes_to_direct": (
        "When true, `<city> Airport` notices promote to a direct match for searches on `<city>`. "
        "Set false to surface them in the nearby-municipality section instead."
    ),
    "app_name": "Human-readable application name.",
    "auto_create_schema": "Create the database schema automatically at startup.",
    "bulk_lookup_concurrent_workers": "Maximum number of background bulk lookup jobs processed concurrently.",
    "bulk_lookup_file_size_mb": "Maximum uploaded bulk lookup workbook size in megabytes.",
    "bulk_lookup_max_rows": "Maximum data rows accepted in a bulk lookup workbook.",
    "bulk_lookup_retention_days": "Days to retain bulk lookup input and output blobs before cleanup.",
    "cors_allow_origins": "Comma-separated list of allowed browser origins.",
    "db_max_overflow": "Extra SQLAlchemy pool connections allowed above the base size.",
    "db_pool_pre_ping": "Ping pooled database connections before use.",
    "db_pool_recycle_seconds": "Lifetime of pooled database connections before recycle.",
    "db_pool_size": "Base SQLAlchemy database pool size.",
    "db_pool_timeout_seconds": "Seconds to wait for a pooled database connection.",
    "db_url": "Database connection string.",
    "enable_ai": "Enable AI-backed parsing, summarization, and search helpers.",
    "enable_weekly_jobs": "Enable the weekly APScheduler job set.",
    "fcc_ecfs_api_key": "Optional ECFS API key reserved for FCC watch discovery.",
    "fcc_ecfs_base_url": "Base URL for the FCC ECFS public API.",
    "fcc_watch_lookback_months": "Historical lookback window for FCC watch backfill phases.",
    "fcc_watch_proceedings": "Comma-separated ECFS proceeding numbers for targeted FCC watch scans.",
    "fetch_max_content_length_mb": "Maximum fetched body size in megabytes.",
    "fetch_read_timeout_seconds": "Read timeout for fetched responses.",
    "fetch_timeout_seconds": "Overall fetch timeout in seconds.",
    "frontend_dist_path": "Path to the built frontend bundle served by the app.",
    "hf_storage_path_prefix": "Path prefix used for stored blobs in the dataset repo.",
    "hf_storage_require_private": "Refuse to use a public Hugging Face dataset repo.",
    "hf_storage_repo_id": "Hugging Face dataset repo that stores crawler blobs.",
    "hf_storage_revision": "Revision used for Hugging Face dataset uploads and downloads.",
    "hf_storage_token": "Write token for the Hugging Face dataset repo.",
    "lookback_months": "Active corpus lookback window in months.",
    "log_level": "Application log level.",
    "openai_api_key": "OpenAI API key used by AI features.",
    "query_embedding_cache_enabled": "Enable caching for query embeddings.",
    "query_embedding_cache_size": "Maximum number of cached query embeddings.",
    "request_timing_enabled": "Emit request timing logs.",
    "run_migrations_on_startup": "Run Alembic migrations during app startup.",
    "scheduler_enabled_instances": "Number of instances allowed to run scheduler jobs.",
    "search_candidate_limit": "Maximum number of candidate notices considered during search.",
    "search_refinement_limit": "Maximum number of notices retained after search refinement.",
    "search_prewarm_enabled": "Enable search prewarm at startup.",
    "search_prewarm_queries": "Comma-separated search queries used for prewarming.",
    "search_result_cache_enabled": "Enable caching for search responses.",
    "search_result_cache_size": "Maximum number of cached search responses.",
    "search_result_cache_ttl_seconds": "TTL for cached search responses in seconds.",
    "search_trace_enabled": "Emit detailed search trace logs.",
    "search_vector_candidate_limit": "Maximum number of vector candidates considered during search.",
    "slow_request_threshold_ms": "Threshold for logging slow requests in milliseconds.",
    "source_coverage_matrix_file": "Path to the source coverage matrix JSON file.",
    "startup_db_wait_seconds": "Maximum time to wait for the database during startup.",
    "storage_backend": "Select the storage backend (`filesystem` or `huggingface_dataset`).",
    "storage_path": "Local storage and cache root.",
    "serve_frontend": "Serve the frontend bundle from FastAPI.",
    "ops_presets_file": "Path to the curated ops preset list.",
    "timezone": "Application timezone.",
    "trust_matrix_empty_threshold": "Maximum empty-matrix ratio before queryability is disabled.",
    "trust_stale_days": "Days after which the corpus is considered stale.",
    "user_agent": "User-Agent string used for outbound HTTP requests.",
    "weekly_schedule": "Cron expression for the weekly job schedule.",
}

# Scope label per environment variable name. `_render_scope` reports "both"
# for any variable that is not listed here.
SPECIAL_SCOPE_OVERRIDES = {
    "POTS_TRACKER_ADMIN_API_KEY": "hosted",
    "POTS_TRACKER_CORS_ALLOW_ORIGINS": "hosted",
    "POTS_TRACKER_FCC_ECFS_API_KEY": "hosted",
    "POTS_TRACKER_FCC_ECFS_BASE_URL": "hosted",
    "POTS_TRACKER_FCC_WATCH_LOOKBACK_MONTHS": "hosted",
    "POTS_TRACKER_FCC_WATCH_PROCEEDINGS": "hosted",
    "POTS_TRACKER_FRONTEND_DIST_PATH": "hosted",
    "POTS_TRACKER_HF_STORAGE_PATH_PREFIX": "hosted",
    "POTS_TRACKER_HF_STORAGE_REPO_ID": "hosted",
    "POTS_TRACKER_HF_STORAGE_REQUIRE_PRIVATE": "hosted",
    "POTS_TRACKER_HF_STORAGE_REVISION": "hosted",
    "POTS_TRACKER_HF_STORAGE_TOKEN": "hosted",
    "POTS_TRACKER_RUN_MIGRATIONS_ON_STARTUP": "hosted",
    "POTS_TRACKER_SERVE_FRONTEND": "hosted",
}

# Fixed display forms for individual underscore-separated name tokens.
# `_humanize_token` consults this table first, so entries here bypass its
# generic capitalisation (e.g. "urls" -> "URLs", "ops" stays lowercase).
TOKEN_DISPLAY = {
    "ai": "AI",
    "api": "API",
    "att": "AT&T",
    "clli": "CLLI",
    "db": "DB",
    "fcc": "FCC",
    "hf": "HF",
    "id": "ID",
    "ip": "IP",
    "json": "JSON",
    "lbs": "lbs",
    "ops": "ops",
    "p90": "p90",
    "png": "PNG",
    "sql": "SQL",
    "ttl": "TTL",
    "ui": "UI",
    "url": "URL",
    "urls": "URLs",
}


def _humanize_token(token: str) -> str:
    """Return the display form of a single name token.

    Tokens listed in ``TOKEN_DISPLAY`` use their fixed spelling. All-digit
    and all-uppercase tokens are returned unchanged. Anything else has its
    hyphens turned into spaces and is capitalised (first character upper,
    the rest lower).
    """
    fixed_form = TOKEN_DISPLAY.get(token)
    if fixed_form is not None:
        return fixed_form
    if token.isdigit() or token.isupper():
        return token
    spaced = token.replace("-", " ")
    return spaced.capitalize()


def _humanize_name(name: str) -> str:
    """Turn a snake_case name into space-separated display words.

    The name is split on underscores, empty pieces (from leading, trailing
    or doubled underscores) are dropped, and each remaining piece is passed
    through ``_humanize_token``.
    """
    pieces = [piece for piece in name.split("_") if piece]
    return " ".join(map(_humanize_token, pieces))


def _render_description(field_name: str, env_name: str) -> str:
    """Return the description text for a settings field.

    Lookup order:
      1. an explicit entry in ``FIELD_DESCRIPTION_OVERRIDES``;
      2. a template selected by the field name's suffix, filled in with the
         humanized remainder of the name;
      3. a generic ``"<Humanized name> setting."`` fallback.

    Args:
        field_name: Attribute name of the field on ``Settings``.
        env_name: Environment variable name. Part of the signature but not
            used by the current implementation.

    Returns:
        A one-sentence description.
    """
    explicit = FIELD_DESCRIPTION_OVERRIDES.get(field_name)
    if explicit is not None:
        return explicit

    # Order matters: a more specific suffix must be tried before any shorter
    # suffix it also ends with (e.g. "_index_urls" before "_urls").
    suffix_templates = (
        ("_index_urls", "Comma-separated index URLs for {carrier}."),
        ("_tracker_urls", "Comma-separated tracker URLs for {carrier}."),
        ("_seed_documents_file", "Path to the seed document URL file for {carrier}."),
        ("_document_urls_file", "Path to extra document URLs for {carrier}."),
        ("_document_urls", "Comma-separated explicit document URLs for {carrier}."),
        ("_urls_file", "Path to an additional URL file for {carrier}."),
        ("_urls", "Comma-separated URLs for {carrier}."),
    )
    for suffix, template in suffix_templates:
        if field_name.endswith(suffix):
            stem = field_name.removesuffix(suffix)
            return template.format(carrier=_humanize_name(stem))

    return f"{_humanize_name(field_name)} setting."


def _render_scope(env_name: str) -> str:
    """Return the scope label for *env_name*.

    Variables listed in ``SPECIAL_SCOPE_OVERRIDES`` use the label stored
    there; all others are reported as ``"both"``.
    """
    if env_name in SPECIAL_SCOPE_OVERRIDES:
        return SPECIAL_SCOPE_OVERRIDES[env_name]
    return "both"


def _render_sensitivity(env_name: str) -> str:
    """Return ``"secret"`` if *env_name* is in ``SECRET_ENV_VARS``, else ``"non-secret"``."""
    if env_name in SECRET_ENV_VARS:
        return "secret"
    return "non-secret"


def _render_type(annotation: Any) -> str:
    if annotation is Any:
        return "Any"
    if annotation is type(None):
        return "None"
    origin = get_origin(annotation)
    if origin in {Union, UnionType}:
        return " | ".join(_render_type(arg) for arg in get_args(annotation))
    if isinstance(annotation, type):
        return annotation.__name__
    text = str(annotation).replace("typing.", "").replace("pathlib.", "")
    return text.replace("<class '", "").replace("'>", "")


def _render_default(value: Any) -> str:
    if value is None:
        return "`unset`"
    if isinstance(value, bool):
        return f"`{str(value).lower()}`"
    if isinstance(value, Path):
        text = value.as_posix()
        if not value.is_absolute() and not text.startswith("./"):
            text = f"./{text}"
        return f"`{text}`"
    if isinstance(value, str):
        return '`""`' if value == "" else f"`{value}`"
    return f"`{value}`"


def iter_reference_rows() -> list[dict[str, str]]:
    """Build one reference row per documented ``Settings`` field.

    Each field's environment variable name is its alias, or the field name
    when no alias is set. Fields whose environment variable name does not
    start with ``POTS_TRACKER_`` are left out.

    Returns:
        A list of dicts with the keys ``name``, ``type``, ``default``,
        ``description``, ``scope`` and ``sensitivity``, in the iteration
        order of ``Settings.model_fields``.
    """
    named_fields = (
        (attr_name, info, info.alias or attr_name)
        for attr_name, info in Settings.model_fields.items()
    )
    return [
        {
            "name": env_var,
            "type": _render_type(info.annotation),
            "default": _render_default(info.default),
            "description": _render_description(attr_name, env_var),
            "scope": _render_scope(env_var),
            "sensitivity": _render_sensitivity(env_var),
        }
        for attr_name, info, env_var in named_fields
        if env_var.startswith("POTS_TRACKER_")
    ]


def _escape_table_cell(text: str) -> str:
    """Escape pipe characters so *text* stays inside one Markdown table cell."""
    return text.replace("|", "\\|")


def render_env_reference() -> str:
    """Render the environment reference as a Markdown document.

    The document is a title, a short "generated from" note, and one table
    with a row per entry returned by ``iter_reference_rows``.

    Returns:
        The full Markdown text, ending with a single trailing newline.
    """
    rows = iter_reference_rows()
    lines = [
        "# Environment Reference",
        "",
        "Generated from `backend/app/pots_shutdown_tracker/config.py`. Regenerate this file with `python backend/scripts/dump_env_reference.py > docs/ENV.md` after changing `Settings`.",
        "",
        "| Name | Type | Default | Description | Scope | Sensitivity |",
        "| --- | --- | --- | --- | --- | --- |",
    ]
    for row in rows:
        # Union types are rendered as "X | None"; an unescaped pipe would be
        # read as a column separator and shift every later cell in the row.
        cells = {key: _escape_table_cell(value) for key, value in row.items()}
        lines.append(
            f"| `{cells['name']}` | {cells['type']} | {cells['default']} | {cells['description']} | {cells['scope']} | {cells['sensitivity']} |"
        )
    # The empty last element makes the joined text end with a newline.
    lines.append("")
    return "\n".join(lines)


def main() -> None:
    """Write the rendered environment reference to standard output."""
    document = render_env_reference()
    sys.stdout.write(document)


# Script entry point: emit the reference only when executed directly, not
# when this module is imported.
if __name__ == "__main__":
    main()