Spaces:

Legal-i
/

orgstate

Running

App Files Files Community

Legal-i committed on 11 days ago

Commit

8f366e8

verified ·

1 Parent(s): 741565b

Stage 163: BigQuery REST query connector

Browse files

Files changed (3) hide show

infra/ingestion/bigquery_connector.py +388 -0
infra/ingestion/connectors.py +23 -1
requirements-runtime.txt +5 -0

infra/ingestion/bigquery_connector.py ADDED Viewed

	@@ -0,0 +1,388 @@

+"""
+infra.ingestion.bigquery_connector — pull observations from a BigQuery
+dataset via the REST query API + service-account JWT auth.
+Customer flow:
+1. In GCP → IAM & Admin → Service Accounts, create a service account
+   with role ``roles/bigquery.dataViewer`` (read-only) on the
+   dataset(s) you want OrgState to observe, plus
+   ``roles/bigquery.jobUser`` on the project (needed to start
+   queries).
+2. Generate a JSON key for the service account; the file contains
+   ``client_email`` + ``private_key`` + ``token_uri``.
+3. Store the entire JSON as one env var on this deployment
+   (recommended) OR paste it inline into connector_config. The
+   inline path is OK for a demo but lands the private key in the
+   schedule DB.
+4. Configure the SQL query + field mapping (see below).
+Configuration:
+    project_id           GCP project the queries run in (billed there).
+    sql                  Standard SQL with {entity_type} substituted
+                         at fetch time, e.g.
+                         "SELECT entity_id, day, mrr_cents FROM
+                          \\`acme.bi.{entity_type}_daily\\`
+                          WHERE day >= DATE_SUB(CURRENT_DATE(),
+                          INTERVAL 35 DAY)"
+    mapping              {"entity_id": "entity_id",
+                          "day":       "day",
+                          "values":    {"mrr": "mrr_cents",
+                                         "users": "active_users"}}
+    service_account_json (optional, inline) the raw JSON contents.
+    service_account_env  (optional, recommended) env-var name holding
+                         the JSON. ONE of *_json / *_env required.
+    api_timeout          HTTP timeout in seconds (default 30).
+    query_timeout_ms     BigQuery server-side timeout (default 60_000,
+                         max 600_000). Set higher if the customer's
+                         query touches a large partitioned table.
+    use_legacy_sql       false (default) — Standard SQL. Set true only
+                         for legacy datasets.
+    location             optional BigQuery location ("US" / "EU" /
+                         "asia-northeast1"). Required for some
+                         datasets — Google's error message points at
+                         it clearly when missing.
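+JWT signing uses PyJWT with the service account's RSA private key
+(RS256; PyJWT needs the ``cryptography`` package for RSA signing).
+The signed assertion is exchanged for an access token at the
+service account's ``token_uri`` (normally oauth2.googleapis.com)
+and the token is cached in memory until 60 s before it expires
+(lifetime is usually 1h). The cache is per-connector-instance —
+fetches that reuse one instance within that window share a token,
+while a freshly built connector (e.g. a once-per-day cron) always
+mints a new one.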
+"""
+from __future__ import annotations
+import json
+import os
+import time
+from typing import Any, List, Mapping, Optional
+from urllib.error import HTTPError, URLError
+from urllib.parse import urlencode
+from urllib.request import Request, urlopen
+from core.pipeline import Observation
+from .connectors import Connector
+from .http_connector import ConnectorFetchError
+# Google's OAuth 2.0 token endpoint; passed as the fallback wherever
+# the code reads ``token_uri`` from the service-account JSON.
+_OAUTH_TOKEN_URL = "https://oauth2.googleapis.com/token"
+# OAuth scope requested in the signed JWT assertion.
+_OAUTH_SCOPE = "https://www.googleapis.com/auth/bigquery.readonly"
+# Prefix of the REST endpoint; fetch() appends ``/{project_id}/queries``.
+_BQ_QUERY_BASE = "https://bigquery.googleapis.com/bigquery/v2/projects"
+class BigQueryConnector(Connector):
+    """Fetches observations from a BigQuery dataset via the REST API."""
+    type_name = "bigquery"
+    def __init__(
+        self, *,
+        project_id: str,
+        sql: str,
+        mapping: Mapping[str, Any],
+        service_account_json: Optional[str] = None,
+        service_account_env: Optional[str] = None,
+        api_timeout: float = 30.0,
+        query_timeout_ms: int = 60_000,
+        use_legacy_sql: bool = False,
+        location: Optional[str] = None,
+    ):
+        """Validate the configuration and store it on the instance.
+
+        Raises:
+            ValueError: if ``project_id`` or ``sql`` is empty, if neither
+                ``service_account_json`` nor ``service_account_env`` is
+                given, or if ``query_timeout_ms`` is outside
+                [1, 600000].
+        """
+        if not project_id:
+            raise ValueError("bigquery connector needs project_id")
+        if not sql:
+            raise ValueError("bigquery connector needs a sql query")
+        has_credentials = bool(service_account_json or service_account_env)
+        if not has_credentials:
+            raise ValueError(
+                "bigquery connector needs service_account_json OR "
+                "service_account_env (recommended). Set one of them "
+                "in connector_config."
+            )
+        if query_timeout_ms < 1 or query_timeout_ms > 600_000:
+            raise ValueError("query_timeout_ms must be in [1, 600000]")
+
+        # Query definition.
+        self.project_id = project_id
+        self.sql = sql
+        self.mapping = dict(mapping)  # shallow copy of the caller's mapping
+        self.use_legacy_sql = use_legacy_sql
+        self.location = location
+
+        # Credentials source (inline JSON and/or env-var name).
+        self.service_account_json = service_account_json
+        self.service_account_env = service_account_env
+
+        # Timeouts: client-side HTTP (seconds) and server-side (ms).
+        self.api_timeout = api_timeout
+        self.query_timeout_ms = query_timeout_ms
+
+        # Access-token cache, filled by _resolve_token(); a token is
+        # reused only while it has more than 60s left before expiry.
+        self._cached_token: Optional[str] = None
+        self._cached_until: float = 0.0
+    # ---- contract methods -----------------------------------------
+    def fetch(self, entity_type: str) -> List[Observation]:
+        """Run the configured query for ``entity_type`` and map its rows.
+
+        Loads the service account, obtains an access token, substitutes
+        ``{entity_type}`` into the SQL template and POSTs the query to
+        the project's ``queries`` endpoint. The JSON response is handed
+        to ``_rows_to_observations``.
+
+        Raises:
+            ConnectorFetchError: if the SQL template cannot be
+                formatted, BigQuery answers with an HTTP error status,
+                the request fails or times out, or the response body
+                is not JSON. Errors from loading the service account
+                or resolving the token propagate unchanged.
+        """
+        sa = self._load_service_account()
+        token = self._resolve_token(sa)
+        try:
+            sql_resolved = self.sql.format(entity_type=entity_type)
+        except KeyError as e:
+            raise ConnectorFetchError(
+                "bad_payload",
+                f"sql placeholder {{{e.args[0]}}} is not supported "
+                "(only {entity_type} is)",
+                cause=e,
+            ) from e
+        except (IndexError, ValueError, AttributeError) as e:
+            # str.format raises IndexError for positional fields
+            # ({} / {0}), ValueError for unbalanced braces (e.g. a JSON
+            # literal in the SQL) and AttributeError for
+            # {entity_type.attr} lookups.
+            raise ConnectorFetchError(
+                "bad_payload",
+                f"sql template could not be formatted ({e}); only "
+                "{entity_type} is substituted — write literal braces "
+                "as {{ and }}",
+                cause=e,
+            ) from e
+        body: dict = {
+            "query": sql_resolved,
+            "useLegacySql": self.use_legacy_sql,
+            "timeoutMs": self.query_timeout_ms,
+        }
+        if self.location:
+            body["location"] = self.location
+        url = f"{_BQ_QUERY_BASE}/{self.project_id}/queries"
+        req = Request(
+            url, data=json.dumps(body).encode(),
+            headers={
+                "Authorization": f"Bearer {token}",
+                "Content-Type": "application/json",
+                "Accept": "application/json",
+            }, method="POST",
+        )
+        # BigQuery may hold the request open for up to timeoutMs, so the
+        # HTTP read timeout must not be shorter than that (plus a small
+        # margin); otherwise slow-but-valid queries always fail
+        # client-side.
+        http_timeout = max(
+            float(self.api_timeout),
+            self.query_timeout_ms / 1000.0 + 5.0,
+        )
+        try:
+            with urlopen(req, timeout=http_timeout) as resp:
+                payload_bytes = resp.read()
+        except HTTPError as e:
+            code, msg = _classify_http_error(e)
+            raise ConnectorFetchError(
+                code,
+                f"BigQuery {e.code} {e.reason}: {msg}",
+                cause=e,
+            ) from e
+        except URLError as e:
+            raise ConnectorFetchError(
+                "network",
+                f"could not reach BigQuery: {e.reason}",
+                cause=e,
+            ) from e
+        except OSError as e:
+            # A timeout or reset while waiting for / reading the
+            # response is raised as a bare OSError (TimeoutError), not
+            # wrapped in URLError.
+            raise ConnectorFetchError(
+                "network",
+                f"connection to BigQuery failed or timed out: {e}",
+                cause=e,
+            ) from e
+        try:
+            payload = json.loads(payload_bytes)
+        except ValueError as e:
+            # ValueError covers JSONDecodeError and the
+            # UnicodeDecodeError raised for undecodable bytes.
+            raise ConnectorFetchError(
+                "bad_payload", "BigQuery returned non-JSON body", cause=e,
+            ) from e
+        return self._rows_to_observations(payload)
+    def entity_types(self) -> List[str]:
+        """Return an empty list; the connector advertises no entity types.
+
+        The caller chooses the entity type when scheduling (same shape
+        as the Salesforce connector, per the original note).
+        """
+        advertised: List[str] = []
+        return advertised
+    # ---- service account + token --------------------------------
+    def _load_service_account(self) -> dict:
+        """Return the service-account credentials as a dict.
+
+        The inline ``service_account_json`` wins; otherwise the env var
+        named by ``service_account_env`` is read. The inline value may
+        be the raw JSON text or an already-parsed mapping (as happens
+        when the key is pasted into connector_config as a nested
+        object).
+
+        Raises:
+            ConnectorFetchError: ``missing_secret`` when no credentials
+                are available; ``bad_payload`` when the value is not
+                parseable JSON, is not a JSON object, or lacks
+                ``client_email`` / ``private_key`` / ``token_uri``.
+        """
+        raw = self.service_account_json
+        if not raw and self.service_account_env:
+            raw = os.environ.get(self.service_account_env, "")
+        if not raw:
+            raise ConnectorFetchError(
+                "missing_secret",
+                f"env var {self.service_account_env!r} "
+                "(service_account_env) is unset or empty — paste the "
+                "service-account JSON contents (NOT a path) as the "
+                "env value",
+            )
+        if isinstance(raw, Mapping):
+            # Already parsed by the config layer; copy so the caller's
+            # object is never mutated.
+            sa = dict(raw)
+        else:
+            try:
+                sa = json.loads(raw)
+            except (TypeError, ValueError) as e:
+                # ValueError covers JSONDecodeError; TypeError covers a
+                # value that is neither text nor bytes.
+                raise ConnectorFetchError(
+                    "bad_payload",
+                    "service account JSON failed to parse — make sure the "
+                    "env var carries the full JSON contents, not a path",
+                    cause=e,
+                ) from e
+        if not isinstance(sa, dict):
+            # e.g. a JSON-quoted path or a list: valid JSON, wrong shape.
+            raise ConnectorFetchError(
+                "bad_payload",
+                "service account JSON must be a JSON object, got "
+                f"{type(sa).__name__}",
+            )
+        for field in ("client_email", "private_key", "token_uri"):
+            if not sa.get(field):
+                raise ConnectorFetchError(
+                    "bad_payload",
+                    f"service account JSON missing required field "
+                    f"{field!r}",
+                )
+        return sa
+    def _resolve_token(self, sa: dict) -> str:
+        """Return an OAuth access token for the service account ``sa``.
+
+        A cached token is reused while it has more than 60s left.
+        Otherwise a JWT bearer assertion is signed with the account's
+        private key (RS256) and exchanged at ``token_uri`` (standard
+        Google OAuth 2.0 service-account flow); the result is cached on
+        the instance.
+
+        Raises:
+            ConnectorFetchError: ``missing_dep`` if PyJWT is not
+                importable; ``bad_payload`` if signing fails or the
+                endpoint returns non-JSON; ``invalid_credentials`` if
+                the exchange is rejected or yields no token;
+                ``upstream`` for a 5xx from the token endpoint;
+                ``network`` if the endpoint is unreachable or times out.
+        """
+        now = time.time()
+        if self._cached_token and self._cached_until > now + 60:
+            return self._cached_token
+        try:
+            import jwt as _jwt
+        except ImportError as e:                            # pragma: no cover
+            raise ConnectorFetchError(
+                "missing_dep",
+                "BigQuery connector needs PyJWT (transitive via "
+                "authlib) — `pip install pyjwt`.",
+                cause=e,
+            ) from e
+        issued = int(now)
+        expires = issued + 3600
+        token_uri = sa.get("token_uri", _OAUTH_TOKEN_URL)
+        claims = {
+            "iss": sa["client_email"],
+            "scope": _OAUTH_SCOPE,
+            "aud": token_uri,
+            "iat": issued,
+            "exp": expires,
+        }
+        try:
+            assertion = _jwt.encode(claims, sa["private_key"],
+                                     algorithm="RS256")
+        except Exception as e:
+            raise ConnectorFetchError(
+                "bad_payload",
+                "JWT signing failed — service account private_key is "
+                "probably malformed (must be a PEM-encoded RSA key)",
+                cause=e,
+            ) from e
+        body = urlencode({
+            "grant_type": "urn:ietf:params:oauth:grant-type:jwt-bearer",
+            "assertion": assertion,
+        }).encode()
+        req = Request(
+            token_uri, data=body,
+            headers={
+                "Content-Type": "application/x-www-form-urlencoded",
+                "Accept": "application/json",
+            }, method="POST",
+        )
+        try:
+            with urlopen(req, timeout=self.api_timeout) as resp:
+                raw_body = resp.read()
+        except HTTPError as e:
+            try:
+                err_body = e.read().decode("utf-8", errors="replace")
+            except Exception:
+                err_body = ""
+            # A 5xx is the token endpoint's fault, not a credentials
+            # problem; report it the same way BigQuery 5xx is reported.
+            code = "upstream" if 500 <= e.code < 600 else "invalid_credentials"
+            raise ConnectorFetchError(
+                code,
+                f"OAuth token exchange failed (HTTP {e.code}): "
+                f"{err_body[:300]}",
+                cause=e,
+            ) from e
+        except URLError as e:                               # pragma: no cover
+            raise ConnectorFetchError(
+                "network",
+                f"could not reach Google OAuth: {e.reason}",
+                cause=e,
+            ) from e
+        except OSError as e:                                # pragma: no cover
+            # Read timeouts / resets surface as a bare OSError
+            # (TimeoutError), not as URLError.
+            raise ConnectorFetchError(
+                "network",
+                f"connection to Google OAuth failed or timed out: {e}",
+                cause=e,
+            ) from e
+        try:
+            tok_payload = json.loads(raw_body)
+        except ValueError as e:
+            raise ConnectorFetchError(
+                "bad_payload",
+                "OAuth token endpoint returned non-JSON body",
+                cause=e,
+            ) from e
+        access_token = (
+            tok_payload.get("access_token")
+            if isinstance(tok_payload, dict) else None
+        )
+        if not access_token:
+            raise ConnectorFetchError(
+                "invalid_credentials",
+                "OAuth response had no access_token",
+            )
+        try:
+            lifetime = int(tok_payload.get("expires_in", 3600))
+        except (TypeError, ValueError):
+            # Unusable lifetime: hand the token back but do not reuse it.
+            lifetime = 0
+        self._cached_token = access_token
+        self._cached_until = float(issued + lifetime)
+        return access_token
+    # ---- response parsing ---------------------------------------
+    def _rows_to_observations(self, payload: dict) -> List[Observation]:
+        """Convert a BigQuery ``queries`` response into observations.
+
+        Uses ``self.mapping`` to pick the entity-id column, the day
+        column and the metric columns out of each row. Rows without an
+        entity id, a day, or at least one numeric metric are skipped.
+        A TIMESTAMP day column (returned by the REST API as epoch
+        seconds) is converted to its UTC calendar date; other types
+        keep the first 10 characters of their text form.
+
+        Only the rows in ``payload`` are read; when the response
+        carries a ``pageToken`` a warning is written to stderr.
+
+        Raises:
+            ConnectorFetchError: ``bad_payload`` if the payload is not
+                a dict, the mapping is incomplete, or rows exist but
+                the entity-id/day columns are absent; ``upstream`` if
+                the job did not complete within the timeout.
+        """
+        from datetime import datetime, timezone
+
+        if not isinstance(payload, dict):
+            raise ConnectorFetchError(
+                "bad_payload",
+                f"BigQuery response is a {type(payload).__name__}, expected dict",
+            )
+        if not payload.get("jobComplete", True):
+            raise ConnectorFetchError(
+                "upstream",
+                "BigQuery query did not finish within timeoutMs — "
+                "raise query_timeout_ms or partition the query",
+            )
+        schema = (payload.get("schema") or {}).get("fields") or []
+        col_names = [f.get("name") for f in schema]
+        rows = payload.get("rows") or []
+        eid_field = self.mapping.get("entity_id")
+        day_field = self.mapping.get("day")
+        values_map = self.mapping.get("values") or {}
+        if not eid_field or not day_field or not isinstance(values_map, dict):
+            raise ConnectorFetchError(
+                "bad_payload",
+                "mapping must define entity_id, day, and values (dict). "
+                "Got: " + json.dumps(self.mapping, default=str),
+            )
+        # Build a {field_name: row_index} for fast lookup.
+        idx = {name: i for i, name in enumerate(col_names)}
+        # Empty result set is fine — the query genuinely returned
+        # nothing, no point yelling about a missing column. Only
+        # raise the column-missing error when rows exist but the
+        # mapping can't be applied.
+        if rows and (eid_field not in idx or day_field not in idx):
+            raise ConnectorFetchError(
+                "bad_payload",
+                f"query result is missing columns: required "
+                f"{eid_field!r} + {day_field!r}; got {col_names}",
+            )
+        # The REST API encodes TIMESTAMP cells as epoch seconds (e.g.
+        # "1.7E9"), so slicing their text would not yield a date.
+        day_is_timestamp = False
+        if day_field in idx:
+            day_type = schema[idx[day_field]].get("type")
+            day_is_timestamp = str(day_type or "").upper() == "TIMESTAMP"
+
+        def _cell(cells: list, field: str) -> Any:
+            # Value of ``field`` in one row's cell list, or None when
+            # the column is unknown or the row is short.
+            i = idx.get(field)
+            if i is None or i >= len(cells):
+                return None
+            return (cells[i] or {}).get("v")
+
+        out: List[Observation] = []
+        for row in rows:
+            if not isinstance(row, dict):
+                continue
+            cells = row.get("f") or []
+            eid = _cell(cells, eid_field)
+            day = _cell(cells, day_field)
+            if not eid or not day:
+                continue
+            obs_values = {}
+            for metric, bq_field in values_map.items():
+                raw = _cell(cells, bq_field)
+                if raw is None:
+                    continue
+                try:
+                    obs_values[metric] = float(raw)
+                except (TypeError, ValueError):
+                    continue
+            if not obs_values:
+                continue
+            day_text = str(day)
+            if day_is_timestamp:
+                try:
+                    day_text = datetime.fromtimestamp(
+                        float(day), tz=timezone.utc,
+                    ).date().isoformat()
+                except (TypeError, ValueError, OverflowError, OSError):
+                    # Not epoch-encoded after all; keep the text form.
+                    day_text = str(day)
+            out.append(Observation(
+                entity_id=str(eid),
+                day=day_text[:10],
+                values=obs_values,
+            ))
+        if payload.get("pageToken"):
+            import sys as _sys
+            print(
+                f"warning: BigQuery query returned a pageToken; "
+                f"v1 reads only the first page ({len(rows)} rows). "
+                "Tighten the WHERE clause or wait for v2 pagination.",
+                file=_sys.stderr,
+            )
+        return out
+def _classify_http_error(e: HTTPError) -> tuple:
+    """Translate a BigQuery HTTP error into a stable ``(code, hint)`` pair.
+
+    The hint embeds at most the first 300 characters of the response
+    body; an unreadable body is treated as empty. Statuses 400, 401,
+    403 and 404 get dedicated codes, any 5xx maps to ``upstream`` and
+    everything else to ``http_error``.
+    """
+    try:
+        text = e.read().decode("utf-8", errors="replace")
+    except Exception:
+        text = ""
+    snippet = text[:300] if text else ""
+
+    # Exact-status dispatch table: status -> (code, hint).
+    known = {
+        401: ("invalid_credentials",
+              f"BigQuery rejected the access token — verify the "
+              f"service account is enabled and has bigquery.jobUser "
+              f"+ bigquery.dataViewer roles. Body: {snippet}"),
+        400: ("bad_query",
+              f"BigQuery rejected the SQL or job config: {snippet}"),
+        403: ("forbidden",
+              f"Service account is missing IAM scope (need "
+              f"bigquery.jobs.create + bigquery.tables.getData on "
+              f"the dataset): {snippet}"),
+        404: ("not_found",
+              f"BigQuery resource not found — wrong project_id or "
+              f"dataset? Body: {snippet}"),
+    }
+    status = e.code
+    if status in known:
+        return known[status]
+    if 500 <= status < 600:
+        return ("upstream",
+                f"BigQuery returned 5xx ({snippet})")
+    return ("http_error", snippet)

infra/ingestion/connectors.py CHANGED Viewed

@@ -188,11 +188,33 @@ def build_connector(connector_type: str, config: dict) -> Connector:
             timeout=float(config.get("timeout", 30.0)),
             base_url=config.get("base_url", "https://api.stripe.com"),
         )
     raise ValueError(
         f"unknown / non-schedulable connector type {connector_type!r} "
         f"(schedulable: ['{CSVFolderConnector.type_name}', "
         f"'{HttpJsonConnector.type_name}', '{SFTPConnector.type_name}', "
         f"'{SQLConnector.type_name}', "
         f"'{SalesforceConnector.type_name}', "
-        f"'{StripeConnector.type_name}'])"
     )

             timeout=float(config.get("timeout", 30.0)),
             base_url=config.get("base_url", "https://api.stripe.com"),
         )
+    # Stage 163 — BigQuery REST query connector.
+    from .bigquery_connector import BigQueryConnector
+    if connector_type == BigQueryConnector.type_name:
+        required = ("project_id", "sql", "mapping")
+        missing = [f for f in required if not config.get(f)]
+        if missing:
+            raise ValueError(
+                "bigquery connector config missing required field(s): "
+                f"{', '.join(missing)}"
+            )
+        return BigQueryConnector(
+            project_id=config["project_id"],
+            sql=config["sql"],
+            mapping=config["mapping"],
+            service_account_json=config.get("service_account_json"),
+            service_account_env=config.get("service_account_env"),
+            api_timeout=float(config.get("api_timeout", 30.0)),
+            query_timeout_ms=int(config.get("query_timeout_ms", 60_000)),
+            use_legacy_sql=bool(config.get("use_legacy_sql", False)),
+            location=config.get("location"),
+        )
     raise ValueError(
         f"unknown / non-schedulable connector type {connector_type!r} "
         f"(schedulable: ['{CSVFolderConnector.type_name}', "
         f"'{HttpJsonConnector.type_name}', '{SFTPConnector.type_name}', "
         f"'{SQLConnector.type_name}', "
         f"'{SalesforceConnector.type_name}', "
+        f"'{StripeConnector.type_name}', "
+        f"'{BigQueryConnector.type_name}'])"
     )

requirements-runtime.txt CHANGED Viewed

@@ -38,6 +38,11 @@ authlib>=1.3,<2.0
 # pdf_unavailable. ~3MB wheel, no native deps, embeds fonts so
 # Hebrew/Latin mixed reports render correctly.
 fpdf2>=2.7
 # Stage 156 — python-bidi runs the Unicode BiDi algorithm so we can
 # render Hebrew RTL text through fpdf2 (which is LTR-only). Tiny pure-
 # Python lib (~30KB); lazy-imported inside delivery/reports/pdf.py so

 # pdf_unavailable. ~3MB wheel, no native deps, embeds fonts so
 # Hebrew/Latin mixed reports render correctly.
 fpdf2>=2.7
+# Stage 163 — PyJWT signs the service-account JWTs for the BigQuery
+# connector (Google's OAuth 2.0 jwt-bearer flow). We import it
+# directly, so it is pinned explicitly rather than relying on another
+# package to pull it in. RS256 signing also needs `cryptography`,
+# which authlib already requires.
+PyJWT>=2.10
 # Stage 156 — python-bidi runs the Unicode BiDi algorithm so we can
 # render Hebrew RTL text through fpdf2 (which is LTR-only). Tiny pure-
 # Python lib (~30KB); lazy-imported inside delivery/reports/pdf.py so