niwayandm commited on
Commit
a16378e
·
1 Parent(s): 4f74457

Initial files commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ fonts/HanyiSentyPagoda[[:space:]]Regular.ttf filter=lfs diff=lfs merge=lfs -text
37
+ fonts/HanyiSentyPagoda_Regular.ttf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
Dockerfile ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM node:24-alpine

# -------------------------
# Base + build args
# -------------------------
USER root
ARG N8N_PATH=/usr/local/lib/node_modules/n8n
ARG BASE_PATH=/root/.n8n
ARG DATABASE_PATH=$BASE_PATH/database
ARG CONFIG_PATH=$BASE_PATH/config
ARG WORKFLOWS_PATH=$BASE_PATH/workflows
ARG LOGS_PATH=$BASE_PATH/logs

# Optional args passed at build time (no secrets here)
# NOTE(review): `ARG X=$X` self-references expand to an empty default (the
# variable is not defined at that point), so these behave identically to a
# bare `ARG X` declaration.
# NOTE(review): N8N_ENCRYPTION_KEY and DB_POSTGRESDB_PASSWORD are declared as
# build args despite the "no secrets" comment — build args are recorded in
# image history; confirm they are intended to be supplied at build time.
ARG N8N_ENFORCE_SETTINGS_FILE_PERMISSIONS=$N8N_ENFORCE_SETTINGS_FILE_PERMISSIONS
ARG N8N_HOST=$N8N_HOST
ARG N8N_PORT=$N8N_PORT
ARG N8N_PROTOCOL=https
ARG N8N_EDITOR_BASE_URL=$N8N_EDITOR_BASE_URL
ARG WEBHOOK_URL=$WEBHOOK_URL
ARG GENERIC_TIMEZONE=$GENERIC_TIMEZONE
ARG TZ=$TZ
ARG N8N_ENCRYPTION_KEY=$N8N_ENCRYPTION_KEY
ARG DB_TYPE=$DB_TYPE
ARG DB_POSTGRESDB_SCHEMA=$DB_POSTGRESDB_SCHEMA
ARG DB_POSTGRESDB_HOST=$DB_POSTGRESDB_HOST
ARG DB_POSTGRESDB_DATABASE=$DB_POSTGRESDB_DATABASE
ARG DB_POSTGRESDB_PORT=$DB_POSTGRESDB_PORT
ARG DB_POSTGRESDB_USER=$DB_POSTGRESDB_USER
ARG DB_POSTGRESDB_PASSWORD=$DB_POSTGRESDB_PASSWORD

# -------------------------
# System deps
# -------------------------
# Toolchain (make/g++/build-base, cairo/pango headers) is needed to compile
# native node modules; chromium backs Puppeteer; ffmpeg/yt-dlp support media
# workflows; postgresql-client for DB access from workflows.
RUN apk add --no-cache \
    bash \
    git \
    python3 \
    py3-pip \
    make \
    g++ \
    build-base \
    cairo-dev \
    pango-dev \
    chromium \
    postgresql-client \
    ffmpeg \
    yt-dlp


# Tell Puppeteer to skip installing Chrome.
ENV PUPPETEER_SKIP_DOWNLOAD=true
ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser

RUN yarn add puppeteer@24.27.0

# Install n8n-nodes-puppeteer in a permanent location
RUN mkdir -p /opt/n8n-custom-nodes && \
    cd /opt/n8n-custom-nodes && \
    npm install n8n-nodes-puppeteer && \
    chown -R node:node /opt/n8n-custom-nodes

# Copy our custom entrypoint
# NOTE(review): this wrapper is copied and made executable but nothing below
# references it — CMD launches `n8n start` directly and no ENTRYPOINT is set.
# Confirm whether CMD/ENTRYPOINT should run /docker-custom-entrypoint.sh.
COPY docker-custom-entrypoint.sh /docker-custom-entrypoint.sh
RUN chmod +x /docker-custom-entrypoint.sh && \
    chown node:node /docker-custom-entrypoint.sh

# -------------------------
# n8n
# -------------------------
RUN npm install -g n8n@1.118.1

# -------------------------
# Python venv + requirements
# -------------------------
RUN python3 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Python dependencies
COPY requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt

# Python code
COPY python/ /app/python/

# -------------------------
# Runtime layout
# -------------------------
# Persist n8n data under /data (enable "Persistent storage" in Space settings)
ENV N8N_USER_FOLDER="/data" \
    N8N_LISTEN_ADDRESS="0.0.0.0" \
    N8N_PORT=7860 \
    N8N_PROTOCOL="http"

# Create standard dirs + logs
# NOTE(review): chmod -R 777 on /data and /app is world-writable — acceptable
# in a single-user Space sandbox, but worth tightening if reused elsewhere.
RUN mkdir -p $DATABASE_PATH $CONFIG_PATH $WORKFLOWS_PATH $LOGS_PATH \
    && mkdir -p /data/logs /app/logs \
    && chmod -R 777 $BASE_PATH /data /app

COPY logs/ /data/logs/

# -------------------------
# Non-root user for Chromium
# -------------------------
# Chromium refuses to run sandboxed as root; run the whole service as pptruser.
RUN addgroup -S pptruser && adduser -S -G pptruser pptruser \
    && mkdir -p /home/pptruser/Downloads \
    && chown -R pptruser:pptruser /home/pptruser /data /app

USER pptruser

# WORKDIR /data makes the Python scripts' relative "logs/..." paths resolve
# to the persistent /data/logs directory created above.
WORKDIR /data
EXPOSE 7860

# Start n8n
CMD ["n8n", "start"]
README.md CHANGED
@@ -1,10 +1,12 @@
1
  ---
2
- title: N8n
3
- emoji: 🔥
4
- colorFrom: pink
5
- colorTo: green
6
  sdk: docker
7
  pinned: false
 
 
8
  ---
9
 
10
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: N8N
3
+ emoji:
4
+ colorFrom: blue
5
+ colorTo: gray
6
  sdk: docker
7
  pinned: false
8
+ license: mit
9
+ short_description: Free n8n with Supabase
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
docker-custom-entrypoint.sh ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/sh
# Wrapper entrypoint: prints an environment/version banner, prepends the
# pre-installed custom-nodes directory to N8N_CUSTOM_EXTENSIONS, then hands
# control to the stock n8n entrypoint.
# NOTE(review): this assumes /docker-entrypoint.sh exists in the image; the
# base image is node:24-alpine, which does not ship one — confirm how this
# wrapper is meant to be invoked.

# Print Node/n8n/Chromium/Puppeteer versions for debugging startup issues.
print_banner() {
    echo "----------------------------------------"
    echo "n8n Puppeteer Node - Environment Details"
    echo "----------------------------------------"
    echo "Node.js version: $(node -v)"
    echo "n8n version: $(n8n --version)"

    # Get Chromium version specifically from the path we're using for Puppeteer
    CHROME_VERSION=$("$PUPPETEER_EXECUTABLE_PATH" --version 2>/dev/null || echo "Chromium not found")
    echo "Chromium version: $CHROME_VERSION"

    # Get Puppeteer version if installed
    PUPPETEER_PATH="/opt/n8n-custom-nodes/node_modules/n8n-nodes-puppeteer"
    if [ -f "$PUPPETEER_PATH/package.json" ]; then
        PUPPETEER_VERSION=$(node -p "require('$PUPPETEER_PATH/package.json').version")
        echo "n8n-nodes-puppeteer version: $PUPPETEER_VERSION"

        # Try to resolve puppeteer package from the n8n-nodes-puppeteer directory
        CORE_PUPPETEER_VERSION=$(cd "$PUPPETEER_PATH" && node -e "try { const version = require('puppeteer/package.json').version; console.log(version); } catch(e) { console.log('not found'); }")
        echo "Puppeteer core version: $CORE_PUPPETEER_VERSION"
    else
        echo "n8n-nodes-puppeteer: not installed"
    fi

    echo "Puppeteer executable path: $PUPPETEER_EXECUTABLE_PATH"
    echo "----------------------------------------"
}

# Add custom nodes to the NODE_PATH
# (prepend so /opt/n8n-custom-nodes wins over any user-supplied value)
if [ -n "$N8N_CUSTOM_EXTENSIONS" ]; then
    export N8N_CUSTOM_EXTENSIONS="/opt/n8n-custom-nodes:${N8N_CUSTOM_EXTENSIONS}"
else
    export N8N_CUSTOM_EXTENSIONS="/opt/n8n-custom-nodes"
fi

print_banner

# Execute the original n8n entrypoint script
exec /docker-entrypoint.sh "$@"
fonts/HanyiSentyPagoda_Regular.ttf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:754eb3539249898f9daffc23b0068561bb9a618a79d517708513b5119fd7e6e5
3
+ size 9982284
logs/.gitkeep ADDED
File without changes
python/hubspot_audit.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HubSpot Audit → Supabase (incremental since a millisecond cursor)
3
+
4
+ Usage from orchestrator:
5
+ import load_hubspot_companies
6
+ load_hubspot_companies.main(since_ms=<int milliseconds since epoch UTC>)
7
+
8
+ Direct CLI:
9
+ # epoch ms
10
+ python load_hubspot_companies.py 1754025600000
11
+ # ISO-8601
12
+ python load_hubspot_companies.py 2025-08-01T09:30:00Z
13
+ # Back-compat date (floors to 00:00Z)
14
+ python load_hubspot_companies.py 2025-08-01
15
+ """
16
+ import os
17
+ import logging
18
+ import datetime
19
+ from typing import Dict, List, Optional, Union
20
+ import re
21
+ from dotenv import load_dotenv
22
+ from supabase import create_client
23
+ from supabase_utils import batched_insert, update_sync_metadata
24
+ from hubspot_utils import (
25
+ to_epoch_ms_from_utc_iso,
26
+ page_account_activity,
27
+ build_login_index,
28
+ build_security_index,
29
+ normalize_audit_event,
30
+ enrich_audit_row_by_category,
31
+ deduplicate_by_key,
32
+ )
33
+
34
+ # Logging
35
+ logging.basicConfig(
36
+ filename=f"logs/hubspot_audit_logs_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
37
+ filemode="a",
38
+ level=logging.INFO,
39
+ format="%(asctime)s [%(levelname)s] %(message)s",
40
+ )
41
+
42
+ # Environment
43
+ load_dotenv()
44
+ HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
45
+ SUPABASE_URL = os.getenv("SUPABASE_URL")
46
+ SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
47
+
48
+ if not HUBSPOT_TOKEN:
49
+ raise RuntimeError("HUBSPOT_TOKEN is required")
50
+ if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
51
+ raise RuntimeError(
52
+ "SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY are required")
53
+
54
+ supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
55
+
56
+ # Config
57
+ AUDITLOG_TABLE = os.getenv("HUBSPOT_AUDITLOG_TABLE", "hubspot_audits")
58
+ HUBSPOT_LIMIT = int(os.getenv("HUBSPOT_AUDITLOG_LIMIT", "100"))
59
+ HUBSPOT_MAX_PAGES = int(os.getenv("HUBSPOT_AUDITLOG_MAX_PAGES", "10000"))
60
+ MATCH_WINDOW_SECONDS = int(os.getenv("HUBSPOT_MATCH_WINDOW_SECONDS", "300"))
61
+
62
+ INITIAL_BACKOFF_SECONDS = float(os.getenv("HUBSPOT_BACKOFF_START", "1.0"))
63
+ MAX_BACKOFF_SECONDS = float(os.getenv("HUBSPOT_BACKOFF_MAX", "16.0"))
64
+
65
+ BOOTSTRAP_SINCE_MS_ENV = os.getenv(
66
+ "BOOTSTRAP_SINCE_MS_ENV") # may be epoch ms or ISO-8601
67
+
68
+
69
def _today_midnight_ms_utc() -> int:
    """Return today's 00:00:00 UTC as epoch milliseconds.

    NOTE(review): main() builds the same default via
    floor_to_utc_midnight()/to_epoch_ms(); this helper appears unused in
    this module — confirm before removing.
    """
    now = datetime.datetime.now(datetime.timezone.utc)
    dt = now.replace(hour=0, minute=0, second=0, microsecond=0)
    # hubspot_utils.to_epoch_ms_from_utc_iso expects ISO; format accordingly
    # ("+00:00" is rewritten to the "Z" suffix it expects).
    return to_epoch_ms_from_utc_iso(dt.isoformat().replace("+00:00", "Z"))
74
+
75
+
76
+ def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
77
+ if dt.tzinfo is None:
78
+ dt = dt.replace(tzinfo=datetime.timezone.utc)
79
+ return dt.astimezone(datetime.timezone.utc)
80
+
81
+
82
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Return the UTC midnight of *dt*'s UTC calendar day."""
    normalized = _ensure_utc(dt)
    return datetime.datetime.combine(
        normalized.date(),
        datetime.time(0, tzinfo=datetime.timezone.utc),
    )
85
+
86
+
87
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """Parse an ISO-8601 string (trailing 'Z' accepted) into an aware UTC datetime."""
    normalized = value[:-1] + "+00:00" if value.endswith("Z") else value
    return _ensure_utc(datetime.datetime.fromisoformat(normalized))
92
+
93
+
94
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
    """Convert an ISO-8601 string or datetime into milliseconds since the epoch (UTC)."""
    if isinstance(dt_or_str, datetime.datetime):
        moment = _ensure_utc(dt_or_str)
    elif isinstance(dt_or_str, str):
        moment = _parse_iso_like_to_dt(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    return int(moment.timestamp() * 1000)
102
+
103
+
104
# HubSpot account-info activity endpoints (all served from api.hubapi.com).
# The security URL was previously written as an api.hubspot.com string passed
# through .replace("hubspot.com", "hubapi.com"); the literal is clearer and
# produces the identical value.
BASE_AUDIT_URL = "https://api.hubapi.com/account-info/v3/activity/audit-logs"
BASE_SECURITY_URL = "https://api.hubapi.com/account-info/v3/activity/security"
BASE_LOGIN_URL = "https://api.hubapi.com/account-info/v3/activity/login"
108
+
109
+
110
def fetch_streams(occurred_after_ms: int) -> Dict[str, List[dict]]:
    """
    Fetch the three streams from HubSpot account-info API:
      - Audit logs
      - Login activity
      - Security activity

    We pass occurred_after_ms and omit occurred_before_ms (None).
    Returns {"audit": [...], "login": [...], "security": [...]}.
    """
    def _fetch(base_url: str) -> List[dict]:
        # All three streams share identical paging parameters; only the URL
        # differs, so the repeated call is factored out here.
        return page_account_activity(
            base_url=base_url,
            token=HUBSPOT_TOKEN,
            occurred_after_ms=occurred_after_ms,
            occurred_before_ms=None,
            limit=HUBSPOT_LIMIT,
            max_pages=HUBSPOT_MAX_PAGES,
        )

    # Preserve the original fetch order: audit, login, security.
    audit = _fetch(BASE_AUDIT_URL)
    login = _fetch(BASE_LOGIN_URL)
    security = _fetch(BASE_SECURITY_URL)
    logging.info("Fetched counts: audit=%d login=%d security=%d",
                 len(audit), len(login), len(security))
    return {"audit": audit, "login": login, "security": security}
146
+
147
+
148
def build_indices(login_events: List[dict], security_events: List[dict]):
    """Build the (login, security) lookup indices used to enrich audit rows."""
    return build_login_index(login_events), build_security_index(security_events)
152
+
153
+
154
def normalize_and_enrich(audit_events: List[dict], login_idx, security_idx) -> List[dict]:
    """Normalize raw audit events, enrich them via the indices, dedupe on audit_id."""
    enriched: List[dict] = []
    for event in audit_events:
        normalized = normalize_audit_event(event)
        # Rows without a stable audit_id cannot be keyed for upsert; drop them.
        if not normalized.get("audit_id"):
            continue
        enriched.append(
            enrich_audit_row_by_category(
                normalized,
                login_idx,
                security_idx,
                match_window_seconds=MATCH_WINDOW_SECONDS,
            )
        )
    return deduplicate_by_key(enriched, "audit_id")
165
+
166
+
167
def upsert(rows: List[dict]) -> None:
    """Upsert *rows* into the audit table, then record the sync timestamp.

    The sync-metadata row is updated even when there is nothing to upsert, so
    the orchestrator can observe that the sync ran. The duplicated
    sync_time/update_sync_metadata tail of the original has been unified into
    a single code path.
    """
    if rows:
        batched_insert(
            supabase_client,
            AUDITLOG_TABLE,
            rows,
            batch_size=500,
            on_conflict=["audit_id"],
        )
        logging.info("Upserted %d rows into %s", len(rows), AUDITLOG_TABLE)
    else:
        logging.info("No rows to upsert.")

    sync_time = datetime.datetime.now(datetime.timezone.utc)
    update_sync_metadata(supabase_client, AUDITLOG_TABLE, sync_time.isoformat())
187
+
188
+
189
def main(since_ms: Optional[int] = None):
    """Run one incremental audit-log sync.

    Cursor resolution order:
      1. explicit *since_ms* argument (epoch ms, UTC),
      2. the BOOTSTRAP_SINCE_MS_ENV environment value,
      3. today at 00:00:00Z (first-run default).
    """
    logging.info(
        "Starting HubSpot audit sync (occurredAfter_ms=%s, occurredBefore=SKIPPED).", since_ms)

    if since_ms is None and BOOTSTRAP_SINCE_MS_ENV:
        try:
            since_ms = int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            # BUGFIX: the previous message referenced HUBSPOT_BILLING_SINCE_MS,
            # which belongs to the billing script; this module reads
            # BOOTSTRAP_SINCE_MS_ENV.
            raise RuntimeError(
                "BOOTSTRAP_SINCE_MS_ENV must be an integer (ms) if set.")

    if since_ms is None:
        # Default: today@00:00:00Z for first run
        today0 = floor_to_utc_midnight(
            datetime.datetime.now(datetime.timezone.utc))
        since_ms = to_epoch_ms(today0)

    print(f"Fetching HubSpot audit logs occurredAfter > {since_ms} ...")
    streams = fetch_streams(occurred_after_ms=since_ms)
    login_idx, security_idx = build_indices(
        streams["login"], streams["security"])
    rows = normalize_and_enrich(streams["audit"], login_idx, security_idx)

    print("Upserting into Supabase...")
    upsert(rows)
    print(f"Synced {len(rows)} audit rows into '{AUDITLOG_TABLE}'.")
    print("Audit logs sync complete.")
216
+
217
+
218
+ def _parse_cli_arg_to_ms(arg: str) -> int:
219
+ """
220
+ Accept:
221
+ - integer epoch ms
222
+ - ISO-8601 (Z or offset)
223
+ - YYYY-MM-DD (floors to 00:00Z)
224
+ """
225
+ # epoch ms or seconds
226
+ if re.fullmatch(r"\d{10,13}", arg):
227
+ v = int(arg)
228
+ if v < 10_000_000_000_000: # seconds -> ms
229
+ v *= 1000
230
+ return v
231
+
232
+ # YYYY-MM-DD
233
+ if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
234
+ d = datetime.datetime.strptime(
235
+ arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
236
+ return to_epoch_ms(floor_to_utc_midnight(d))
237
+
238
+ # ISO-8601
239
+ return to_epoch_ms(arg)
240
+
241
+
242
if __name__ == "__main__":
    # CLI entry point: an optional single argument supplies the cursor
    # (epoch ms/seconds, ISO-8601, or YYYY-MM-DD); with no argument main()
    # resolves its own default cursor.
    import sys
    if len(sys.argv) > 1:
        try:
            main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
        except Exception as e:
            # NOTE(review): this try also wraps main() itself, so any runtime
            # failure is reported as an "Invalid timestamp" usage error.
            print(
                f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
            sys.exit(1)
    else:
        main()
python/hubspot_billing.py ADDED
@@ -0,0 +1,454 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HubSpot Billing → Supabase (incremental since a millisecond cursor)
3
+
4
+ Usage from orchestrator:
5
+ import load_hubspot_billing
6
+ load_hubspot_billing.main(since_ms=<int milliseconds since epoch UTC>)
7
+
8
+ Direct CLI:
9
+ # epoch ms
10
+ python load_hubspot_billing.py 1754025600000
11
+ # ISO-8601
12
+ python load_hubspot_billing.py 2025-08-01T09:30:00Z
13
+ # Back-compat date (floors to 00:00Z)
14
+ python load_hubspot_billing.py 2025-08-01
15
+ """
16
+ import os
17
+ import re
18
+ import time
19
+ import logging
20
+ import datetime
21
+ from typing import List, Dict, Optional, Tuple, Union
22
+
23
+ import httpx
24
+ import hubspot
25
+ from dotenv import load_dotenv
26
+ from supabase import create_client
27
+
28
+ from hubspot_utils import (
29
+ parse_ts, try_parse_int, deduplicate_by_key,
30
+ )
31
+ from supabase_utils import (
32
+ batched_insert, update_sync_metadata,
33
+ )
34
+
35
+ # -----------------------------------------------------------------------------
36
+ # Logging
37
+ # -----------------------------------------------------------------------------
38
+ logging.basicConfig(
39
+ filename=f"logs/hubspot_billing_pipeline_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
40
+ filemode="a",
41
+ level=logging.INFO,
42
+ format="%(asctime)s [%(levelname)s] %(message)s",
43
+ )
44
+
45
+ # -----------------------------------------------------------------------------
46
+ # Environment
47
+ # -----------------------------------------------------------------------------
48
+ load_dotenv()
49
+ HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
50
+ SUPABASE_URL = os.getenv("SUPABASE_URL")
51
+ SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
52
+ # Optional bootstrap cursor if orchestrator doesn't provide one
53
+ BOOTSTRAP_SINCE_MS_ENV = os.getenv("HUBSPOT_BILLING_SINCE_MS")
54
+
55
+ if not HUBSPOT_TOKEN:
56
+ raise RuntimeError("HUBSPOT_TOKEN is not set")
57
+ if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
58
+ raise RuntimeError("Supabase env vars are not set")
59
+
60
+ hubspot_client = hubspot.Client.create(access_token=HUBSPOT_TOKEN)
61
+ supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
62
+
63
+ # -----------------------------------------------------------------------------
64
+ # Config
65
+ # -----------------------------------------------------------------------------
66
+
67
+ # Custom object type ID for Billing Services
68
+ BILLING_OBJECT_TYPE = "2-38060359"
69
+
70
+ BILLING_PROPERTIES = [
71
+ "cli",
72
+ "account_code",
73
+ "service_name",
74
+ "status",
75
+ "type",
76
+ "tariff_name",
77
+ "contract_renewal_date",
78
+ "supplier_network",
79
+ "network_name",
80
+ "product_type_name",
81
+ "hs_created_by_user_id",
82
+ "hs_createdate",
83
+ # "hs_lastmodifieddate", # optionally include if needed later
84
+ ]
85
+
86
+ PROPERTY_RENAME = {
87
+ "hs_created_by_user_id": "hubspot_created_by",
88
+ "hs_createdate": "hubspot_created_at",
89
+ }
90
+
91
+ # -----------------------------------------------------------------------------
92
+ # Time helpers (mirroring hubspot_tickets)
93
+ # -----------------------------------------------------------------------------
94
+
95
+
96
+ def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
97
+ if dt.tzinfo is None:
98
+ dt = dt.replace(tzinfo=datetime.timezone.utc)
99
+ return dt.astimezone(datetime.timezone.utc)
100
+
101
+
102
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Zero out the time-of-day after converting *dt* to UTC."""
    in_utc = _ensure_utc(dt)
    return in_utc.replace(hour=0, minute=0, second=0, microsecond=0)
105
+
106
+
107
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """Parse ISO-8601 text into an aware UTC datetime; a trailing 'Z' means UTC."""
    text = value
    if text.endswith("Z"):
        text = text[:-1] + "+00:00"
    return _ensure_utc(datetime.datetime.fromisoformat(text))
112
+
113
+
114
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
    """Convert a datetime or ISO-8601 string into epoch milliseconds (UTC)."""
    if isinstance(dt_or_str, str):
        moment = _parse_iso_like_to_dt(dt_or_str)
    elif isinstance(dt_or_str, datetime.datetime):
        moment = _ensure_utc(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    # Truncate (not round) fractional milliseconds, matching int() semantics.
    return int(moment.timestamp() * 1000)
122
+
123
+
124
def parse_any_ts_ms(value: Optional[Union[str, int, float]]) -> Optional[int]:
    """
    Accepts:
      - ms-epoch as str/int/float
      - seconds-epoch as str/int/float (auto *1000)
      - ISO-8601 string
    Returns ms since epoch or None.
    """
    if value is None:
        return None

    # numeric-ish string or number
    try:
        v = int(float(value))  # handles numeric strings
        # BUGFIX: the old threshold (10_000_000_000_000) also multiplied
        # genuine millisecond values (~1.7e12 today) by 1000, inflating the
        # sync cursor far into the future. Values below 100_000_000_000 are
        # seconds; anything larger is already milliseconds.
        if v < 100_000_000_000:
            v *= 1000
        return v
    except Exception:
        # Not numeric (e.g. ISO text, inf) — fall through to ISO parsing.
        pass

    # ISO-8601
    try:
        return to_epoch_ms(str(value))
    except Exception:
        return None
150
+
151
+
152
+ # -----------------------------------------------------------------------------
153
+ # Search (by timestamp cursor)
154
+ # -----------------------------------------------------------------------------
155
+
156
+
157
def _search_billing_ids_from(since_ms: int, prop: str) -> List[str]:
    """
    Search billing custom object IDs where {prop} > since_ms.
    Sort ascending so we can advance cursor monotonically.

    Pages through the CRM v3 search endpoint 100 results at a time and
    collects bare object IDs only; full records are read separately.
    Raises httpx.HTTPStatusError on any 4xx/5xx response (after logging
    the response body for diagnosis).
    NOTE(review): the CRM search API caps paging at 10,000 total results —
    confirm backfills never exceed that window.
    """
    url = f"https://api.hubapi.com/crm/v3/objects/{BILLING_OBJECT_TYPE}/search"
    headers = {
        "Authorization": f"Bearer {HUBSPOT_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    payload = {
        "filterGroups": [{
            "filters": [
                # GT is strictly greater-than, so since_ms itself is excluded.
                {"propertyName": prop, "operator": "GT", "value": str(since_ms)},
            ]
        }],
        "limit": 100,
        "sorts": [{"propertyName": prop, "direction": "ASCENDING"}],
        "properties": ["hs_object_id"],
    }

    ids: List[str] = []
    after: Optional[str] = None
    with httpx.Client(timeout=30.0) as client:
        while True:
            # Shallow-copy the base payload; only the paging token varies per page.
            body = dict(payload)
            if after:
                body["after"] = after

            resp = client.post(url, headers=headers, json=body)
            if resp.status_code >= 400:
                # Log the server's error detail (JSON if parseable) before raising.
                try:
                    logging.error(
                        "Billing search error for prop '%s': %s", prop, resp.json())
                except Exception:
                    logging.error(
                        "Billing search error for prop '%s': %s", prop, resp.text)
                resp.raise_for_status()

            data = resp.json()
            ids.extend([obj["id"] for obj in data.get("results", []) or []])

            # paging.next.after is absent on the last page.
            after = (data.get("paging") or {}).get("next", {}).get("after")
            if not after:
                break
            # Light throttle between pages to stay under rate limits.
            time.sleep(0.1)

    return ids
206
+
207
+
208
def search_billing_ids_after_ms(since_ms: int) -> Tuple[List[str], str]:
    """
    Search billing IDs where the cursor property is strictly greater than
    *since_ms*, sorted ASC so the max timestamp at the end is monotonic.
    Returns (ids, prop_used).
    """
    candidate_props = ["hs_createdate"]
    failure = None

    for candidate in candidate_props:
        try:
            found = _search_billing_ids_from(since_ms, candidate)
        except httpx.HTTPStatusError as err:
            failure = err
            continue
        logging.info(
            "Billing search with '%s' returned %d IDs.", candidate, len(found))
        return found, candidate

    # Every candidate failed: re-raise the last HTTP error.
    if failure is not None:
        raise failure
    return [], "hs_createdate"
231
+
232
+
233
+ # -----------------------------------------------------------------------------
234
+ # Read-by-ID (with associations)
235
+ # -----------------------------------------------------------------------------
236
+
237
+
238
def read_billing_by_ids(
    billing_ids: List[str],
    cursor_prop: str,
) -> Tuple[List[Dict], List[Dict], List[Dict], Optional[int]]:
    """
    Read billing services by ID with properties and associations (companies/deals).

    Returns: services, company_links, deal_links, max_ts_ms_for_cursor_prop —
    where the last element is the largest *cursor_prop* timestamp seen across
    the fetched records (used by the caller to advance the sync cursor), or
    None when no record carried a parseable value.

    Failures reading a single record are logged and skipped; the function
    never raises for one bad ID.
    NOTE(review): "tariff_name" is requested in BILLING_PROPERTIES but is not
    copied into the service row below — confirm whether it should be persisted.
    """
    if not billing_ids:
        return [], [], [], None

    services: List[Dict] = []
    company_links: List[Dict] = []
    deal_links: List[Dict] = []

    max_ts_ms: Optional[int] = None

    assoc_types = ["companies", "deals"]

    for i, bid in enumerate(billing_ids, start=1):
        try:
            record = hubspot_client.crm.objects.basic_api.get_by_id(
                object_type=BILLING_OBJECT_TYPE,
                object_id=bid,
                properties=BILLING_PROPERTIES,
                associations=assoc_types,
                archived=False,
            )

            p = record.properties or {}

            # Track max timestamp based on cursor_prop
            cursor_val = p.get(cursor_prop)
            ts_ms = parse_any_ts_ms(cursor_val)
            if ts_ms is not None and (max_ts_ms is None or ts_ms > max_ts_ms):
                max_ts_ms = ts_ms

            # Build service row (hs_* properties renamed to hubspot_* columns)
            row = {
                "billing_id": str(record.id),
                "cli": p.get("cli"),
                "account_code": p.get("account_code"),
                "service_name": p.get("service_name"),
                "status": p.get("status"),
                "type": p.get("type"),
                "contract_renewal_date": parse_ts(p.get("contract_renewal_date")),
                "supplier_network": p.get("supplier_network"),
                "network_name": p.get("network_name"),
                "product_type_name": p.get("product_type_name"),
                "hubspot_created_by": try_parse_int(p.get("hs_created_by_user_id")),
                "hubspot_created_at": parse_ts(p.get("hs_createdate")),
            }
            services.append(row)

            # Associations: flatten into (billing_id, other_id) link rows.
            assoc = record.associations or {}

            if assoc.get("companies") and getattr(assoc["companies"], "results", None):
                for a in assoc["companies"].results:
                    if a.id:
                        company_links.append({
                            "billing_id": str(record.id),
                            "company_id": str(a.id),
                        })

            if assoc.get("deals") and getattr(assoc["deals"], "results", None):
                for a in assoc["deals"].results:
                    if a.id:
                        deal_links.append({
                            "billing_id": str(record.id),
                            "deal_id": str(a.id),
                        })

            # Periodic progress log for long backfills.
            if i % 200 == 0:
                logging.info("Read %d billing services...", i)

            # Light per-record throttle to stay under HubSpot rate limits.
            time.sleep(0.05)

        except httpx.HTTPStatusError as e:
            logging.error("HTTP error reading billing %s: %s", bid, e)
        except Exception as e:
            logging.error("Error reading billing %s: %s", bid, e)

    return services, company_links, deal_links, max_ts_ms
322
+
323
+
324
+ # -----------------------------------------------------------------------------
325
+ # Upsert
326
+ # -----------------------------------------------------------------------------
327
+
328
+
329
def upsert_billing(
    services: List[Dict],
    company_links: List[Dict],
    deal_links: List[Dict],
) -> None:
    """Dedupe and upsert billing services plus their association link rows.

    Empty inputs are skipped entirely (no table touched, nothing printed),
    matching the original per-collection guards. The three copy-pasted
    dedupe/insert/print stanzas are factored into one helper.
    """
    def _dedupe_and_upsert(rows, key, table, label):
        # key is a single column name (services) or a tuple of columns
        # (link tables); on_conflict always receives the list form.
        if not rows:
            return
        rows = deduplicate_by_key(rows, key=key)
        conflict_cols = [key] if isinstance(key, str) else list(key)
        batched_insert(
            supabase_client, table, rows,
            batch_size=1000, on_conflict=conflict_cols,
        )
        print(f"Upserted {len(rows)} {label}.")

    _dedupe_and_upsert(services, "billing_id",
                       "hubspot_billing_services", "billing services")
    _dedupe_and_upsert(company_links, ("billing_id", "company_id"),
                       "hubspot_billing_companies", "billing-company associations")
    _dedupe_and_upsert(deal_links, ("billing_id", "deal_id"),
                       "hubspot_billing_deals", "billing-deal associations")
359
+
360
+
361
+ # -----------------------------------------------------------------------------
362
+ # Main (timestamp cursor)
363
+ # -----------------------------------------------------------------------------
364
+
365
+
366
def main(since_ms: Optional[int] = None):
    """
    Orchestrates:
      1) Search billing IDs with <cursor_prop> > since_ms
      2) Read full records with associations (track max timestamp)
      3) Upsert into Supabase
      4) Update sync metadata with { last_sync_metadata, last_sync_time, cursor_prop }

    Cursor resolution order: explicit *since_ms* argument, then the
    HUBSPOT_BILLING_SINCE_MS env value, then today at 00:00:00Z.
    """
    # Resolve since_ms
    if since_ms is None and BOOTSTRAP_SINCE_MS_ENV:
        try:
            since_ms = int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            raise RuntimeError(
                "HUBSPOT_BILLING_SINCE_MS must be an integer (ms) if set.")

    if since_ms is None:
        # Default: today@00:00:00Z for first run
        today0 = floor_to_utc_midnight(
            datetime.datetime.now(datetime.timezone.utc))
        since_ms = to_epoch_ms(today0)

    print(f"Searching billing services with timestamp > {since_ms} ...")
    ids, cursor_prop = search_billing_ids_after_ms(since_ms)
    print(f"Search property: {cursor_prop}. Found {len(ids)} billing IDs.")

    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()

    if not ids:
        print("No billing services beyond the cursor. Updating sync metadata and exiting.")
        update_sync_metadata(supabase_client, "hubspot_billing_services", now_iso)
        return

    print("Reading billing services (with associations)...")
    services, company_links, deal_links, max_ts_ms = read_billing_by_ids(
        ids, cursor_prop)

    print("Upserting into Supabase...")
    upsert_billing(services, company_links, deal_links)

    # Advance cursor to max timestamp we actually ingested for the chosen property
    new_cursor_ms = max_ts_ms if max_ts_ms is not None else since_ms

    # NOTE(review): new_cursor_ms is only reported below — update_sync_metadata
    # receives now_iso, so the millisecond cursor itself is never persisted
    # (contrary to step 4 of the docstring). Confirm this is intentional.
    update_sync_metadata(supabase_client, "hubspot_billing_services", now_iso)

    print(
        f"Billing sync complete. Advanced cursor to {new_cursor_ms} using prop '{cursor_prop}'.")
413
+
414
+
415
+ # -----------------------------------------------------------------------------
416
+ # CLI
417
+ # -----------------------------------------------------------------------------
418
+
419
+
420
+ def _parse_cli_arg_to_ms(arg: str) -> int:
421
+ """
422
+ Accept:
423
+ - integer epoch ms
424
+ - ISO-8601 (Z or offset)
425
+ - YYYY-MM-DD (floors to 00:00Z)
426
+ """
427
+ # epoch ms or seconds
428
+ if re.fullmatch(r"\d{10,13}", arg):
429
+ v = int(arg)
430
+ if v < 10_000_000_000_000: # seconds -> ms
431
+ v *= 1000
432
+ return v
433
+
434
+ # YYYY-MM-DD
435
+ if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
436
+ d = datetime.datetime.strptime(
437
+ arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
438
+ return to_epoch_ms(floor_to_utc_midnight(d))
439
+
440
+ # ISO-8601
441
+ return to_epoch_ms(arg)
442
+
443
+
444
if __name__ == "__main__":
    # CLI entry point: one optional argument supplies the starting cursor
    # (epoch ms/seconds, ISO-8601, or YYYY-MM-DD); with no argument main()
    # resolves its own default cursor.
    import sys
    if len(sys.argv) > 1:
        try:
            main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
        except Exception as e:
            # NOTE(review): this try also wraps main() itself, so any runtime
            # failure is reported as an "Invalid timestamp" usage error.
            print(
                f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
            raise SystemExit(1)
    else:
        main()
python/hubspot_companies.py ADDED
@@ -0,0 +1,475 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HubSpot Companies → Supabase (incremental since a millisecond cursor)
3
+
4
+ Usage from orchestrator:
5
+ import load_hubspot_companies
6
+ load_hubspot_companies.main(since_ms=<int milliseconds since epoch UTC>)
7
+
8
+ Direct CLI:
9
+ # epoch ms
10
+ python load_hubspot_companies.py 1754025600000
11
+ # ISO-8601
12
+ python load_hubspot_companies.py 2025-08-01T09:30:00Z
13
+ # Back-compat date (floors to 00:00Z)
14
+ python load_hubspot_companies.py 2025-08-01
15
+ """
16
+
17
+ import os
18
+ import re
19
+ import time
20
+ import logging
21
+ import datetime
22
+ from typing import List, Dict, Optional, Tuple, Union
23
+
24
+ import httpx
25
+ import hubspot
26
+ from dotenv import load_dotenv
27
+ from supabase import create_client
28
+ from hubspot.crm.companies import ApiException as CompaniesApiException
29
+
30
+ from hubspot_utils import (
31
+ try_parse_int, parse_ts, get_property_label_mapping,
32
+ )
33
+ from supabase_utils import (
34
+ update_sync_metadata, enrich_supabase_row, upload_raw_json_to_supabase,
35
+ batched_insert, fetch_supabase_table,
36
+ )
37
+
38
+ # -----------------------------------------------------------------------------
39
+ # Logging
40
+ # -----------------------------------------------------------------------------
41
# Ensure the log directory exists before configuring the file handler;
# without this, logging.basicConfig raises FileNotFoundError on a fresh
# checkout where logs/ has not been created yet.
os.makedirs("logs", exist_ok=True)
logging.basicConfig(
    filename=f"logs/hubspot_company_pipeline_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
    filemode="a",  # append so repeated runs on the same day share one file
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
47
+
48
+ # -----------------------------------------------------------------------------
49
+ # Environment
50
+ # -----------------------------------------------------------------------------
51
# Load .env so local runs pick up the same variables as the deployed service.
load_dotenv()
HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
# Optional bootstrap cursor if orchestrator doesn't provide one
BOOTSTRAP_SINCE_MS_ENV = os.getenv("HUBSPOT_COMPANIES_SINCE_MS")

# Fail fast at import time: every entry point below needs both services.
if not HUBSPOT_TOKEN:
    raise RuntimeError("HUBSPOT_TOKEN is not set")
if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
    raise RuntimeError("Supabase env vars are not set")

# Module-level singletons shared by all functions in this file.
hubspot_client = hubspot.Client.create(access_token=HUBSPOT_TOKEN)
supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
65
+
66
+ # -----------------------------------------------------------------------------
67
+ # Config
68
+ # -----------------------------------------------------------------------------
69
# Company properties requested from HubSpot on every read.
# Both "hs_lastmodifieddate" and "lastmodifieddate" are fetched because the
# row mapper falls back from one to the other when building
# hubspot_modified_date.
COMPANY_PROPERTIES = [
    "name",
    "city",
    "company_email",
    "address",
    "address2",
    "domain",
    "number_of_active__cli_s",
    "number_of__mobile___cloned_",
    "createdate",
    "hs_lastmodifieddate",
    "lastmodifieddate",
    "review_date_updated_by___clone",
    "industry",
    "macro_industry_grouping",
    "closest_review_date___clone",
]
86
+
87
+ # -----------------------------------------------------------------------------
88
+ # Time helpers
89
+ # -----------------------------------------------------------------------------
90
+
91
+
92
+ def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
93
+ if dt.tzinfo is None:
94
+ dt = dt.replace(tzinfo=datetime.timezone.utc)
95
+ return dt.astimezone(datetime.timezone.utc)
96
+
97
+
98
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Truncate *dt* to 00:00:00 UTC on the same calendar day."""
    normalized = _ensure_utc(dt)
    midnight = datetime.time(0, tzinfo=datetime.timezone.utc)
    return datetime.datetime.combine(normalized.date(), midnight)
101
+
102
+
103
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """Parse an ISO-8601 string (accepting a trailing 'Z') into an aware UTC datetime."""
    if isinstance(value, str) and value.endswith("Z"):
        # datetime.fromisoformat does not accept the Zulu suffix pre-3.11.
        value = f"{value[:-1]}+00:00"
    return _ensure_utc(datetime.datetime.fromisoformat(value))
108
+
109
+
110
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
    """Convert an ISO-8601 string or datetime to milliseconds since the Unix epoch (UTC)."""
    if isinstance(dt_or_str, datetime.datetime):
        moment = _ensure_utc(dt_or_str)
    elif isinstance(dt_or_str, str):
        moment = _parse_iso_like_to_dt(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    return int(moment.timestamp() * 1000)
118
+
119
+
120
def parse_any_ts_ms(value: Optional[Union[str, int, float]]) -> Optional[int]:
    """Best-effort conversion of ms-epoch / sec-epoch / ISO-8601 input to epoch ms, or None."""
    if value is None:
        return None
    text = str(value)
    try:
        numeric = int(text)
    except ValueError:
        pass
    else:
        # Values below 13 digits are treated as epoch seconds.
        return numeric * 1000 if numeric < 10_000_000_000_000 else numeric
    try:
        return to_epoch_ms(text)
    except Exception:
        logging.warning("Could not parse timestamp value=%r", value)
        return None
135
+
136
+ # -----------------------------------------------------------------------------
137
+ # Search IDs (ts > since_ms) with property fallback
138
+ # -----------------------------------------------------------------------------
139
+
140
+
141
def _search_company_ids_from(since_ms: int, prop: str) -> List[str]:
    """
    Search companies where {prop} > since_ms (epoch-ms).
    Sort ascending so we can advance the cursor monotonically.

    Pages through the CRM v3 search endpoint via the `paging.next.after`
    cursor and returns the collected object IDs.

    Raises:
        httpx.HTTPStatusError: on any 4xx/5xx response (logged first).

    NOTE(review): the search API presumably caps total results per query;
    confirm very large backfills are chunked by the caller.
    """
    url = "https://api.hubapi.com/crm/v3/objects/companies/search"
    headers = {
        "Authorization": f"Bearer {HUBSPOT_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    payload = {
        "filterGroups": [{
            "filters": [
                {"propertyName": prop, "operator": "GT",
                 "value": str(since_ms)},
            ]
        }],
        "limit": 100,
        "sorts": [{"propertyName": prop, "direction": "ASCENDING"}],
    }

    ids: List[str] = []
    after: Optional[str] = None
    with httpx.Client(timeout=30.0) as client:
        while True:
            # Copy the base payload so the paging cursor never leaks between pages.
            body = dict(payload)
            if after:
                body["after"] = after

            resp = client.post(url, headers=headers, json=body)
            if resp.status_code >= 400:
                # Log the response body (JSON if parseable) before raising.
                try:
                    logging.error(
                        "Company search error for prop '%s': %s", prop, resp.json())
                except Exception:
                    logging.error(
                        "Company search error for prop '%s': %s", prop, resp.text)
                resp.raise_for_status()

            data = resp.json()
            ids.extend([obj["id"] for obj in data.get("results", []) or []])

            after = (data.get("paging") or {}).get("next", {}).get("after")
            if not after:
                break
            # Small delay between pages to stay under rate limits.
            time.sleep(0.1)

    return ids
190
+
191
+
192
def search_company_ids_after_ms(since_ms: int) -> Tuple[List[str], str]:
    """
    Find company IDs whose cursor property is strictly greater than *since_ms*.

    Tries cursor properties in order, falling back to the next only when the
    search request fails with an HTTP error:
      1) hs_lastmodifieddate
      2) lastmodifieddate
      3) createdate

    Returns (ids, prop_used). If every property errored, re-raises the last
    HTTP error; the final return is a defensive fallback.
    """
    # Fix: only "createdate" was being tried, contradicting the docstring and
    # leaving companies that were *modified* (but created earlier) out of the
    # incremental sync. Restore the documented modified-date-first fallback.
    props_to_try = ["hs_lastmodifieddate", "lastmodifieddate", "createdate"]
    last_err: Optional[httpx.HTTPStatusError] = None

    for prop in props_to_try:
        try:
            ids = _search_company_ids_from(since_ms, prop)
        except httpx.HTTPStatusError as e:
            last_err = e
            continue
        logging.info(
            "Company search with '%s' returned %d IDs.", prop, len(ids))
        return ids, prop

    if last_err:
        raise last_err
    return [], "hs_lastmodifieddate"
215
+
216
+ # -----------------------------------------------------------------------------
217
+ # Read-by-ID (with associations) → enrich & track max cursor ts
218
+ # -----------------------------------------------------------------------------
219
+
220
+
221
def _enrich_company_data_from_record(
    record,
    industry_map: Dict[str, str],
    macro_industry_map: Dict[str, str],
) -> Dict:
    """
    Flatten a HubSpot company record into a plain dict: requested properties,
    distinct association counts, and label-mapped industry fields.
    """
    props = record.properties or {}
    company_data: Dict[str, Optional[str]] = {"id": record.id}
    company_data.update({name: props.get(name) for name in COMPANY_PROPERTIES})

    def _distinct_assoc_count(bucket_name: str) -> int:
        # Count unique associated object ids in the named association bucket.
        assoc = getattr(record, "associations", None)
        if not assoc:
            return 0
        bucket = assoc.get(bucket_name)
        if not bucket or not getattr(bucket, "results", None):
            return 0
        return len({item.id for item in bucket.results if getattr(item, "id", None)})

    company_data["number_of_associated_contacts"] = _distinct_assoc_count("contacts")
    company_data["number_of_associated_deals"] = _distinct_assoc_count("deals")

    # Replace raw option codes with human-readable labels; unmapped codes → None.
    raw_industry = company_data.get("industry")
    company_data["industry"] = (
        industry_map.get(raw_industry) if raw_industry in industry_map else None
    )

    raw_macro = company_data.get("macro_industry_grouping")
    company_data["macro_industry_grouping"] = (
        macro_industry_map.get(raw_macro) if raw_macro in macro_industry_map else None
    )

    return company_data
254
+
255
+
256
def read_companies_by_ids(company_ids: List[str], cursor_prop: str) -> Tuple[List[Dict], Optional[int]]:
    """
    Read each company by ID (with contact/deal associations) and enrich it.

    Returns:
        (companies, max_ts_ms) where max_ts_ms is the largest observed value
        of *cursor_prop* across the fetched records (epoch ms), or None.

    Per-record errors are logged and skipped so one bad ID does not abort
    the whole batch.
    """
    if not company_ids:
        return [], None

    companies: List[Dict] = []
    assoc_types = ["contacts", "deals"]

    # Fetch label maps once (best-effort: an empty map just yields None labels)
    try:
        industry_map = get_property_label_mapping(
            hubspot_client, "companies", "industry")
    except Exception as e:
        logging.warning("Failed to fetch industry map: %s", e)
        industry_map = {}

    try:
        macro_industry_map = get_property_label_mapping(
            hubspot_client, "companies", "macro_industry_grouping")
    except Exception as e:
        logging.warning("Failed to fetch macro_industry_grouping map: %s", e)
        macro_industry_map = {}

    max_ts_ms: Optional[int] = None

    for i, cid in enumerate(company_ids, start=1):
        try:
            record = hubspot_client.crm.companies.basic_api.get_by_id(
                company_id=cid,
                properties=COMPANY_PROPERTIES,
                associations=assoc_types,
                archived=False,
            )

            # Track max timestamp for the chosen cursor property
            cursor_val = (record.properties or {}).get(cursor_prop)
            ts_ms = parse_any_ts_ms(cursor_val)
            if ts_ms is not None and (max_ts_ms is None or ts_ms > max_ts_ms):
                max_ts_ms = ts_ms

            companies.append(_enrich_company_data_from_record(
                record, industry_map, macro_industry_map))

            if i % 200 == 0:
                logging.info("Read %d companies...", i)

            # Gentle pacing between per-ID reads to respect rate limits.
            time.sleep(0.05)

        except httpx.HTTPStatusError as e:
            logging.error("HTTP error reading company %s: %s", cid, e)
        except (CompaniesApiException, httpx.HTTPError) as e:
            logging.error("Error reading company %s: %s", cid, e)

    return companies, max_ts_ms
309
+
310
+ # -----------------------------------------------------------------------------
311
+ # Map → Supabase rows and diff
312
+ # -----------------------------------------------------------------------------
313
+
314
+
315
def map_company_data_for_db(companies: List[Dict]) -> List[Dict]:
    """Translate enriched HubSpot company dicts into hubspot_companies table rows."""

    def _to_row(c: Dict) -> Dict:
        # Rename HubSpot property names to DB column names and coerce types.
        row = {
            "company_id": try_parse_int(c["id"]),
            "company_name": c.get("name"),
            "company_email": c.get("company_email"),
            "city": c.get("city"),
            "domain": c.get("domain"),
            "street_address": c.get("address"),
            "street_address2": c.get("address2"),
            "hubspot_create_date": parse_ts(c.get("createdate")),
            "hubspot_modified_date": parse_ts(c.get("hs_lastmodifieddate") or c.get("lastmodifieddate")),
            "review_date_updated_by": c.get("review_date_updated_by___clone") or None,
            "number_of_active_clis": try_parse_int(c.get("number_of_active__cli_s")),
            "number_of_clis": try_parse_int(c.get("number_of__mobile___cloned_")),
            "number_of_associated_contacts": c.get("number_of_associated_contacts", 0),
            "number_of_associated_deals": c.get("number_of_associated_deals", 0),
            "industry": c.get("industry"),
            "macro_industry_grouping": c.get("macro_industry_grouping"),
            "closest_review_date": parse_ts(c.get("closest_review_date___clone")),
        }
        return enrich_supabase_row(row)

    return [_to_row(c) for c in companies]
339
+
340
+
341
def companies_are_different(new_row: Dict, old_row: Dict) -> bool:
    """Return True when any tracked company field differs (values compared as strings)."""
    tracked_fields = (
        "company_name", "company_email", "city", "domain", "street_address",
        "street_address2", "hubspot_modified_date", "review_date_updated_by",
        "number_of_active_clis", "number_of_clis",
        "number_of_associated_contacts", "number_of_associated_deals",
        "industry", "macro_industry_grouping", "closest_review_date",
    )
    return any(
        str(new_row.get(field)) != str(old_row.get(field))
        for field in tracked_fields
    )
353
+
354
+ # -----------------------------------------------------------------------------
355
+ # Upsert
356
+ # -----------------------------------------------------------------------------
357
+
358
+
359
def upsert_companies(companies: List[Dict]) -> None:
    """
    Map enriched companies to DB rows and write only new/changed rows.

    Diffs each mapped row against the current hubspot_companies table
    (keyed by company_id) and batch-inserts the differences.
    """
    if not companies:
        print("No companies to upsert.")
        return

    existing = fetch_supabase_table(
        supabase_client, "hubspot_companies", "company_id")
    mapped = map_company_data_for_db(companies)

    rows_to_upsert: List[Dict] = []
    for row in mapped:
        cid = row.get("company_id")
        # Skip rows whose ID failed to parse — nothing to key the upsert on.
        if not cid:
            continue
        old_row = existing.get(str(cid))
        if not old_row or companies_are_different(row, old_row):
            rows_to_upsert.append(row)

    print(f"{len(rows_to_upsert)} companies to insert/update (out of {len(companies)} read).")

    if rows_to_upsert:
        # upload_raw_json_to_supabase(supabase_client, rows_to_upsert, object_type="companies")
        batched_insert(supabase_client, "hubspot_companies",
                       rows_to_upsert, batch_size=1000)
383
+
384
+ # -----------------------------------------------------------------------------
385
+ # Main (timestamp cursor)
386
+ # -----------------------------------------------------------------------------
387
+
388
+
389
def main(since_ms: Optional[int] = None):
    """
    Orchestrates:
      1) Search company IDs with <cursor_prop> > since_ms (property fallback)
      2) Read full companies (track max timestamp for <cursor_prop>)
      3) Upsert into Supabase
      4) Update sync metadata with { last_sync_metadata, last_sync_time, cursor_prop }

    NOTE(review): new_cursor_ms is computed and printed but never passed to
    update_sync_metadata — confirm the orchestrator derives the next since_ms
    from the metadata timestamp, otherwise the advanced cursor is lost.
    """
    # Resolve since_ms: explicit arg > env bootstrap > today@00:00Z
    if since_ms is None and BOOTSTRAP_SINCE_MS_ENV:
        try:
            since_ms = int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            raise RuntimeError(
                "HUBSPOT_COMPANIES_SINCE_MS must be an integer (ms) if set.")

    if since_ms is None:
        # Default: today@00:00:00Z for first run
        today0 = floor_to_utc_midnight(
            datetime.datetime.now(datetime.timezone.utc))
        since_ms = to_epoch_ms(today0)

    print(f"Searching companies with timestamp > {since_ms} ...")
    ids, cursor_prop = search_company_ids_after_ms(since_ms)
    print(f"Search property: {cursor_prop}. Found {len(ids)} company IDs.")

    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()

    if not ids:
        print("No companies beyond the cursor. Updating sync metadata and exiting.")
        update_sync_metadata(supabase_client, "companies", now_iso)
        return

    print("Reading companies (with associations)...")
    companies, max_ts_ms = read_companies_by_ids(ids, cursor_prop)

    print("Upserting into Supabase...")
    upsert_companies(companies)

    # Advance cursor to max timestamp we actually ingested for the chosen property
    new_cursor_ms = max_ts_ms if max_ts_ms is not None else since_ms

    update_sync_metadata(supabase_client, "companies", now_iso)

    print(
        f"Companies sync complete. Advanced cursor to {new_cursor_ms} using prop '{cursor_prop}'.")
435
+
436
+ # -----------------------------------------------------------------------------
437
+ # CLI
438
+ # -----------------------------------------------------------------------------
439
+
440
+
441
+ def _parse_cli_arg_to_ms(arg: str) -> int:
442
+ """
443
+ Accept:
444
+ - integer epoch ms
445
+ - ISO-8601 (Z or offset)
446
+ - YYYY-MM-DD (floors to 00:00Z)
447
+ """
448
+ # epoch ms or seconds
449
+ if re.fullmatch(r"\d{10,13}", arg):
450
+ v = int(arg)
451
+ if v < 10_000_000_000_000: # seconds -> ms
452
+ v *= 1000
453
+ return v
454
+
455
+ # YYYY-MM-DD
456
+ if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
457
+ d = datetime.datetime.strptime(
458
+ arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
459
+ return to_epoch_ms(floor_to_utc_midnight(d))
460
+
461
+ # ISO-8601
462
+ return to_epoch_ms(arg)
463
+
464
+
465
if __name__ == "__main__":
    import sys

    # No argument: sync from the default/bootstrapped cursor.
    if len(sys.argv) <= 1:
        main()
    else:
        try:
            main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
        except Exception as e:
            print(
                f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
            sys.exit(1)
python/hubspot_contacts.py ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HubSpot Contacts → Supabase (incremental since a millisecond cursor)
3
+
4
+ Usage from orchestrator:
5
+ import load_hubspot_contacts
6
+ load_hubspot_contacts.main(since_ms=<int milliseconds since epoch UTC>)
7
+ """
8
+
9
+ import os
10
+ import re
11
+ import time
12
+ import logging
13
+ import datetime
14
+ from typing import List, Dict, Tuple, Optional, Union
15
+
16
+ import httpx
17
+ import hubspot
18
+ from dotenv import load_dotenv
19
+ from supabase import create_client
20
+ from hubspot.crm.contacts import ApiException as ContactsApiException
21
+
22
+ from hubspot_utils import (
23
+ try_parse_int, parse_ts, get_property_label_mapping,
24
+ )
25
+ from supabase_utils import (
26
+ fetch_supabase_table, update_sync_metadata, enrich_supabase_row,
27
+ upload_raw_json_to_supabase, batched_insert,
28
+ )
29
+
30
+ # -----------------------------------------------------------------------------
31
+ # Logging
32
+ # -----------------------------------------------------------------------------
33
# Ensure the log directory exists before configuring the file handler;
# without this, logging.basicConfig raises FileNotFoundError on a fresh
# checkout where logs/ has not been created yet.
os.makedirs("logs", exist_ok=True)
logging.basicConfig(
    filename=f"logs/hubspot_contact_pipeline_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
    filemode="a",  # append so repeated runs on the same day share one file
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
39
+
40
+ # -----------------------------------------------------------------------------
41
+ # Environment
42
+ # -----------------------------------------------------------------------------
43
# Load .env so local runs pick up the same variables as the deployed service.
load_dotenv()
HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
# Optional bootstrap cursor if orchestrator doesn't provide one
BOOTSTRAP_SINCE_MS_ENV = os.getenv("HUBSPOT_CONTACTS_SINCE_MS")

# Fail fast at import time: every entry point below needs both services.
if not HUBSPOT_TOKEN:
    raise RuntimeError("HUBSPOT_TOKEN is not set")
if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
    raise RuntimeError("Supabase env vars are not set")

# Module-level singletons shared by all functions in this file.
hubspot_client = hubspot.Client.create(access_token=HUBSPOT_TOKEN)
supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
57
+
58
+ # -----------------------------------------------------------------------------
59
+ # Config
60
+ # -----------------------------------------------------------------------------
61
# Contact properties requested from HubSpot on every read.
# Both "lastmodifieddate" and "hs_lastmodifieddate" are fetched because the
# row mapper falls back from one to the other when building
# hubspot_modified_date.
CONTACT_PROPERTIES = [
    "full_name",
    "firstname",
    "lastname",
    "email",
    "phone",
    "job_title_level",
    "createdate",
    "lastmodifieddate",
    "hs_lastmodifieddate",
    "notes_last_updated",
    "associatedcompanyid",
]
74
+
75
+ # -----------------------------------------------------------------------------
76
+ # Time helpers
77
+ # -----------------------------------------------------------------------------
78
+ def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
79
+ if dt.tzinfo is None:
80
+ dt = dt.replace(tzinfo=datetime.timezone.utc)
81
+ return dt.astimezone(datetime.timezone.utc)
82
+
83
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Truncate *dt* to 00:00:00 UTC on the same calendar day."""
    return _ensure_utc(dt).replace(hour=0, minute=0, second=0, microsecond=0)
86
+
87
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """Parse an ISO-8601 string (trailing 'Z' allowed) into an aware UTC datetime."""
    if isinstance(value, str) and value.endswith("Z"):
        # Normalize the Zulu suffix for datetime.fromisoformat.
        value = value[:-1] + "+00:00"
    return _ensure_utc(datetime.datetime.fromisoformat(value))
92
+
93
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
    """Convert an ISO-8601 string or datetime into milliseconds since the Unix epoch."""
    if isinstance(dt_or_str, str):
        moment = _parse_iso_like_to_dt(dt_or_str)
    elif isinstance(dt_or_str, datetime.datetime):
        moment = _ensure_utc(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    return int(moment.timestamp() * 1000)
101
+
102
def parse_any_ts_ms(value: Optional[Union[str, int, float]]) -> Optional[int]:
    """
    Accepts ms-epoch / sec-epoch / ISO-8601; returns ms since epoch or None.
    Unparseable values are logged at WARNING level and mapped to None.
    """
    if value is None:
        return None
    text = str(value)
    try:
        numeric = int(text)
    except ValueError:
        pass
    else:
        # Anything below 13 digits is treated as epoch seconds.
        return numeric * 1000 if numeric < 10_000_000_000_000 else numeric
    try:
        return to_epoch_ms(text)
    except Exception:
        logging.warning("Could not parse timestamp value=%r", value)
        return None
120
+
121
+ # -----------------------------------------------------------------------------
122
+ # Mapping & helpers
123
+ # -----------------------------------------------------------------------------
124
def map_contact_data_for_db(contacts: List[Dict]) -> List[Dict]:
    """
    Translate enriched HubSpot contact dicts into hubspot_contacts table rows.

    Renames property keys to DB column names, coerces ids/timestamps, and
    runs each row through enrich_supabase_row before returning.
    """
    mapped = []
    for c in contacts:
        base_row = {
            "contact_id": try_parse_int(c["id"]),
            "full_name": c.get("full_name"),
            "first_name": c.get("firstname"),
            "last_name": c.get("lastname"),
            "email": c.get("email"),
            "phone_number": c.get("phone"),
            "job_title_level": c.get("job_title_level"),
            "hubspot_create_date": parse_ts(c.get("createdate")) or None,
            # Portals expose one of the two modified-date properties; prefer
            # "lastmodifieddate" and fall back to "hs_lastmodifieddate".
            "hubspot_modified_date": parse_ts(c.get("lastmodifieddate") or c.get("hs_lastmodifieddate")) or None,
            "hubspot_last_activity_date": parse_ts(c.get("notes_last_updated")) or None,
            "number_of_associated_deals": c.get("number_of_associated_deals", 0),
            "associated_company_id": try_parse_int(c.get("associatedcompanyid")),
        }
        mapped.append(enrich_supabase_row(base_row))
    return mapped
143
+
144
def contacts_are_different(new_row: Dict, old_row: Dict) -> bool:
    """Return True when any synced contact field differs (values compared as strings)."""
    tracked = (
        "full_name", "first_name", "last_name", "email", "phone_number",
        "hubspot_create_date", "hubspot_modified_date",
        "hubspot_last_activity_date", "number_of_associated_deals",
        "associated_company_id",
    )
    return any(str(new_row.get(k)) != str(old_row.get(k)) for k in tracked)
155
+
156
+ # -----------------------------------------------------------------------------
157
+ # Search IDs (ts > since_ms) with property fallback
158
+ # -----------------------------------------------------------------------------
159
def _search_contact_ids_from(since_ms: int, prop: str) -> List[str]:
    """
    Page through the CRM v3 search endpoint collecting contact IDs where
    {prop} > since_ms (epoch-ms), sorted ascending so the caller can advance
    its cursor monotonically. Raises httpx.HTTPStatusError on 4xx/5xx.
    """
    url = "https://api.hubapi.com/crm/v3/objects/contacts/search"
    headers = {
        "Authorization": f"Bearer {HUBSPOT_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    base_payload = {
        "filterGroups": [{
            "filters": [
                {"propertyName": prop, "operator": "GT", "value": str(since_ms)}
            ]
        }],
        "limit": 100,
        "sorts": [{"propertyName": prop, "direction": "ASCENDING"}],
    }

    collected: List[str] = []
    cursor: Optional[str] = None
    with httpx.Client(timeout=30.0) as client:
        while True:
            # Fresh copy per page so the paging cursor never leaks between requests.
            request_body = dict(base_payload)
            if cursor:
                request_body["after"] = cursor

            resp = client.post(url, headers=headers, json=request_body)
            if resp.status_code >= 400:
                # Log the response payload (JSON if parseable) before raising.
                try:
                    logging.error("Contacts search error for prop '%s': %s", prop, resp.json())
                except Exception:
                    logging.error("Contacts search error for prop '%s': %s", prop, resp.text)
                resp.raise_for_status()

            data = resp.json()
            collected.extend(obj["id"] for obj in data.get("results", []) or [])

            cursor = (data.get("paging") or {}).get("next", {}).get("after")
            if not cursor:
                break
            # Small pause between pages to stay under rate limits.
            time.sleep(0.1)

    return collected
205
+
206
def search_contact_ids_after_ms(since_ms: int) -> Tuple[List[str], str]:
    """
    Find contact IDs whose cursor property is strictly greater than *since_ms*.

    Tries cursor properties in order, falling back to the next only when the
    search request fails with an HTTP error:
      1) hs_lastmodifieddate
      2) lastmodifieddate
      3) createdate

    Returns (ids, prop_used). If every property errored, re-raises the last
    HTTP error; the final return is a defensive fallback.
    """
    # Fix: only "createdate" was being tried, contradicting the docstring and
    # leaving contacts that were *modified* (but created earlier) out of the
    # incremental sync. Restore the documented modified-date-first fallback.
    props_to_try = ["hs_lastmodifieddate", "lastmodifieddate", "createdate"]
    last_err: Optional[httpx.HTTPStatusError] = None

    for prop in props_to_try:
        try:
            ids = _search_contact_ids_from(since_ms, prop)
        except httpx.HTTPStatusError as e:
            last_err = e
            continue
        logging.info("Contacts search with '%s' returned %d IDs.", prop, len(ids))
        return ids, prop

    if last_err:
        raise last_err
    return [], "hs_lastmodifieddate"
228
+
229
+ # -----------------------------------------------------------------------------
230
+ # Read-by-ID (with associations) → enrich & track max cursor ts
231
+ # -----------------------------------------------------------------------------
232
def _enrich_contact_data_from_record(record, job_level_map: Optional[Dict[str, str]]) -> Dict:
    """
    Flatten a HubSpot contact record into a plain dict: requested properties,
    a full-name fallback, the distinct deal-association count, and the
    label-mapped job_title_level (when a map is available).
    """
    props = record.properties or {}
    contact_data: Dict[str, Optional[str]] = {"id": record.id}
    contact_data.update({name: props.get(name) for name in CONTACT_PROPERTIES})

    # Full name fallback: synthesize from first/last when missing or blank.
    current_name = contact_data.get("full_name")
    if not current_name or not str(current_name).strip():
        first = (contact_data.get("firstname") or "").strip()
        last = (contact_data.get("lastname") or "").strip()
        contact_data["full_name"] = f"{first} {last}".strip()

    # Count distinct associated deal ids.
    deal_count = 0
    associations = getattr(record, "associations", None)
    if associations and associations.get("deals"):
        bucket = associations["deals"]
        results = getattr(bucket, "results", None)
        if results:
            deal_count = len({entry.id for entry in results if getattr(entry, "id", None)})
    contact_data["number_of_associated_deals"] = deal_count

    # Replace the raw job_title_level code with its label; unmapped codes → None.
    if job_level_map:
        code = contact_data.get("job_title_level")
        contact_data["job_title_level"] = (
            job_level_map.get(code) if code in job_level_map else None
        )

    return contact_data
260
+
261
def read_contacts_by_ids(contact_ids: List[str], cursor_prop: str) -> Tuple[List[Dict], Optional[int]]:
    """
    Read each contact by ID (with deal associations) and enrich it.

    Returns:
        (contacts, max_ts_ms) where max_ts_ms is the largest observed value
        of *cursor_prop* across the fetched records (epoch ms), or None.

    Per-record errors are logged and skipped so one bad ID does not abort
    the whole batch.
    """
    if not contact_ids:
        return [], None

    contacts: List[Dict] = []
    assoc_types = ["deals"]

    # Fetch property label map once (best-effort: None just skips label mapping)
    try:
        job_level_map = get_property_label_mapping(hubspot_client, "contacts", "job_title_level")
    except Exception as e:
        logging.warning("Failed to fetch job_title_level map: %s", e)
        job_level_map = None

    max_ts_ms: Optional[int] = None

    for i, cid in enumerate(contact_ids, start=1):
        try:
            record = hubspot_client.crm.contacts.basic_api.get_by_id(
                contact_id=cid,
                properties=CONTACT_PROPERTIES,
                associations=assoc_types,
                archived=False,
            )

            # Track max timestamp for the chosen cursor property
            cursor_val = (record.properties or {}).get(cursor_prop)
            ts_ms = parse_any_ts_ms(cursor_val)
            if ts_ms is not None and (max_ts_ms is None or ts_ms > max_ts_ms):
                max_ts_ms = ts_ms

            contacts.append(_enrich_contact_data_from_record(record, job_level_map))

            if i % 200 == 0:
                logging.info("Read %d contacts...", i)

            time.sleep(0.05)  # gentle pacing

        except httpx.HTTPStatusError as e:
            logging.error("HTTP error reading contact %s: %s", cid, e)
        except (ContactsApiException, httpx.HTTPError) as e:
            logging.error("Error reading contact %s: %s", cid, e)

    return contacts, max_ts_ms
305
+
306
+ # -----------------------------------------------------------------------------
307
+ # Upsert flow
308
+ # -----------------------------------------------------------------------------
309
def upsert_contacts(contacts: List[Dict]) -> None:
    """
    Map enriched contacts to DB rows and write only new/changed rows.

    Diffs each mapped row against the current hubspot_contacts table
    (keyed by contact_id) and batch-inserts the differences.
    """
    if not contacts:
        print("No contacts to upsert.")
        return

    existing = fetch_supabase_table(supabase_client, "hubspot_contacts", "contact_id")
    rows_to_upsert: List[Dict] = []

    for c in contacts:
        contact_id = try_parse_int(c.get("id"))
        # Skip records whose ID failed to parse — nothing to key the upsert on.
        if not contact_id:
            continue

        # Map one contact at a time so a single bad record can be skipped above.
        mapped_row = map_contact_data_for_db([c])[0]
        existing_row = existing.get(str(contact_id))

        if not existing_row or contacts_are_different(mapped_row, existing_row):
            rows_to_upsert.append(mapped_row)

    print(f"{len(rows_to_upsert)} contacts to insert/update (out of {len(contacts)} read).")

    if rows_to_upsert:
        # upload_raw_json_to_supabase(supabase_client, rows_to_upsert, object_type="contacts")
        batched_insert(supabase_client, "hubspot_contacts", rows_to_upsert, batch_size=1000)
333
+
334
+ # -----------------------------------------------------------------------------
335
+ # Main (timestamp cursor)
336
+ # -----------------------------------------------------------------------------
337
def main(since_ms: Optional[int] = None):
    """
    Orchestrates:
      1) Search contact IDs with <cursor_prop> > since_ms (property fallback)
      2) Read full contacts (track max timestamp for <cursor_prop>)
      3) Upsert into Supabase
      4) Update sync metadata with { last_sync_metadata, last_sync_time, cursor_prop }

    NOTE(review): new_cursor_ms is computed and printed but never passed to
    update_sync_metadata — confirm the orchestrator derives the next since_ms
    from the metadata timestamp, otherwise the advanced cursor is lost.
    """
    # Resolve since_ms: explicit arg > env bootstrap > today@00:00Z
    if since_ms is None and BOOTSTRAP_SINCE_MS_ENV:
        try:
            since_ms = int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            raise RuntimeError("HUBSPOT_CONTACTS_SINCE_MS must be an integer (ms) if set.")

    if since_ms is None:
        # Default: today@00:00:00Z for first run
        today0 = floor_to_utc_midnight(datetime.datetime.now(datetime.timezone.utc))
        since_ms = to_epoch_ms(today0)

    print(f"Searching contacts with timestamp > {since_ms} ...")
    ids, cursor_prop = search_contact_ids_after_ms(since_ms)
    print(f"Search property: {cursor_prop}. Found {len(ids)} contact IDs.")

    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()

    if not ids:
        print("No contacts beyond the cursor. Updating sync metadata and exiting.")
        update_sync_metadata(supabase_client, "contacts", now_iso)
        return

    print("Reading contacts (with associations)...")
    contacts, max_ts_ms = read_contacts_by_ids(ids, cursor_prop)

    print("Upserting into Supabase...")
    upsert_contacts(contacts)

    # Advance cursor to max timestamp we actually ingested for the chosen property
    new_cursor_ms = max_ts_ms if max_ts_ms is not None else since_ms

    update_sync_metadata(supabase_client, "contacts", now_iso)

    print(f"Contacts sync complete. Advanced cursor to {new_cursor_ms} using prop '{cursor_prop}'.")
380
+
381
+ # -----------------------------------------------------------------------------
382
+ # CLI
383
+ # -----------------------------------------------------------------------------
384
+ def _parse_cli_arg_to_ms(arg: str) -> int:
385
+ """
386
+ Accept:
387
+ - integer epoch ms
388
+ - ISO-8601 (Z or offset)
389
+ - YYYY-MM-DD (floors to 00:00Z)
390
+ """
391
+ # epoch ms or seconds
392
+ if re.fullmatch(r"\d{10,13}", arg):
393
+ v = int(arg)
394
+ if v < 10_000_000_000_000: # seconds -> ms
395
+ v *= 1000
396
+ return v
397
+
398
+ # YYYY-MM-DD
399
+ if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
400
+ d = datetime.datetime.strptime(arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
401
+ return to_epoch_ms(floor_to_utc_midnight(d))
402
+
403
+ # ISO-8601
404
+ return to_epoch_ms(arg)
405
+
406
if __name__ == "__main__":
    import sys

    # No argument: sync from the default/bootstrapped cursor.
    if len(sys.argv) <= 1:
        main()
    else:
        try:
            main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
        except Exception as e:
            print(f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
            sys.exit(1)
python/hubspot_deals.py ADDED
@@ -0,0 +1,445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HubSpot Deals → Supabase (incremental since a millisecond cursor)
3
+
4
+ Usage from orchestrator:
5
+ import load_hubspot_deals
6
+ load_hubspot_deals.main(since_ms=<int milliseconds since epoch UTC>)
7
+
8
+ Direct CLI:
9
+ # epoch ms
10
+ python load_hubspot_deals.py 1754025600000
11
+ # ISO-8601
12
+ python load_hubspot_deals.py 2025-08-01T09:30:00Z
13
+ # Back-compat date (floors to 00:00Z)
14
+ python load_hubspot_deals.py 2025-08-01
15
+ """
16
+ import os
17
+ import re
18
+ import time
19
+ import logging
20
+ import datetime
21
+ from typing import List, Dict, Optional, Tuple, Union
22
+
23
+ import httpx
24
+ import hubspot
25
+ from dotenv import load_dotenv
26
+ from supabase import create_client
27
+ from hubspot.crm.deals import ApiException as DealsApiException
28
+
29
+ from hubspot_utils import (
30
+ parse_ts, try_parse_int, try_parse_float, deduplicate_by_key
31
+ )
32
+ from supabase_utils import (
33
+ insert_into_supabase_table, update_sync_metadata
34
+ )
35
+
36
+ # -----------------------------------------------------------------------------
37
+ # Constants
38
+ # -----------------------------------------------------------------------------
39
# Association type label used when falling back to raw association records.
DEAL_TO_COMPANY_ASSOC_TYPE = "deal_to_company"

# Static mapping of HubSpot team IDs to human-readable team names.
HUBSPOT_TEAM_MAP = {
    "1322863": "Fulfilment Team",
    "1322864": "Customer Services Team",
    "1322865": "Sales Team",
    "1448134": "Accounts & Billing Team",
    "3557793": "Marketing Team",
    "57348939": "MDR Team",
}

# Deal properties requested from the HubSpot API on every read.
DEAL_PROPERTIES = [
    "dealname",
    "hubspot_owner_id",
    "pipeline",
    "dealstage",
    "createdate",  # use createdate; hs_createdate is also seen in some portals
    "hs_createdate",
    "closedate",
    "contract_signed_date",
    "contract_end_date",
    "notes_last_updated",
    "num_contacted_notes",
    "hubspot_team_id",
    "hs_analytics_source",
    "number_of_cli",
    "amount",
    "hs_acv",
    "hs_tcv",
    "margin",
    "source_of_deal_2___migration",
    "hs_primary_associated_company",
    "hs_lastmodifieddate",  # for mapping/debugging
    "lastmodifieddate",
]
74
+
75
+ # -----------------------------------------------------------------------------
76
+ # Logging
77
+ # -----------------------------------------------------------------------------
78
# Ensure the log directory exists before basicConfig opens the file handler;
# logging.basicConfig raises FileNotFoundError when "logs/" is absent.
os.makedirs("logs", exist_ok=True)
logging.basicConfig(
    filename=f"logs/hubspot_deals_pipeline_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
    filemode="a",
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
84
+
85
+ # -----------------------------------------------------------------------------
86
+ # Environment
87
+ # -----------------------------------------------------------------------------
88
# Load configuration from .env and fail fast when required secrets are missing.
load_dotenv()

HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
# Optional bootstrap cursor if orchestrator doesn't provide one
BOOTSTRAP_SINCE_MS_ENV = os.getenv("HUBSPOT_DEALS_SINCE_MS")

if not HUBSPOT_TOKEN:
    raise RuntimeError("HUBSPOT_TOKEN is not set")
if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
    raise RuntimeError("Supabase env vars are not set")

# Shared API clients used by every function below.
hubspot_client = hubspot.Client.create(access_token=HUBSPOT_TOKEN)
supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
102
+
103
+ # -----------------------------------------------------------------------------
104
+ # Time helpers
105
+ # -----------------------------------------------------------------------------
106
+ def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
107
+ if dt.tzinfo is None:
108
+ dt = dt.replace(tzinfo=datetime.timezone.utc)
109
+ return dt.astimezone(datetime.timezone.utc)
110
+
111
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Truncate *dt* to 00:00:00 UTC on the same UTC day."""
    return _ensure_utc(dt).replace(hour=0, minute=0, second=0, microsecond=0)
114
+
115
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """Parse an ISO-8601 string (accepting a trailing 'Z') into a UTC datetime."""
    if isinstance(value, str) and value.endswith("Z"):
        value = f"{value[:-1]}+00:00"
    return _ensure_utc(datetime.datetime.fromisoformat(value))
120
+
121
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
    """Convert an ISO-8601 string or a datetime into epoch milliseconds (UTC).

    Raises TypeError for any other input type.
    """
    if isinstance(dt_or_str, datetime.datetime):
        dt = _ensure_utc(dt_or_str)
    elif isinstance(dt_or_str, str):
        dt = _parse_iso_like_to_dt(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    return int(dt.timestamp() * 1000)
129
+
130
def parse_any_ts_ms(value: Optional[Union[str, int, float]]) -> Optional[int]:
    """
    Parse a timestamp expressed as ms-epoch, sec-epoch, or ISO-8601.

    Returns milliseconds since epoch, or None when the value is missing or
    unparseable (a warning is logged in the latter case).
    """
    if value is None:
        return None
    try:
        v = int(str(value))
        # BUG FIX: the previous threshold (10_000_000_000_000) also matched
        # millisecond epochs (~1.7e12 today), multiplying them by 1000 again.
        # 10_000_000_000 cleanly separates second-epochs (< 1e10 until ~2286)
        # from millisecond-epochs, matching parse_hs_timestamp_ms in the
        # emails loader.
        if v < 10_000_000_000:  # seconds → ms
            v *= 1000
        return v
    except ValueError:
        pass
    try:
        return to_epoch_ms(str(value))
    except Exception:
        logging.warning("Could not parse timestamp value=%r", value)
        return None
148
+
149
+ # -----------------------------------------------------------------------------
150
+ # Pipeline / Stage labels
151
+ # -----------------------------------------------------------------------------
152
def get_pipeline_and_stage_mappings() -> Tuple[Dict[str, str], Dict[str, str]]:
    """Retrieve pipeline and stage label mappings for deals.

    Returns ({pipeline_id: label}, {stage_id: label}); both dicts are empty
    when the HubSpot API call fails (the error is logged, not raised).
    """
    pipelines: Dict[str, str] = {}
    stages: Dict[str, str] = {}
    try:
        resp = hubspot_client.crm.pipelines.pipelines_api.get_all(object_type="deals")
        for pipeline in resp.results:
            pipelines[pipeline.id] = pipeline.label
            for stage in pipeline.stages:
                stages[stage.id] = stage.label
        return pipelines, stages
    except Exception as e:
        logging.error("Failed to fetch pipeline/stage mappings: %s", e)
        return {}, {}
166
+
167
+ # -----------------------------------------------------------------------------
168
+ # Search IDs (ts > since_ms) with property fallback
169
+ # -----------------------------------------------------------------------------
170
def _search_deal_ids_from(since_ms: int, prop: str) -> List[str]:
    """
    Page through the CRM search API collecting IDs of deals whose *prop*
    is strictly greater than *since_ms* (epoch-ms), sorted ascending so
    the caller can advance its cursor monotonically.
    """
    url = "https://api.hubapi.com/crm/v3/objects/deals/search"
    headers = {
        "Authorization": f"Bearer {HUBSPOT_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    base_body = {
        "filterGroups": [{
            "filters": [
                {"propertyName": prop, "operator": "GT", "value": str(since_ms)},
            ]
        }],
        "limit": 100,
        "sorts": [{"propertyName": prop, "direction": "ASCENDING"}],
    }

    found: List[str] = []
    page_cursor: Optional[str] = None
    with httpx.Client(timeout=30.0) as client:
        while True:
            request_body = dict(base_body)
            if page_cursor:
                request_body["after"] = page_cursor

            resp = client.post(url, headers=headers, json=request_body)
            if resp.status_code >= 400:
                # Log the structured error body when available before raising.
                try:
                    logging.error("Deal search error for prop '%s': %s", prop, resp.json())
                except Exception:
                    logging.error("Deal search error for prop '%s': %s", prop, resp.text)
                resp.raise_for_status()

            data = resp.json()
            found.extend(obj["id"] for obj in data.get("results", []) or [])

            page_cursor = (data.get("paging") or {}).get("next", {}).get("after")
            if not page_cursor:
                return found
            time.sleep(0.1)  # stay friendly to the rate limiter
216
+
217
def search_deal_ids_after_ms(since_ms: int) -> Tuple[List[str], str]:
    """
    Search for deal IDs newer than *since_ms*, trying cursor properties in
    order and returning (ids, prop_used) for the first successful search:
      1) createdate
      2) hs_createdate

    CONSISTENCY FIX: the docstring previously advertised hs_lastmodifieddate /
    lastmodifieddate as well, but only the creation-date properties were ever
    queried, and the defensive fallback returned the never-tried
    "hs_lastmodifieddate" as the cursor property. Both now match the code.
    """
    props_to_try = ["createdate", "hs_createdate"]
    last_err: Optional[httpx.HTTPStatusError] = None

    for prop in props_to_try:
        try:
            ids = _search_deal_ids_from(since_ms, prop)
            logging.info("Deal search with '%s' returned %d IDs.", prop, len(ids))
            return ids, prop
        except httpx.HTTPStatusError as e:
            last_err = e
            continue

    if last_err:
        raise last_err
    # Unreachable while props_to_try is non-empty; kept as a defensive default.
    return [], props_to_try[0]
240
+
241
+ # -----------------------------------------------------------------------------
242
+ # Read-by-ID (with associations) → map rows and track max cursor ts
243
+ # -----------------------------------------------------------------------------
244
def _extract_primary_company_id(record, props: Dict) -> Optional[int]:
    """
    Resolve the primary associated company ID for a deal.

    Prefers the hs_primary_associated_company property; otherwise falls back
    to the first company association of type 'deal_to_company' (or any
    association object that lacks a type attribute). Returns None when
    neither source yields an ID.
    """
    primary = props.get("hs_primary_associated_company")
    if primary:
        return try_parse_int(primary)

    assoc = getattr(record, "associations", None)
    if not assoc:
        return None
    companies = assoc.get("companies")
    if not (companies and getattr(companies, "results", None)):
        return None
    for candidate in companies.results:
        if not hasattr(candidate, "type") or getattr(candidate, "type", None) == DEAL_TO_COMPANY_ASSOC_TYPE:
            return try_parse_int(candidate.id)
    return None
259
+
260
def read_deals_by_ids(
    deal_ids: List[str],
    cursor_prop: str,
) -> Tuple[List[Dict], List[Dict], Optional[int]]:
    """
    Read deals by ID (including contacts/companies associations) and map
    them to Supabase row dicts.

    Returns (all_deals, deal_contact_links, max_ts_ms) where max_ts_ms is
    the largest *cursor_prop* timestamp observed across ingested deals
    (None when no deal had a parseable value).
    """
    if not deal_ids:
        return [], [], None

    rows: List[Dict] = []
    contact_links: List[Dict] = []

    assoc_types = ["contacts", "companies"]
    pipeline_map, stage_map = get_pipeline_and_stage_mappings()

    max_ts_ms: Optional[int] = None

    for i, deal_id in enumerate(deal_ids, start=1):
        try:
            record = hubspot_client.crm.deals.basic_api.get_by_id(
                deal_id=deal_id, properties=DEAL_PROPERTIES, associations=assoc_types, archived=False
            )
            props = record.properties or {}

            # Track the max timestamp for the property the search used, so
            # the caller can advance its cursor monotonically.
            ts_ms = parse_any_ts_ms(props.get(cursor_prop))
            if ts_ms is not None and (max_ts_ms is None or ts_ms > max_ts_ms):
                max_ts_ms = ts_ms

            # Contact associations → link rows (numeric IDs only)
            if getattr(record, "associations", None) and record.associations.get("contacts"):
                bucket = record.associations["contacts"]
                if getattr(bucket, "results", None):
                    for assoc in bucket.results:
                        if assoc.id and assoc.id.isdigit():
                            contact_links.append({
                                "deal_id": try_parse_int(record.id),
                                "contact_id": try_parse_int(assoc.id),
                            })

            # Primary company (property first, association fallback)
            company_id = _extract_primary_company_id(record, props)

            # Created date: portals report either createdate or hs_createdate.
            created_iso = props.get("createdate") or props.get("hs_createdate")

            rows.append({
                "deal_id": try_parse_int(record.id),
                "dealname": props.get("dealname"),
                "hubspot_owner_id": try_parse_int(props.get("hubspot_owner_id")),
                "pipeline_id": try_parse_int(props.get("pipeline")),
                "pipeline_label": pipeline_map.get(props.get("pipeline"), ""),
                "dealstage": props.get("dealstage"),
                "dealstage_label": stage_map.get(props.get("dealstage"), ""),
                "hubspot_createdate": parse_ts(created_iso),
                "closedate": parse_ts(props.get("closedate")) or None,
                "contract_signed_date": props.get("contract_signed_date") or None,
                "contract_end_date": props.get("contract_end_date") or None,
                "hubspot_last_activity_date": parse_ts(props.get("notes_last_updated")),
                "num_contacted_notes": props.get("num_contacted_notes"),
                "hubspot_team_id": try_parse_int(props.get("hubspot_team_id")),
                "hubspot_team_label": HUBSPOT_TEAM_MAP.get(props.get("hubspot_team_id"), ""),
                "hs_analytics_source": props.get("hs_analytics_source"),
                "number_of_cli": try_parse_int(props.get("number_of_cli")),
                "amount": try_parse_int(props.get("amount")),
                "annual_contract_value": try_parse_float(props.get("hs_acv")),
                "total_contract_value": try_parse_float(props.get("hs_tcv")),
                "margin": try_parse_float(props.get("margin")),
                "source_of_deal": props.get("source_of_deal_2___migration"),
                "company_id": company_id,
            })

            # Periodic progress log + brief pause to respect rate limits.
            if i % 200 == 0:
                logging.info("Read %d deals...", i)
                time.sleep(0.05)

        except httpx.HTTPStatusError as e:
            logging.error("HTTP error reading deal %s: %s", deal_id, e)
        except (DealsApiException, httpx.HTTPError) as e:
            logging.error("Error reading deal %s: %s", deal_id, e)

    return rows, contact_links, max_ts_ms
345
+
346
+ # -----------------------------------------------------------------------------
347
+ # Upsert
348
+ # -----------------------------------------------------------------------------
349
def upsert_deals(deals: List[Dict], deal_contact_links: List[Dict]) -> None:
    """Upsert deal rows and deduplicated deal→contact link rows into Supabase."""
    if deals:
        insert_into_supabase_table(supabase_client, "hubspot_deals", deals)
        print(f"Upserted {len(deals)} deals.")

    if not deal_contact_links:
        return
    # Deduplicate on the composite key before upserting the link table.
    unique_links = deduplicate_by_key(deal_contact_links, key=("deal_id", "contact_id"))
    insert_into_supabase_table(
        supabase_client,
        "hubspot_deal_contacts",
        unique_links,
        on_conflict=["deal_id", "contact_id"],
    )
    print(f"Upserted {len(unique_links)} deal-contact associations.")
363
+
364
+ # -----------------------------------------------------------------------------
365
+ # Main (timestamp cursor)
366
+ # -----------------------------------------------------------------------------
367
def main(since_ms: Optional[int] = None):
    """
    Run one incremental deals sync:
      1) Search deal IDs with <cursor_prop> > since_ms (property fallback)
      2) Read full deals with associations (tracking the max cursor timestamp)
      3) Upsert into Supabase
      4) Record sync metadata

    since_ms resolution order: explicit argument → HUBSPOT_DEALS_SINCE_MS
    env var → today at 00:00:00Z.
    """
    if since_ms is None and BOOTSTRAP_SINCE_MS_ENV:
        try:
            since_ms = int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            raise RuntimeError("HUBSPOT_DEALS_SINCE_MS must be an integer (ms) if set.")

    if since_ms is None:
        # First run: default to today@00:00:00Z
        since_ms = to_epoch_ms(floor_to_utc_midnight(datetime.datetime.now(datetime.timezone.utc)))

    print(f"Searching deals with timestamp > {since_ms} ...")
    ids, cursor_prop = search_deal_ids_after_ms(since_ms)
    print(f"Search property: {cursor_prop}. Found {len(ids)} deal IDs.")

    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()

    if not ids:
        print("No deals beyond the cursor. Updating sync metadata and exiting.")
        update_sync_metadata(supabase_client, "hubspot_deals", now_iso)
        return

    print("Reading deals (with associations)...")
    deals, deal_contact_links, max_ts_ms = read_deals_by_ids(ids, cursor_prop)

    print("Upserting into Supabase...")
    upsert_deals(deals, deal_contact_links)

    # Advance cursor to the max timestamp actually ingested for cursor_prop.
    # NOTE(review): new_cursor_ms is only reported, never persisted —
    # update_sync_metadata stores now_iso; confirm the orchestrator derives
    # its next since_ms from somewhere else.
    new_cursor_ms = max_ts_ms if max_ts_ms is not None else since_ms

    update_sync_metadata(supabase_client, "hubspot_deals", now_iso)

    print(f"Deals sync complete. Advanced cursor to {new_cursor_ms} using prop '{cursor_prop}'.")
410
+
411
+ # -----------------------------------------------------------------------------
412
+ # CLI
413
+ # -----------------------------------------------------------------------------
414
+ def _parse_cli_arg_to_ms(arg: str) -> int:
415
+ """
416
+ Accept:
417
+ - integer epoch ms
418
+ - ISO-8601 (Z or offset)
419
+ - YYYY-MM-DD (floors to 00:00Z)
420
+ """
421
+ # epoch ms or seconds
422
+ if re.fullmatch(r"\d{10,13}", arg):
423
+ v = int(arg)
424
+ if v < 10_000_000_000_000: # seconds -> ms
425
+ v *= 1000
426
+ return v
427
+
428
+ # YYYY-MM-DD
429
+ if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
430
+ d = datetime.datetime.strptime(arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
431
+ return to_epoch_ms(floor_to_utc_midnight(d))
432
+
433
+ # ISO-8601
434
+ return to_epoch_ms(arg)
435
+
436
if __name__ == "__main__":
    import sys

    # Optional single CLI argument: the starting cursor (epoch ms / ISO / date).
    if len(sys.argv) < 2:
        main()
    else:
        try:
            main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
        except Exception as e:
            print(f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
            sys.exit(1)
python/hubspot_emails.py ADDED
@@ -0,0 +1,472 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HubSpot Emails → Supabase (incremental since a millisecond cursor)
3
+
4
+ Usage from orchestrator:
5
+ import load_hubspot_emails
6
+ load_hubspot_emails.main(since_ms=<int milliseconds since epoch UTC>)
7
+
8
+ Direct CLI:
9
+ # epoch ms
10
+ python load_hubspot_emails.py 1754025600000
11
+
12
+ # ISO-8601 (Z or offset)
13
+ python load_hubspot_emails.py 2025-08-01T09:30:00Z
14
+
15
+ # Back-compat date (floors to 00:00:00Z)
16
+ python load_hubspot_emails.py 2025-08-01
17
+ """
18
+
19
+ import os
20
+ import time
21
+ import logging
22
+ import datetime
23
+ import re
24
+ from typing import List, Dict, Optional, Tuple, Union
25
+
26
+ import httpx
27
+ import hubspot
28
+ from dotenv import load_dotenv
29
+ from supabase import create_client
30
+ from hubspot.crm.objects.emails import ApiException as EmailApiException
31
+
32
+ from hubspot_utils import (
33
+ clean_text,
34
+ )
35
+ from supabase_utils import (
36
+ batched_insert, update_sync_metadata,
37
+ )
38
+
39
+ # -----------------------------------------------------------------------------
40
+ # Logging
41
+ # -----------------------------------------------------------------------------
42
# Ensure the log directory exists before the file handler opens;
# logging.basicConfig raises FileNotFoundError when "logs/" is absent.
os.makedirs("logs", exist_ok=True)
logging.basicConfig(
    filename=f"logs/hubspot_email_pipeline_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
    filemode="a",
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
48
+
49
+ # -----------------------------------------------------------------------------
50
+ # Environment
51
+ # -----------------------------------------------------------------------------
52
# Load configuration from .env and fail fast when required secrets are missing.
load_dotenv()

HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
# Optional bootstrap cursor if orchestrator doesn't provide one
BOOTSTRAP_SINCE_MS_ENV = os.getenv("HUBSPOT_EMAILS_SINCE_MS")

if not HUBSPOT_TOKEN:
    raise RuntimeError("HUBSPOT_TOKEN is not set")
if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
    raise RuntimeError("Supabase env vars are not set")

# Shared API clients used by every function below.
hubspot_client = hubspot.Client.create(access_token=HUBSPOT_TOKEN)
supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
66
+
67
+ # -----------------------------------------------------------------------------
68
+ # Config
69
+ # -----------------------------------------------------------------------------
70
# Email properties requested from the HubSpot API on every read.
EMAIL_PROPERTIES = [
    "hs_email_from_email",
    "hs_email_to_email",
    "hs_email_subject",
    "hs_email_direction",
    "hs_email_text",
    "hs_timestamp",
]
78
+
79
+ # -----------------------------------------------------------------------------
80
+ # Email parsing
81
+ # -----------------------------------------------------------------------------
82
# Loose email matcher used to pull addresses out of free-form strings.
EMAIL_RE = re.compile(r'[\w\.\+\-]+@[\w\.\-]+\.\w+')


def parse_emails(raw: Optional[object]) -> List[str]:
    """
    Extract a sorted, deduplicated, lower-cased list of email addresses.

    *raw* may be None, a comma/semicolon-separated string, a list of such
    strings (non-string list items are skipped), or any other object
    (stringified as a last resort).
    """
    if raw is None:
        return []
    found: List[str] = []
    if isinstance(raw, list):
        for item in raw or []:
            if item and isinstance(item, str):
                for chunk in re.split(r'[;,]', item):
                    found.extend(EMAIL_RE.findall(chunk))
    elif isinstance(raw, str):
        for chunk in re.split(r'[;,]', raw):
            found.extend(EMAIL_RE.findall(chunk))
    else:
        found.extend(EMAIL_RE.findall(str(raw)))
    return sorted({c.strip().lower() for c in found if c and c.strip()})
102
+
103
+ # -----------------------------------------------------------------------------
104
+ # Time helpers
105
+ # -----------------------------------------------------------------------------
106
+
107
+
108
+ def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
109
+ if dt.tzinfo is None:
110
+ dt = dt.replace(tzinfo=datetime.timezone.utc)
111
+ return dt.astimezone(datetime.timezone.utc)
112
+
113
+
114
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Truncate *dt* to 00:00:00 UTC on the same UTC day."""
    return _ensure_utc(dt).replace(hour=0, minute=0, second=0, microsecond=0)
117
+
118
+
119
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """Parse an ISO-8601 string (accepting a trailing 'Z') into a UTC datetime."""
    if value.endswith("Z"):
        value = f"{value[:-1]}+00:00"
    return _ensure_utc(datetime.datetime.fromisoformat(value))
124
+
125
+
126
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
    """Convert an ISO-8601 string or a datetime into epoch milliseconds (UTC).

    Raises TypeError for any other input type.
    """
    if isinstance(dt_or_str, datetime.datetime):
        dt = _ensure_utc(dt_or_str)
    elif isinstance(dt_or_str, str):
        dt = _parse_iso_like_to_dt(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    return int(dt.timestamp() * 1000)
134
+
135
+
136
def parse_hs_timestamp_ms(value: Optional[Union[str, int, float]]) -> Optional[int]:
    """
    Normalize a HubSpot hs_timestamp into epoch milliseconds.

    HubSpot CRM datetime properties often come back as strings. Accepts
    ms-epoch or sec-epoch (as str/int/float) and ISO-8601 strings; returns
    None for missing or unparseable values (logged as a warning).
    """
    if value is None:
        return None
    try:
        ts = int(str(value))
    except ValueError:
        # Not an integer literal — fall back to ISO-8601 parsing.
        try:
            return to_epoch_ms(str(value))
        except Exception:
            logging.warning("Could not parse hs_timestamp=%r", value)
            return None
    # Values below ~1e10 (< ~2001-09 when read as ms) are second-epochs;
    # up-convert to ms just in case.
    if ts < 10_000_000_000:
        ts = ts * 1000
    return ts
162
+
163
+ # -----------------------------------------------------------------------------
164
+ # Search IDs (ts > since_ms)
165
+ # -----------------------------------------------------------------------------
166
+
167
+
168
def search_email_ids_after_ms(since_ms: int) -> List[str]:
    """
    Page through the CRM search API collecting IDs of emails whose
    hs_timestamp is strictly greater than *since_ms*, sorted ascending so
    the maximum timestamp seen downstream is monotonic.
    """
    url = "https://api.hubapi.com/crm/v3/objects/emails/search"
    headers = {
        "Authorization": f"Bearer {HUBSPOT_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    base_body = {
        "filterGroups": [{
            "filters": [
                {"propertyName": "hs_timestamp",
                 "operator": "GT", "value": str(since_ms)}
            ]
        }],
        "limit": 100,
        "sorts": [{"propertyName": "hs_timestamp", "direction": "ASCENDING"}],
        # Only IDs are needed here; full records are read later.
    }

    found: List[str] = []
    page_cursor: Optional[str] = None
    with httpx.Client(timeout=30.0) as client:
        while True:
            request_body = dict(base_body)
            if page_cursor:
                request_body["after"] = page_cursor

            resp = client.post(url, headers=headers, json=request_body)
            if resp.status_code >= 400:
                # Log the structured error body when available before raising.
                try:
                    logging.error("Email search error: %s", resp.json())
                except Exception:
                    logging.error("Email search error: %s", resp.text)
                resp.raise_for_status()

            data = resp.json()
            found.extend(obj["id"] for obj in data.get("results", []) or [])

            page_cursor = (data.get("paging") or {}).get("next", {}).get("after")
            if not page_cursor:
                break
            time.sleep(0.1)  # stay friendly to the rate limiter

    logging.info("Found %d email IDs after %d.", len(found), since_ms)
    return found
218
+
219
+ # -----------------------------------------------------------------------------
220
+ # Read-by-ID (with associations)
221
+ # -----------------------------------------------------------------------------
222
+
223
+
224
def read_emails_by_ids(
    email_ids: List[str]
) -> Tuple[List[Dict], List[Dict], List[Dict], List[Dict], List[Dict], Optional[int]]:
    """
    Read emails by ID with properties and associations
    (companies/contacts/deals/tickets).

    Returns a 6-tuple:
        (email_metadata_data, email_company_links, email_contact_links,
         email_deal_links, email_ticket_links, max_hs_timestamp_ms)

    BUG FIX: the annotation previously declared a 5-element tuple and the
    empty-input path returned only 5 values, while the success path returned
    6 — callers unpacking six elements would crash on empty input.
    """
    if not email_ids:
        return [], [], [], [], [], None

    email_metadata_data: List[Dict] = []
    email_company_links: List[Dict] = []
    email_contact_links: List[Dict] = []
    email_deal_links: List[Dict] = []
    email_ticket_links: List[Dict] = []

    assoc_types = ["companies", "contacts", "deals", "tickets"]
    max_ts_ms: Optional[int] = None

    for i, email_id in enumerate(email_ids, start=1):
        try:
            record = hubspot_client.crm.objects.emails.basic_api.get_by_id(
                email_id, properties=EMAIL_PROPERTIES, associations=assoc_types, archived=False
            )
            props = record.properties or {}

            cleaned = clean_text(props.get("hs_email_text") or "")

            # Robust timestamp handling (ms/sec epoch or ISO string).
            hs_ts_ms = parse_hs_timestamp_ms(props.get("hs_timestamp"))
            if hs_ts_ms is not None:
                if (max_ts_ms is None) or (hs_ts_ms > max_ts_ms):
                    max_ts_ms = hs_ts_ms
                sent_at_iso = datetime.datetime.fromtimestamp(
                    hs_ts_ms / 1000, tz=datetime.timezone.utc).isoformat()
            else:
                sent_at_iso = None

            email_metadata_data.append({
                "email_id": record.id,
                "subject": props.get("hs_email_subject"),
                "from_email": props.get("hs_email_from_email") or "",
                "to_emails": parse_emails(props.get("hs_email_to_email")),
                "sent_at": sent_at_iso,
                "direction": props.get("hs_email_direction"),
                "email_content": cleaned,
            })

            # Association buckets → link rows (numeric IDs only).
            assoc = record.associations or {}
            if assoc.get("companies") and getattr(assoc["companies"], "results", None):
                for a in assoc["companies"].results:
                    if a.id and a.id.isdigit():
                        email_company_links.append(
                            {"email_id": record.id, "company_id": int(a.id)})
            if assoc.get("contacts") and getattr(assoc["contacts"], "results", None):
                for a in assoc["contacts"].results:
                    if a.id and a.id.isdigit():
                        email_contact_links.append(
                            {"email_id": record.id, "contact_id": int(a.id)})
            if assoc.get("deals") and getattr(assoc["deals"], "results", None):
                for a in assoc["deals"].results:
                    if a.id and a.id.isdigit():
                        email_deal_links.append(
                            {"email_id": record.id, "deal_id": int(a.id)})
            if assoc.get("tickets") and getattr(assoc["tickets"], "results", None):
                for a in assoc["tickets"].results:
                    if a.id and a.id.isdigit():
                        email_ticket_links.append(
                            {"email_id": record.id, "ticket_id": int(a.id)})

            # Periodic progress log + brief pause to respect rate limits.
            if i % 200 == 0:
                logging.info("Read %d emails...", i)
                time.sleep(0.05)

        except httpx.HTTPStatusError as e:
            logging.error("HTTP error reading email %s: %s", email_id, e)
        except (EmailApiException, httpx.HTTPError) as e:
            logging.error("Error reading email %s: %s", email_id, e)

    return (
        email_metadata_data, email_company_links, email_contact_links,
        email_deal_links, email_ticket_links, max_ts_ms
    )
308
+
309
+ # -----------------------------------------------------------------------------
310
+ # Upsert
311
+ # -----------------------------------------------------------------------------
312
+
313
+
314
def upsert_emails(
    email_metadata_data: List[Dict],
    email_company_links: List[Dict],
    email_contact_links: List[Dict],
    email_deal_links: List[Dict],
    email_ticket_links: List[Dict],
) -> None:
    """Batch-upsert email metadata and association link rows into Supabase."""
    if email_metadata_data:
        batched_insert(
            supabase_client,
            "hubspot_emails",
            email_metadata_data,
            batch_size=1000,
            # If your helper supports on_conflict:
            # on_conflict=["email_id"]
        )
        print(f"Upserted {len(email_metadata_data)} email metadata rows.")

    # Each link table shares the same insert shape; drive them from a spec.
    link_tables = [
        ("hubspot_email_companies", email_company_links, "company_id", "email-company"),
        ("hubspot_email_contacts", email_contact_links, "contact_id", "email-contact"),
        ("hubspot_email_deals", email_deal_links, "deal_id", "email-deal"),
        ("hubspot_email_tickets", email_ticket_links, "ticket_id", "email-ticket"),
    ]
    for table, rows, id_col, label in link_tables:
        if not rows:
            continue
        batched_insert(
            supabase_client,
            table,
            rows,
            batch_size=1000,
            on_conflict=["email_id", id_col],
        )
        print(f"Upserted {len(rows)} {label} links.")
371
+
372
+ # -----------------------------------------------------------------------------
373
+ # Main (timestamp cursor)
374
+ # -----------------------------------------------------------------------------
375
+
376
+
377
def main(since_ms: Optional[int] = None):
    """
    Run one incremental email sync:
      1) Search email IDs with hs_timestamp > since_ms
      2) Read full emails with associations (tracking the max timestamp)
      3) Upsert into Supabase
      4) Record sync metadata (last_sync_time; helper sets updated_at)

    since_ms resolution order: explicit argument → HUBSPOT_EMAILS_SINCE_MS
    env var → today at 00:00:00Z.
    """
    if since_ms is None and BOOTSTRAP_SINCE_MS_ENV:
        try:
            since_ms = int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            raise RuntimeError(
                "HUBSPOT_EMAILS_SINCE_MS must be an integer (ms) if set.")

    if since_ms is None:
        # First run: default to today@00:00:00Z
        since_ms = to_epoch_ms(
            floor_to_utc_midnight(datetime.datetime.now(datetime.timezone.utc)))

    print(f"Searching emails with hs_timestamp > {since_ms} ...")
    ids = search_email_ids_after_ms(since_ms)
    print(f"Found {len(ids)} email IDs.")

    if not ids:
        print("No emails beyond the cursor. Updating sync metadata and exiting.")
        # Record only last_sync_time; helper sets updated_at
        update_sync_metadata(
            supabase_client, "emails",
            datetime.datetime.now(datetime.timezone.utc).isoformat())
        return

    print("Reading emails (with associations)...")
    (
        email_metadata_data, email_company_links, email_contact_links,
        email_deal_links, email_ticket_links, max_ts_ms
    ) = read_emails_by_ids(ids)

    print("Upserting into Supabase...")
    upsert_emails(
        email_metadata_data,
        email_company_links,
        email_contact_links,
        email_deal_links,
        email_ticket_links,
    )

    # Advance cursor to the max timestamp actually ingested.
    # NOTE(review): new_cursor_ms is only reported, never persisted —
    # update_sync_metadata stores now_iso; confirm the orchestrator derives
    # its next since_ms from somewhere else.
    new_cursor_ms = max_ts_ms if max_ts_ms is not None else since_ms
    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()

    update_sync_metadata(supabase_client, "emails", now_iso)

    print(f"Emails sync complete. Advanced cursor to {new_cursor_ms}.")
432
+
433
+ # -----------------------------------------------------------------------------
434
+ # CLI
435
+ # -----------------------------------------------------------------------------
436
+
437
+
438
+ def _parse_cli_arg_to_ms(arg: str) -> int:
439
+ """
440
+ Accept:
441
+ - integer epoch ms
442
+ - ISO-8601 (Z or offset)
443
+ - YYYY-MM-DD (floors to 00:00Z for convenience/back-compat)
444
+ """
445
+ # epoch ms
446
+ if re.fullmatch(r"\d{10,13}", arg):
447
+ v = int(arg)
448
+ if v < 10_000_000_000_000: # seconds -> ms
449
+ v *= 1000
450
+ return v
451
+
452
+ # YYYY-MM-DD
453
+ if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
454
+ d = datetime.datetime.strptime(
455
+ arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
456
+ return to_epoch_ms(floor_to_utc_midnight(d))
457
+
458
+ # ISO-8601
459
+ return to_epoch_ms(arg)
460
+
461
+
462
if __name__ == "__main__":
    import sys

    # Optional single CLI argument: the starting cursor (epoch ms / ISO / date).
    if len(sys.argv) < 2:
        main()
    else:
        try:
            main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
        except Exception as e:
            print(
                f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
            sys.exit(1)
python/hubspot_tickets.py ADDED
@@ -0,0 +1,491 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HubSpot Tickets → Supabase (incremental since a millisecond cursor)
3
+
4
+ Usage from orchestrator:
5
+ import load_hubspot_tickets
6
+ load_hubspot_tickets.main(since_ms=<int milliseconds since epoch UTC>)
7
+
8
+ Direct CLI:
9
+ # epoch ms
10
+ python load_hubspot_tickets.py 1754025600000
11
+ # ISO-8601
12
+ python load_hubspot_tickets.py 2025-08-01T09:30:00Z
13
+ # Back-compat date (floors to 00:00Z)
14
+ python load_hubspot_tickets.py 2025-08-01
15
+ """
16
+
17
+ import os
18
+ import re
19
+ import time
20
+ import logging
21
+ import datetime
22
+ from typing import List, Dict, Optional, Tuple, Union
23
+
24
+ import httpx
25
+ import hubspot
26
+ from dotenv import load_dotenv
27
+ from supabase import create_client
28
+ from hubspot.crm.tickets import ApiException as TicketsApiException
29
+
30
+ from hubspot_utils import (
31
+ parse_ts, try_parse_int, deduplicate_by_key,
32
+ )
33
+ from supabase_utils import (
34
+ batched_insert, update_sync_metadata,
35
+ )
36
+
37
+ # -----------------------------------------------------------------------------
38
+ # Logging
39
+ # -----------------------------------------------------------------------------
40
+ logging.basicConfig(
41
+ filename=f"logs/hubspot_tickets_pipeline_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
42
+ filemode="a",
43
+ level=logging.INFO,
44
+ format="%(asctime)s [%(levelname)s] %(message)s",
45
+ )
46
+
47
+ # -----------------------------------------------------------------------------
48
+ # Environment
49
+ # -----------------------------------------------------------------------------
50
+ load_dotenv()
51
+ HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
52
+ SUPABASE_URL = os.getenv("SUPABASE_URL")
53
+ SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
54
+ # Optional bootstrap cursor if orchestrator doesn't provide one
55
+ BOOTSTRAP_SINCE_MS_ENV = os.getenv("HUBSPOT_TICKETS_SINCE_MS")
56
+
57
+ if not HUBSPOT_TOKEN:
58
+ raise RuntimeError("HUBSPOT_TOKEN is not set")
59
+ if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
60
+ raise RuntimeError("Supabase env vars are not set")
61
+
62
+ hubspot_client = hubspot.Client.create(access_token=HUBSPOT_TOKEN)
63
+ supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
64
+
65
+ # -----------------------------------------------------------------------------
66
+ # Config
67
+ # -----------------------------------------------------------------------------
68
+ # Custom object association (CLI)
69
+ CLI_ASSOC_TYPE_ID = "2-25629083"
70
+ CLI_ASSOC_FALLBACK_KEY = "p8600202_cli_s"
71
+
72
+ TICKET_PROPERTIES = [
73
+ "closed_date",
74
+ "content",
75
+ "ticket_type",
76
+ "createdate",
77
+ "hs_created_by_user_id",
78
+ "hs_lastmodifieddate",
79
+ "hs_object_id",
80
+ "hs_pipeline",
81
+ "hs_pipeline_stage",
82
+ "hs_ticket_priority",
83
+ "subject",
84
+ "hubspot_owner_id",
85
+ "source_type",
86
+ ]
87
+
88
+ # -----------------------------------------------------------------------------
89
+ # Time helpers
90
+ # -----------------------------------------------------------------------------
91
+
92
+
93
+ def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
94
+ if dt.tzinfo is None:
95
+ dt = dt.replace(tzinfo=datetime.timezone.utc)
96
+ return dt.astimezone(datetime.timezone.utc)
97
+
98
+
99
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Truncate *dt* (coerced to UTC) down to 00:00:00.000000 UTC."""
    normalized = _ensure_utc(dt)
    return normalized.replace(hour=0, minute=0, second=0, microsecond=0)
102
+
103
+
104
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """Parse an ISO-8601 string (a trailing "Z" is allowed) into an aware UTC datetime."""
    # fromisoformat cannot digest the "Z" suffix, so rewrite it as an offset.
    normalized = value[:-1] + "+00:00" if value.endswith("Z") else value
    return _ensure_utc(datetime.datetime.fromisoformat(normalized))
109
+
110
+
111
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
    """
    Convert an ISO-8601 string or a datetime into epoch milliseconds (UTC).

    :param dt_or_str: aware/naive datetime (naive treated as UTC) or ISO string.
    :return: milliseconds since the Unix epoch.
    :raises TypeError: for any other input type.
    """
    if isinstance(dt_or_str, datetime.datetime):
        as_utc = _ensure_utc(dt_or_str)
    elif isinstance(dt_or_str, str):
        as_utc = _parse_iso_like_to_dt(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    return int(as_utc.timestamp() * 1000)
119
+
120
+
121
def parse_any_ts_ms(value: Optional[Union[str, int, float]]) -> Optional[int]:
    """
    Normalize a timestamp into milliseconds since the Unix epoch.

    Accepts integral epoch values given as str/int/float (seconds-scale
    values are auto-multiplied by 1000) or an ISO-8601 string. Returns
    None for None input or anything unparseable (a warning is logged).
    """
    if value is None:
        return None

    # Integral epoch path first; non-integral strings fall through to ISO.
    try:
        epoch = int(str(value))
    except ValueError:
        epoch = None
    if epoch is not None:
        # Anything below 10^10 is interpreted as seconds, not ms.
        return epoch * 1000 if epoch < 10_000_000_000 else epoch

    # ISO-8601 fallback.
    try:
        return to_epoch_ms(str(value))
    except Exception:
        logging.warning("Could not parse timestamp value=%r", value)
        return None
145
+
146
+ # -----------------------------------------------------------------------------
147
+ # Pipeline & stage mappings
148
+ # -----------------------------------------------------------------------------
149
+
150
+
151
def get_pipeline_and_stage_mappings() -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Fetch ticket pipelines from HubSpot and build id→label lookup tables.

    :return: (pipeline_id→label, stage_id→label); both empty on any API error.
    """
    try:
        response = hubspot_client.crm.pipelines.pipelines_api.get_all(
            object_type="tickets")
        pipelines: Dict[str, str] = {}
        stages: Dict[str, str] = {}
        for pipeline in response.results:
            pipelines[pipeline.id] = pipeline.label
            stages.update({stage.id: stage.label for stage in pipeline.stages})
        return pipelines, stages
    except Exception as e:
        logging.error("Failed to fetch pipeline/stage mappings: %s", e)
        return {}, {}
165
+
166
+ # -----------------------------------------------------------------------------
167
+ # Search IDs (ts > since_ms) with property fallback
168
+ # -----------------------------------------------------------------------------
169
+
170
+
171
def _search_ticket_ids_from(since_ms: int, prop: str) -> List[str]:
    """
    Collect IDs of tickets whose ``prop`` value is strictly greater than
    *since_ms*, paging through the CRM v3 search endpoint.

    Results are requested in ascending ``prop`` order so the caller can
    advance its cursor monotonically.

    :raises httpx.HTTPStatusError: on any 4xx/5xx response (body is logged first).
    """
    endpoint = "https://api.hubapi.com/crm/v3/objects/tickets/search"
    request_headers = {
        "Authorization": f"Bearer {HUBSPOT_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    base_payload = {
        "filterGroups": [{
            "filters": [
                {"propertyName": prop, "operator": "GT",
                 "value": str(since_ms)},
            ]
        }],
        "limit": 100,
        "sorts": [{"propertyName": prop, "direction": "ASCENDING"}],
    }

    collected: List[str] = []
    page_cursor: Optional[str] = None
    with httpx.Client(timeout=30.0) as http:
        while True:
            request_body = dict(base_payload)
            if page_cursor:
                request_body["after"] = page_cursor

            response = http.post(
                endpoint, headers=request_headers, json=request_body)
            if response.status_code >= 400:
                # Log the response body (as JSON when possible) before raising.
                try:
                    logging.error(
                        "Ticket search error for prop '%s': %s", prop, response.json())
                except Exception:
                    logging.error(
                        "Ticket search error for prop '%s': %s", prop, response.text)
                response.raise_for_status()

            page = response.json()
            collected.extend(obj["id"]
                             for obj in page.get("results", []) or [])

            page_cursor = (page.get("paging") or {}).get(
                "next", {}).get("after")
            if not page_cursor:
                break
            # Light throttle between pages.
            time.sleep(0.1)

    return collected
220
+
221
+
222
def search_ticket_ids_after_ms(since_ms: int) -> Tuple[List[str], str]:
    """
    Find ticket IDs whose cursor property is strictly greater than *since_ms*.

    Tries each candidate property in turn (currently only "createdate"),
    returning (ids, property_used) for the first one that succeeds. If every
    candidate fails with an HTTP error, the last error is re-raised.
    """
    candidate_props = ["createdate"]
    pending_error = None

    for candidate in candidate_props:
        try:
            found = _search_ticket_ids_from(since_ms, candidate)
        except httpx.HTTPStatusError as exc:
            pending_error = exc
            continue
        logging.info(
            "Ticket search with '%s' returned %d IDs.", candidate, len(found))
        return found, candidate

    if pending_error:
        raise pending_error
    return [], "createdate"
245
+
246
+ # -----------------------------------------------------------------------------
247
+ # Read-by-ID (with associations)
248
+ # -----------------------------------------------------------------------------
249
+
250
+
251
def read_tickets_by_ids(
    ticket_ids: List[str],
    cursor_prop: str,
) -> Tuple[List[Dict], List[Dict], List[Dict], List[Dict], Optional[int]]:
    """
    Read tickets one by one with properties and associations (CLI/deals/contacts).

    :param ticket_ids: HubSpot ticket IDs to fetch.
    :param cursor_prop: property whose maximum observed value is tracked so
                        the caller can advance its incremental-sync cursor.
    :return: (tickets, ticket_cli_links, ticket_deal_links,
              ticket_contact_links, max_ts_ms) where max_ts_ms is the largest
              cursor_prop value seen (ms since epoch), or None.
    """
    if not ticket_ids:
        return [], [], [], [], None

    tickets: List[Dict] = []
    ticket_cli_links: List[Dict] = []
    ticket_deal_links: List[Dict] = []
    ticket_contact_links: List[Dict] = []

    # Associations requested per ticket: CLI custom object plus deals/contacts.
    assoc_types = [CLI_ASSOC_TYPE_ID, "deals", "contacts"]
    pipeline_map, stage_map = get_pipeline_and_stage_mappings()

    max_ts_ms: Optional[int] = None

    for i, tid in enumerate(ticket_ids, start=1):
        try:
            record = hubspot_client.crm.tickets.basic_api.get_by_id(
                tid, properties=TICKET_PROPERTIES, associations=assoc_types, archived=False
            )
            p = record.properties or {}

            # Track max timestamp based on cursor_prop we searched on.
            cursor_val = p.get(cursor_prop)
            ts_ms = parse_any_ts_ms(cursor_val)
            if ts_ms is not None and (max_ts_ms is None or ts_ms > max_ts_ms):
                max_ts_ms = ts_ms

            # Flatten the HubSpot property bag into the Supabase row shape;
            # pipeline/stage internal IDs are also resolved to labels.
            tickets.append({
                "ticket_id": try_parse_int(record.id),
                "subject": p.get("subject"),
                "content": p.get("content"),
                "ticket_type": p.get("ticket_type"),
                "closed_date": parse_ts(p.get("closed_date")),
                "hubspot_modified_date": parse_ts(p.get("hs_lastmodifieddate")),
                "pipeline_id": try_parse_int(p.get("hs_pipeline")),
                "pipeline_label": pipeline_map.get(p.get("hs_pipeline"), ""),
                "ticket_status_id": try_parse_int(p.get("hs_pipeline_stage")),
                "ticket_status_label": stage_map.get(p.get("hs_pipeline_stage"), ""),
                "ticket_priority": p.get("hs_ticket_priority"),
                "source_type": p.get("source_type"),
                "hubspot_owner_id": try_parse_int(p.get("hubspot_owner_id")),
                "hubspot_created_at": parse_ts(p.get("createdate")),
                "hubspot_created_by": try_parse_int(p.get("hs_created_by_user_id")),
            })

            # Associations
            assoc = record.associations or {}

            # CLI (custom object) key may be the numeric type ID or the name key.
            cli_bucket = None
            if assoc.get(CLI_ASSOC_TYPE_ID):
                cli_bucket = assoc[CLI_ASSOC_TYPE_ID]
            elif assoc.get(CLI_ASSOC_FALLBACK_KEY):
                cli_bucket = assoc[CLI_ASSOC_FALLBACK_KEY]

            if cli_bucket and getattr(cli_bucket, "results", None):
                for a in cli_bucket.results:
                    # Non-numeric association IDs are skipped.
                    if a.id and a.id.isdigit():
                        ticket_cli_links.append({
                            "ticket_id": try_parse_int(record.id),
                            "cli_id": try_parse_int(a.id),
                        })

            if assoc.get("deals") and getattr(assoc["deals"], "results", None):
                for a in assoc["deals"].results:
                    if a.id and a.id.isdigit():
                        ticket_deal_links.append({
                            "ticket_id": try_parse_int(record.id),
                            "deal_id": try_parse_int(a.id),
                        })

            if assoc.get("contacts") and getattr(assoc["contacts"], "results", None):
                for a in assoc["contacts"].results:
                    if a.id and a.id.isdigit():
                        ticket_contact_links.append({
                            "ticket_id": try_parse_int(record.id),
                            "contact_id": try_parse_int(a.id),
                        })

            # Progress heartbeat for long runs.
            if i % 200 == 0:
                logging.info("Read %d tickets...", i)

            # Light throttle between per-ticket GETs.
            time.sleep(0.05)

        # Failed tickets are logged and skipped; the batch keeps going.
        except httpx.HTTPStatusError as e:
            logging.error("HTTP error reading ticket %s: %s", tid, e)
        except (TicketsApiException, httpx.HTTPError) as e:
            logging.error("Error reading ticket %s: %s", tid, e)

    return tickets, ticket_cli_links, ticket_deal_links, ticket_contact_links, max_ts_ms
348
+
349
+ # -----------------------------------------------------------------------------
350
+ # Upsert
351
+ # -----------------------------------------------------------------------------
352
+
353
+
354
def upsert_tickets(
    tickets: List[Dict],
    ticket_cli_links: List[Dict],
    ticket_deal_links: List[Dict],
    ticket_contact_links: List[Dict],
) -> None:
    """
    Deduplicate and upsert tickets plus their association link rows into
    Supabase. Empty input lists are skipped entirely.
    """
    # (rows, table, dedupe key, conflict columns, printed label) — the dedupe
    # key is a plain string for single-column keys, a tuple for composites.
    upsert_plan = [
        (tickets, "hubspot_tickets",
         "ticket_id", ["ticket_id"], "tickets"),
        (ticket_cli_links, "hubspot_ticket_clis",
         ("ticket_id", "cli_id"), ["ticket_id", "cli_id"],
         "ticket-cli associations"),
        (ticket_deal_links, "hubspot_ticket_deals",
         ("ticket_id", "deal_id"), ["ticket_id", "deal_id"],
         "ticket-deal associations"),
        (ticket_contact_links, "hubspot_ticket_contacts",
         ("ticket_id", "contact_id"), ["ticket_id", "contact_id"],
         "ticket-contact associations"),
    ]

    for rows, table, dedupe_key, conflict_cols, label in upsert_plan:
        if not rows:
            continue
        unique_rows = deduplicate_by_key(rows, key=dedupe_key)
        batched_insert(
            supabase_client, table, unique_rows,
            batch_size=1000, on_conflict=conflict_cols
        )
        print(f"Upserted {len(unique_rows)} {label}.")
395
+
396
+ # -----------------------------------------------------------------------------
397
+ # Main (timestamp cursor)
398
+ # -----------------------------------------------------------------------------
399
+
400
+
401
def _resolve_since_ms(since_ms: Optional[int]) -> int:
    """Resolve the starting cursor: explicit arg > env bootstrap > today@00:00Z."""
    if since_ms is not None:
        return since_ms
    if BOOTSTRAP_SINCE_MS_ENV:
        try:
            return int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            raise RuntimeError(
                "HUBSPOT_TICKETS_SINCE_MS must be an integer (ms) if set.")
    midnight = floor_to_utc_midnight(
        datetime.datetime.now(datetime.timezone.utc))
    return to_epoch_ms(midnight)


def main(since_ms: Optional[int] = None):
    """
    Run one incremental tickets sync:
      1) search ticket IDs with cursor_prop > since_ms,
      2) read full tickets with associations (tracking the max timestamp),
      3) upsert rows and association links into Supabase,
      4) record the sync time in the sync-metadata table.
    """
    cursor_ms = _resolve_since_ms(since_ms)

    print(f"Searching tickets with timestamp > {cursor_ms} ...")
    ticket_ids, cursor_prop = search_ticket_ids_after_ms(cursor_ms)
    print(f"Search property: {cursor_prop}. Found {len(ticket_ids)} ticket IDs.")

    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()

    if not ticket_ids:
        print("No tickets beyond the cursor. Updating sync metadata and exiting.")
        update_sync_metadata(supabase_client, "hubspot_tickets", now_iso)
        return

    print("Reading tickets (with associations)...")
    (rows, cli_links, deal_links,
     contact_links, max_ts_ms) = read_tickets_by_ids(ticket_ids, cursor_prop)

    print("Upserting into Supabase...")
    upsert_tickets(rows, cli_links, deal_links, contact_links)

    # NOTE(review): new_cursor_ms is computed and reported below, but only
    # now_iso is persisted via update_sync_metadata — confirm the millisecond
    # cursor itself is stored somewhere the next run can read.
    new_cursor_ms = max_ts_ms if max_ts_ms is not None else cursor_ms
    update_sync_metadata(supabase_client, "hubspot_tickets", now_iso)

    print(
        f"Tickets sync complete. Advanced cursor to {new_cursor_ms} using prop '{cursor_prop}'.")
451
+
452
+ # -----------------------------------------------------------------------------
453
+ # CLI
454
+ # -----------------------------------------------------------------------------
455
+
456
+
457
+ def _parse_cli_arg_to_ms(arg: str) -> int:
458
+ """
459
+ Accept:
460
+ - integer epoch ms
461
+ - ISO-8601 (Z or offset)
462
+ - YYYY-MM-DD (floors to 00:00Z)
463
+ """
464
+ # epoch ms or seconds
465
+ if re.fullmatch(r"\d{10,13}", arg):
466
+ v = int(arg)
467
+ if v < 10_000_000_000_000: # seconds -> ms
468
+ v *= 1000
469
+ return v
470
+
471
+ # YYYY-MM-DD
472
+ if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
473
+ d = datetime.datetime.strptime(
474
+ arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
475
+ return to_epoch_ms(floor_to_utc_midnight(d))
476
+
477
+ # ISO-8601
478
+ return to_epoch_ms(arg)
479
+
480
+
481
# CLI entry point: an optional first argument supplies the sync cursor.
if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1:
        try:
            # NOTE(review): this try also wraps main(), so any runtime
            # failure during the sync is reported as "Invalid timestamp".
            main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
        except Exception as e:
            print(
                f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
            sys.exit(1)
    else:
        # No argument: main() resolves its own default cursor.
        main()
python/hubspot_utils.py ADDED
@@ -0,0 +1,946 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This script contains utility functions for working with HubSpot data.
3
+ """
4
+ import time
5
+ import datetime
6
+ import re
7
+ import httpx
8
+ import html
9
+ import requests
10
+ import logging
11
+ from typing import Dict, List, Optional, Tuple
12
+ from collections import defaultdict
13
+ import pandas as pd
14
+ from hubspot.crm.objects import ApiException as ObjectApiException
15
+ from hubspot.crm.contacts.exceptions import ApiException as ContactsApiException
16
+ from hubspot.crm.companies.exceptions import ApiException as CompaniesApiException
17
+ from hubspot.crm.contacts import (
18
+ PublicObjectSearchRequest, Filter, FilterGroup, BatchInputSimplePublicObjectId
19
+ )
20
+ from hubspot.crm.companies import (
21
+ PublicObjectSearchRequest as CompanySearchRequest,
22
+ Filter as CompanyFilter,
23
+ FilterGroup as CompanyFilterGroup
24
+ )
25
+
26
+ _MISSING = {None, ""}
27
+
28
+
29
def serialize(obj):
    """
    Recursively convert *obj* into plain JSON-friendly Python values.

    Handling order: objects exposing ``to_dict`` → lists → dicts →
    datetimes (ISO-8601 strings) → objects with ``__dict__`` (underscore
    attributes are dropped) → everything else via ``str``.

    :param obj: the object to serialize (e.g. a HubSpot API response).
    :return: nested dicts/lists/strings safe to store in a DB or file.
    """
    if hasattr(obj, "to_dict"):
        return serialize(obj.to_dict())
    if isinstance(obj, list):
        return [serialize(element) for element in obj]
    if isinstance(obj, dict):
        return {name: serialize(inner) for name, inner in obj.items()}
    if isinstance(obj, datetime.datetime):
        return obj.isoformat()
    if hasattr(obj, "__dict__"):
        return {
            name: serialize(inner)
            for name, inner in obj.__dict__.items()
            if not name.startswith("_")
        }
    return str(obj)
63
+
64
+
65
def try_parse_int(value):
    """
    Convert *value* to ``int``, returning ``None`` on any failure.

    ``None`` input, non-numeric strings, and incompatible types all yield
    ``None`` instead of raising.
    """
    try:
        return int(value) if value is not None else None
    except (ValueError, TypeError):
        return None
81
+
82
+
83
def try_parse_float(value):
    """
    Convert *value* to ``float``, returning ``None`` on any failure.

    ``None`` input, non-numeric strings, and incompatible types all yield
    ``None`` instead of raising.
    """
    try:
        return float(value) if value is not None else None
    except (ValueError, TypeError):
        return None
98
+
99
+
100
def parse_ts(ts):
    """
    Normalize a HubSpot timestamp into an ISO-8601 string.

    Accepts an epoch value in milliseconds (int/float/digit string) or an
    ISO-8601 string (a trailing "Z" is treated as UTC).

    :param ts: the timestamp to normalize.
    :return: ISO-8601 string, or None for falsy input or unparseable values
             (parse failures are logged).
    """
    if not ts:
        return None
    try:
        is_epoch = isinstance(ts, (int, float)) or str(ts).isdigit()
        if is_epoch:
            # Epoch milliseconds → aware UTC datetime.
            parsed = datetime.datetime.fromtimestamp(
                int(ts) / 1000.0, tz=datetime.timezone.utc)
        else:
            # fromisoformat cannot digest the "Z" suffix directly.
            parsed = datetime.datetime.fromisoformat(
                str(ts).replace("Z", "+00:00"))
    except (ValueError, TypeError) as e:
        logging.error(
            "Failed to parse timestamp '%s': %s", ts, str(e)
        )
        return None
    return parsed.isoformat()
130
+
131
+
132
def deduplicate_by_key(rows, key):
    """
    Keep the first occurrence of each key value and drop the rest.

    :param rows: list of dicts to filter.
    :param key: a single key name (str) or a tuple of key names. For a tuple,
                the combined value-tuple is the dedup key and a missing key
                raises ``KeyError``; for a string, missing keys resolve to None.
    :return: rows with duplicates removed. NOTE: rows whose single-key value
             is falsy (None, "", 0) are dropped entirely, not kept.
    """
    composite = isinstance(key, tuple)
    seen_values = set()
    kept = []
    for row in rows:
        if composite:
            dedup_val = tuple(row[part] for part in key)
        else:
            dedup_val = row.get(key)
        if not dedup_val or dedup_val in seen_values:
            continue
        seen_values.add(dedup_val)
        kept.append(row)
    return kept
153
+
154
+
155
def fix_surrogates(text: str) -> str:
    """
    Repair surrogate code points left in *text*.

    Email bodies occasionally carry surrogates that break later UTF-8
    encoding; round-tripping through UTF-16 with ``surrogatepass`` re-pairs
    them into valid characters.

    :param text: possibly-broken string.
    :return: a string safe to encode as UTF-8.
    """
    as_utf16 = text.encode("utf-16", "surrogatepass")
    return as_utf16.decode("utf-16")
164
+
165
+
166
def strip_footer(text: str) -> str:
    """
    Strip common email footers (confidentiality notices, legal boilerplate)
    from *text*.

    The first match of any known footer pattern truncates the text at that
    point; if nothing matches, the text is returned unchanged.

    :param text: email body text.
    :return: text with the first recognized footer (and everything after it)
             removed, stripped of surrounding whitespace.
    """
    footer_cutoff_patterns = [
        # BUG FIX: was r"...\\d+..." — inside a raw string that matches a
        # literal backslash followed by "d", so the pattern could never
        # match a real registration number.
        r"(?i)Registration Number \d+[A-Z]?",
        r"(?i)this e[- ]?mail message contains confidential information",
        r"(?i)this email and any attachments.*?confidential",
        r"(?i)if you are not the intended recipient",
        r"(?i)please notify us immediately by return e[- ]?mail",
        r"(?i)no liability is accepted for any damage",
        r"(?i)this message is intended only for the use of the individual",
        r"(?i)confidentiality notice",
        r"(?i)please consider the environment before printing",
        r"(?i)registered in England and Wales",
        r"(?i)the views expressed in this email are those of the sender",
        r"(?i)accepts no liability",
        r"(?i)has taken steps to ensure",
    ]

    for pattern in footer_cutoff_patterns:
        match = re.search(pattern, text)
        if match:
            # Cut at the start of the first footer marker found.
            return text[:match.start()].strip()
    return text
191
+
192
+
193
def clean_text(raw_text: str) -> str:
    """
    Clean an email body: repair surrogates, strip HTML, unescape entities,
    and trim quoted/forwarded sections, footers, and signatures.

    :param raw_text: raw (possibly HTML) email body; None is treated as "".
    :return: plain-text body with boilerplate removed and blank runs collapsed.
    """
    # Defensive defaults
    if raw_text is None:
        raw_text = ""

    # Optional helper fallbacks — keeps this function usable even when the
    # sibling helpers are not importable.
    try:
        fixed = fix_surrogates(raw_text)
    except NameError:
        # If fix_surrogates isn't available, just pass through
        fixed = raw_text

    # Start from the corrected source
    text = fixed

    # Normalize Windows/Mac line endings early (before HTML handling)
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Normalize common HTML line breaks to real newlines before removing tags
    text = re.sub(r"(?i)<br\s*/?>", "\n", text)

    # Strip HTML tags
    text = re.sub(r"<[^>]+>", "", text)

    # Unescape HTML entities
    text = html.unescape(text)

    # --- Trim quoted / forwarded blocks heuristics ---
    # Three reply/forward markers: "On ... wrote:", "From:/Sent:/..."
    # header lines, and explicit forwarded-message separators.
    on_wrote_pat = re.compile(r"(?im)^[>\s]*on\s+.+?wrote:")
    fwd_hdr_pat = re.compile(
        r"(?im)^[>\s]*(?:from|sent|date|subject|to)\s*:\s.+$")
    sep_pat = re.compile(
        r"(?im)^(?:[>\s]*-----\s*original message\s*-----|[>\s]*begin forwarded message)")

    # Cut at the earliest marker found (if any).
    cut_idx = None
    for m in (on_wrote_pat.search(text), fwd_hdr_pat.search(text), sep_pat.search(text)):
        if m:
            idx = m.start()
            cut_idx = idx if cut_idx is None or idx < cut_idx else cut_idx

    if cut_idx is not None:
        text = text[:cut_idx].rstrip("\n")

    # --- Line-by-line cleanup, footer & signature handling ---
    lines = text.split("\n")
    cleaned = []

    footer_regex = re.compile(
        r"(?i)\b(confidential|privacy policy|unsubscribe|follow us|visit our website|"
        r"please consider the environment|registered office|copyright|"
        r"this e-?mail and any attachments|do not print this email)\b"
    )
    sig_sep = re.compile(r"^--\s?$")  # standard sig delimiter

    for line in lines:
        stripped = line.strip()

        # Skip pure separators
        if re.match(r"^[-=_*]{3,}\s*$", stripped):
            continue

        # Skip common footer lines
        if footer_regex.search(stripped):
            continue

        # Stop including anything after a signature separator
        if sig_sep.match(stripped):
            break

        # Keep empty lines (collapse later)
        if stripped == "":
            cleaned.append("")
            continue

        cleaned.append(stripped)

    out = "\n".join(cleaned)
    # Collapse 3+ consecutive blank lines down to a single blank line.
    out = re.sub(r"\n{3,}", "\n\n", out).strip()

    # Optional final footer stripping
    try:
        out = strip_footer(out)
    except NameError:
        pass

    return out
283
+
284
+
285
def get_search_config(hubspot_client, object_type):
    """
    Return the filter/search classes and search API for a HubSpot object type.

    :param hubspot_client: configured HubSpot client.
    :param object_type: "contacts" or "companies".
    :return: dict with keys FilterCls, FilterGroupCls, SearchRequestCls,
             search_api, modified_prop.
    :raises ValueError: for any other object_type.
    """
    # Builders are lazy so attribute access on the client only happens for
    # the requested object type.
    builders = {
        "contacts": lambda: {
            "FilterCls": Filter,
            "FilterGroupCls": FilterGroup,
            "SearchRequestCls": PublicObjectSearchRequest,
            "search_api": hubspot_client.crm.contacts.search_api,
            "modified_prop": "createdate",
        },
        "companies": lambda: {
            "FilterCls": CompanyFilter,
            "FilterGroupCls": CompanyFilterGroup,
            "SearchRequestCls": CompanySearchRequest,
            "search_api": hubspot_client.crm.companies.search_api,
            "modified_prop": "createdate",
        },
    }

    if object_type not in builders:
        raise ValueError(f"Unsupported object_type '{object_type}'")
    return builders[object_type]()
313
+
314
+
315
def get_property_label_mapping(hubspot_client, object_type: str, property_name: str) -> dict:
    """
    Map a HubSpot enumeration property's internal values to display labels.

    :param hubspot_client: configured HubSpot client.
    :param object_type: "contacts" or "companies".
    :param property_name: internal property name (e.g. "industry").
    :return: {internal_value: label}; empty dict if the lookup fails
             (the failure is logged as a warning).
    """
    try:
        definition = hubspot_client.crm.properties.core_api.get_by_name(
            object_type, property_name)
        return {option.value: option.label for option in definition.options}
    except Exception as e:
        logging.warning("Failed to fetch mapping for %s: %s",
                        property_name, str(e))
        return {}
332
+
333
+
334
def build_filter_request(since, after):
    """
    Build a contact search request for records created on/after *since*.

    :param since: aware datetime lower bound applied to ``createdate`` (GTE).
    :param after: pagination offset token, or None for the first page.
    :return: PublicObjectSearchRequest with the standard contact properties
             and a page size of 100.
    """
    since_ms = int(since.timestamp() * 1000)
    created_filter = Filter(property_name="createdate",
                            operator="GTE", value=since_ms)
    contact_properties = [
        "full_name",
        "firstname",
        "lastname",
        "email",
        "phone",
        "createdate",
        "lastmodifieddate",
        "lastactivitydate",
        "associatedcompanyid",
    ]
    return PublicObjectSearchRequest(
        filter_groups=[FilterGroup(filters=[created_filter])],
        properties=contact_properties,
        limit=100,
        after=after
    )
363
+
364
+
365
def fetch_custom_object(hubspot_client, object_name="tickets", properties=None,
                        associations=None):
    """
    Fetches all records of a custom HubSpot object and returns them as a list of dictionaries.

    Pages through the CRM objects API 100 records at a time, normalizing each
    record into a flat dict. The record id is stored under the key
    "billing_id" (presumably because this helper serves the billing pipeline
    — confirm before reusing for other objects).

    :param hubspot_client: The HubSpot client object.
    :param object_name: The name of the custom object to fetch. Defaults to "tickets".
    :param properties: A list of properties to include in the response. If None, all properties are included.
    :param associations: A list of associations to include in the response. If None, no associations are included.
    :return: A list of dictionaries, where each dictionary represents a record of the custom object.
             On an ObjectApiException the error is logged and the records
             collected so far are returned (partial results, no raise).
    """
    all_objects = []
    after = None  # pagination cursor; None fetches the first page

    while True:
        try:
            response = hubspot_client.crm.objects.basic_api.get_page(
                object_type=object_name,
                properties=properties,
                limit=100,
                after=after,
                archived=False,
                associations=associations
            )

            if not response.results:
                break

            for record in response.results:
                props = record.properties
                object_id = record.id

                # exclude hs_object_id and optionally filter properties
                if properties:
                    filtered_props = {k: props.get(
                        k) for k in properties if k != "hs_object_id"}
                else:
                    filtered_props = {
                        k: v for k, v in props.items() if k != "hs_object_id"}

                object_dict = {"billing_id": object_id, **filtered_props}

                # cast ints if present
                for key in ["hs_created_by_user_id"]:
                    if key in object_dict:
                        object_dict[key] = try_parse_int(object_dict[key])

                # parse timestamps if present
                for key in ["hs_createdate", "hs_lastmodifieddate"]:
                    if key in object_dict:
                        object_dict[key] = parse_ts(object_dict[key])

                # include associations if requested
                if associations and record.associations:
                    assoc_data = {}
                    for assoc_type, assoc_records in record.associations.items():
                        assoc_data[assoc_type] = [
                            ar.id for ar in assoc_records.results]
                    object_dict["associations"] = assoc_data

                all_objects.append(object_dict)

            if response.paging and response.paging.next:
                after = response.paging.next.after
            else:
                break

            # Light throttle between pages to stay under HubSpot rate limits.
            time.sleep(0.1)

        except ObjectApiException as e:
            # Log and stop paging; partial results are still returned.
            logging.error("Exception when fetching %s: %s", object_name, e)
            break

    return all_objects
440
+
441
+
442
def fetch_total_objects(
    hubspot_client,
    object_type: str,
    modified_after: Optional[datetime.datetime] = None,
    archived: bool = False
) -> int:
    """
    Fetches the total number of HubSpot objects of the given type.

    Two strategies are used:
      * archived=True  -> the search API does not cover archived records,
        so the count is accumulated by paging the basic API 100 at a time.
      * archived=False -> a single search request with limit=1 is issued
        and the API-provided ``response.total`` is read.

    :param hubspot_client: HubSpot client
    :param object_type: "contacts" or "companies"
    :param modified_after: Only used for non-archived search_api queries;
                           filters on the config's "modified_prop"
                           (createdate) at epoch-ms granularity.
    :param archived: Whether to count archived records
    :return: Total number of matching objects, or 0 when a handled error
             (API/HTTP/ValueError) occurs.
    """
    try:
        if archived:
            total = 0
            after = None  # basic-API pagination cursor
            while True:
                if object_type == "contacts":
                    response = hubspot_client.crm.contacts.basic_api.get_page(
                        limit=100, archived=True, after=after
                    )
                elif object_type == "companies":
                    response = hubspot_client.crm.companies.basic_api.get_page(
                        limit=100, archived=True, after=after
                    )
                else:
                    raise ValueError(
                        f"Unsupported object_type '{object_type}' for archived=True")

                total += len(response.results)

                if response.paging and response.paging.next:
                    after = response.paging.next.after
                else:
                    break
                # Throttle between pages to respect rate limits.
                time.sleep(0.1)

            logging.info("Total %s (archived): %d", object_type, total)
            print(f"Total {object_type} (archived): {total}")
            return total

        # Non-archived path via search API
        config = get_search_config(hubspot_client, object_type)
        filters = []

        if modified_after:
            # Search filters take epoch milliseconds.
            value = int(modified_after.timestamp() * 1000)
            filters.append(config["FilterCls"](
                property_name=config["modified_prop"], operator="GTE", value=value
            ))

        # limit=1: we only need the "total" field, not the records.
        request_body = config["SearchRequestCls"](
            filter_groups=[config["FilterGroupCls"](
                filters=filters)] if filters else None,
            properties=["id"],
            limit=1
        )
        response = config["search_api"].do_search(request_body)
        total = response.total

        logging.info("Total %s in HubSpot: %d", object_type, total)
        print(f"Total {object_type} in HubSpot: {total}")
        return total

    except (ContactsApiException, CompaniesApiException) as api_err:
        logging.error("API error occurred while fetching %s: %s",
                      object_type, str(api_err))
    except httpx.HTTPError as http_err:
        logging.error("HTTP error occurred while fetching %s: %s",
                      object_type, str(http_err))
    except ValueError as val_err:
        logging.error("ValueError while fetching %s: %s",
                      object_type, str(val_err))
    except BaseException as critical_err:
        # Broad on purpose: log at critical level, then re-raise so
        # truly unexpected failures (incl. KeyboardInterrupt) propagate.
        logging.critical("Unexpected error fetching %s: %s",
                         object_type, str(critical_err))
        raise

    # Reached only after a handled (logged, non-critical) error above.
    return 0
525
+
526
+
527
def _coalesce(*vals):
    """
    Return the first value that is not considered "missing".

    Missing-ness is defined by membership in the module-level ``_MISSING``
    collection (None / empty string).

    :param vals: Candidate values, checked left to right.
    :return: The first non-missing candidate, or None when all are missing.
    """
    return next((candidate for candidate in vals if candidate not in _MISSING), None)
540
+
541
+
542
def parse_ts_dt(ts: Optional[str]) -> Optional[datetime.datetime]:
    """
    Parse an ISO 8601 timestamp string into a UTC-aware datetime.

    A trailing 'Z' suffix is accepted as an alias for '+00:00'.

    :param ts: The timestamp string to parse (None/empty allowed).
    :return: Timezone-aware datetime in UTC, or None when the input is
             missing or cannot be parsed.
    """
    if not ts:
        return None
    normalized = str(ts).replace("Z", "+00:00")
    try:
        parsed = datetime.datetime.fromisoformat(normalized)
        return parsed.astimezone(datetime.timezone.utc)
    except Exception:
        # Unparseable input is treated as "no timestamp".
        return None
555
+
556
+
557
def to_epoch_ms_from_utc_iso(date_str: str) -> int:
    """
    Convert a date string to milliseconds since the Unix epoch (UTC).

    Accepts either 'YYYY-MM-DD' (interpreted as midnight UTC) or an
    ISO 8601 timestamp (a trailing 'Z' is accepted). ISO timestamps
    without an explicit UTC offset are assumed to be UTC, so the result
    no longer depends on the host machine's local timezone.

    :param date_str: The date string to convert.
    :return: The number of milliseconds since the Unix epoch.
    :raises ValueError: If ``date_str`` is empty or unparseable.
    """
    if not date_str:
        raise ValueError("date_str is required")
    if len(date_str) == 10 and date_str[4] == "-" and date_str[7] == "-":
        # YYYY-MM-DD -> midnight UTC
        dt = datetime.datetime.strptime(
            date_str, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
    else:
        dt = datetime.datetime.fromisoformat(date_str.replace("Z", "+00:00"))
        if dt.tzinfo is None:
            # fix: a naive timestamp was previously converted through the
            # machine's local timezone by .timestamp(); treat it as UTC.
            dt = dt.replace(tzinfo=datetime.timezone.utc)
    return int(dt.timestamp() * 1000)
573
+
574
+
575
def request_with_retry(method: str,
                       url: str,
                       headers: Dict[str, str],
                       params: Dict[str, str],
                       initial_backoff: float = 1.0,
                       max_backoff: float = 16.0,
                       retries: int = 8) -> Dict:
    """
    Sends a request to the given URL with the given method, headers, and parameters, and
    retries the request up to the given number of times if it fails with a retryable
    HubSpot error (429, 500, 502, 503, 504) or a network error (connection/timeout).
    Backoff doubles after each retryable failure, capped at ``max_backoff``.

    Any other HTTP error raises via ``resp.raise_for_status()``; that exception
    (and a JSON-decode failure on a 2xx response) falls through to the generic
    handler below, which logs once more and re-raises.

    :param method: The HTTP method to use (e.g. GET, POST).
    :param url: The URL to send the request to.
    :param headers: A dictionary of HTTP headers to include in the request.
    :param params: A dictionary of URL parameters to include in the request.
    :param initial_backoff: The initial backoff time in seconds.
    :param max_backoff: The maximum backoff time in seconds.
    :param retries: The number of times to retry the request before giving up.
    :return: The JSON response from the request, decoded as a dictionary.
    :raises RuntimeError: If all retry attempts fail.
    """
    backoff = initial_backoff
    for attempt in range(1, retries + 1):
        try:
            resp = requests.request(
                method, url, headers=headers, params=params, timeout=60)
            if resp.status_code < 400:
                try:
                    return resp.json()
                except Exception as e:
                    # A 2xx body that is not JSON is unrecoverable; the
                    # re-raise lands in the generic handler below.
                    logging.error(
                        "Failed to decode JSON response from %s: %s", url, e)
                    raise

            # HubSpot rate limits or transient server error
            if resp.status_code in (429, 500, 502, 503, 504):
                logging.warning(
                    "HubSpot API %s (%d). Retrying in %.1fs (attempt %d/%d)...",
                    url, resp.status_code, backoff, attempt, retries
                )
                time.sleep(backoff)
                backoff = min(max_backoff, backoff * 2)
                continue

            # Permanent error
            logging.error("HubSpot API error %s: %s",
                          resp.status_code, resp.text)
            resp.raise_for_status()

        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout) as e:
            # Transient network failure: same exponential backoff schedule.
            logging.warning(
                "Network error on attempt %d/%d (%s). Retrying in %.1fs...",
                attempt, retries, str(e), backoff
            )
            time.sleep(backoff)
            backoff = min(max_backoff, backoff * 2)
            continue

        except Exception as e:
            # Permanent failures (HTTPError from raise_for_status, JSON
            # decode errors, anything unexpected) stop the retry loop.
            logging.error("Unexpected error during HubSpot request: %s", e)
            raise

    # If we exhausted retries
    raise RuntimeError(f"Failed after {retries} attempts calling {url}")
643
+
644
+
645
def page_account_activity(base_url: str,
                          token: str,
                          occurred_after_ms: int,
                          occurred_before_ms: int,
                          limit: int = 200,
                          max_pages: int = 1000) -> List[Dict]:
    """
    Fetches all account activity events from the given base URL that occurred
    within the given time window (milliseconds since epoch).

    The time-window parameter names differ per endpoint: audit-log URLs use
    occurredAfter/occurredBefore, security URLs use fromTimestamp/toTimestamp.
    Pagination stops on: empty page, no next cursor, a known terminal cursor
    value, a repeated cursor, or the max_pages cap.

    :param base_url: The base URL to fetch events from.
    :param token: The HubSpot API token to use for authentication.
    :param occurred_after_ms: The timestamp in milliseconds to fetch events after.
    :param occurred_before_ms: Optional upper bound in milliseconds; omitted
        from the request when None.
    :param limit: The maximum number of events to fetch per page (default=200).
    :param max_pages: The maximum number of pages to fetch before stopping pagination
        (default=1000).
    :return: A list of all fetched events.
    """
    headers = {"Authorization": f"Bearer {token}",
               "Accept": "application/json"}

    # The two activity endpoints use different query-parameter names.
    is_security = "activity/security" in base_url
    is_audit = "activity/audit-logs" in base_url

    params: Dict[str, str] = {"limit": str(limit)}
    if is_audit:
        params["occurredAfter"] = str(occurred_after_ms)
        if occurred_before_ms is not None:
            params["occurredBefore"] = str(occurred_before_ms)
    elif is_security:
        params["fromTimestamp"] = str(occurred_after_ms)
        if occurred_before_ms is not None:
            params["toTimestamp"] = str(occurred_before_ms)

    all_items: List[Dict] = []
    after: Optional[str] = None
    prev_after: Optional[str] = None
    pages = 0

    # seen on /security when exhausted
    TERMINAL_AFTER_VALUES = {"MC0w", "0-0", ""}

    while True:
        if after:
            params["after"] = after
        else:
            params.pop("after", None)

        data = request_with_retry("GET", base_url, headers, params)
        # Audit logs use "results"; some endpoints return "events" instead.
        results = data.get("results") or data.get("events") or []
        if not isinstance(results, list):
            logging.warning("Unexpected results shape at %s: %s",
                            base_url, type(results))
            results = []

        all_items.extend(results)

        paging = data.get("paging") or {}
        next_obj = paging.get("next") or {}
        next_after = next_obj.get("after")

        pages += 1
        logging.info("Fetched page %d from %s: %d items (total=%d). after=%s",
                     pages, base_url, len(results), len(all_items), str(next_after))

        if len(results) == 0:
            logging.info(
                "No results returned; stopping pagination for %s.", base_url)
            break

        if not next_after:
            logging.info(
                "No next cursor; stopping pagination for %s.", base_url)
            break

        if next_after in TERMINAL_AFTER_VALUES:
            logging.info(
                "Terminal 'after' (%s); stopping pagination for %s.", next_after, base_url)
            break

        if prev_after is not None and next_after == prev_after:
            # Defensive: a cursor that never advances would loop forever.
            logging.info(
                "Repeated 'after' cursor (%s); stopping pagination for %s.", next_after, base_url)
            break

        if pages >= max_pages:
            logging.warning(
                "Reached max_pages=%d for %s; stopping pagination.", max_pages, base_url)
            break

        prev_after, after = after, next_after

        # Throttle between pages to respect rate limits.
        time.sleep(0.1)

    return all_items
740
+
741
+
742
def safe_get_actor(ev: Dict) -> Dict:
    """
    Extract actor identity fields from an audit event, tolerating a missing
    or malformed "actingUser" entry.

    Top-level "userId"/"userEmail" fields take precedence; the nested actor
    dict is only consulted as a fallback (including its legacy "email" key).

    :param ev: The audit event to extract the actor from.
    :return: A dictionary with "userId" and "userEmail" keys (None when unknown).
    """

    nested = ev.get("actingUser")
    if not isinstance(nested, dict):
        nested = {}
    user_id = ev.get("userId") or nested.get("userId")
    user_email = (ev.get("userEmail")
                  or nested.get("userEmail")
                  or nested.get("email"))
    return {"userId": user_id, "userEmail": user_email}
760
+
761
+
762
def build_login_index(login_events: List[Dict]) -> Dict[Tuple[Optional[str], Optional[str]], List[Dict]]:
    """
    Build an in-memory index of login events keyed by (userId, email).

    Events whose "loginAt" cannot be parsed are dropped. Each kept event is
    annotated in place with a "_ts" UTC datetime, and every bucket is sorted
    by that timestamp in ascending order.

    :param login_events: List of login events to index.
    :return: Mapping of (userId-as-str-or-None, email) to a time-sorted
             list of login events.
    """
    index = defaultdict(list)
    for event in login_events:
        when = parse_ts_dt(event.get("loginAt"))
        if when is None:
            continue
        event["_ts"] = when
        raw_uid = event.get("userId")
        bucket = (str(raw_uid) if raw_uid is not None else None,
                  event.get("email"))
        index[bucket].append(event)
    for bucket in index:
        index[bucket].sort(key=lambda ev: ev["_ts"])
    return index
782
+
783
+
784
def build_security_index(security_events: List[Dict]) -> Dict[Tuple[Optional[str], Optional[str]], List[Dict]]:
    """
    Builds an in-memory index of security events by (userId,email) and sorts by createdAt (UTC).

    Events without a parseable "createdAt" are dropped; kept events are
    annotated in place with a "_ts" UTC datetime used for sorting.

    :param security_events: List of security events to index.
    :return: A dictionary containing the indexed security events,
        where each key is a tuple of (userId,email) and the value is a sorted list of security events for that key.
    """
    idx = defaultdict(list)
    for e in security_events:
        actor = safe_get_actor(e)
        user_id = actor.get("userId")
        # NOTE(review): when no email is found, this falls back to the raw
        # "actingUser" value, which safe_get_actor treats as a dict — so the
        # key may contain a dict-ish value instead of an email string.
        # Confirm whether e.get("actingUser") was meant to be an email here.
        email = actor.get("userEmail") or e.get("actingUser")
        ts = parse_ts_dt(e.get("createdAt"))
        if ts is None:
            continue
        e["_ts"] = ts
        idx[(str(user_id) if user_id is not None else None, email)].append(e)
    for k in idx:
        idx[k].sort(key=lambda r: r["_ts"])
    return idx
805
+
806
+
807
def find_best_time_match(ts: Optional[datetime.datetime],
                         key: Tuple[Optional[str], Optional[str]],
                         index: Dict[Tuple[Optional[str], Optional[str]], List[Dict]],
                         window_seconds: int) -> Optional[Dict]:
    """
    Pick the event in ``index[key]`` whose "_ts" lies closest to ``ts``,
    provided it falls within ``window_seconds`` of it.

    :param ts: Reference timestamp (None short-circuits to no match).
    :param key: The (user_id, email) bucket to search.
    :param index: Mapping of keys to event lists, each event carrying "_ts".
    :param window_seconds: Maximum allowed distance in seconds.
    :return: The closest matching event, or None when nothing qualifies.
    """
    if ts is None:
        return None
    events = index.get(key) or []
    # Pair each candidate with its absolute distance from the reference time,
    # keeping only those inside the window.
    eligible = [
        (abs((ts - event["_ts"]).total_seconds()), event)
        for event in events
    ]
    eligible = [pair for pair in eligible if pair[0] <= window_seconds]
    if not eligible:
        return None
    # min() returns the first minimal pair, matching the original
    # first-wins tie-breaking on equal distances.
    return min(eligible, key=lambda pair: pair[0])[1]
832
+
833
+
834
def fill_network_fields(row: Dict, src_event: Dict) -> None:
    """
    Backfill missing network metadata on ``row`` from ``src_event``.

    Only fields currently "missing" on the row (per the module-level
    ``_MISSING`` definition) are written; existing values are preserved.
    IP may come from several alternative keys; country/region fall back to
    the event's nested "context" dict.

    :param row: The normalized audit row to enrich (mutated in place).
    :param src_event: The source event to draw network fields from.
    :return: None
    """

    if not isinstance(src_event, dict):
        return

    context = src_event.get("context") or {}
    candidates = {
        "ip_address": _coalesce(
            src_event.get("ipAddress"),
            src_event.get("sourceIp"),
            src_event.get("ip"),
            context.get("ipAddress"),
        ),
        "country_code": _coalesce(
            src_event.get("countryCode"),
            context.get("countryCode"),
        ),
        "region_code": _coalesce(
            src_event.get("regionCode"),
            context.get("regionCode"),
        ),
    }

    for field, value in candidates.items():
        if row.get(field) in _MISSING and value not in _MISSING:
            row[field] = value
868
+
869
+
870
def normalize_audit_event(ev: Dict) -> Dict:
    """
    Normalize an audit event by extracting relevant fields into a standard format.

    The normalized format includes the following fields:

    - audit_id: The unique identifier for the audit event.
    - category: The category of the audit event.
    - sub_category: The sub-category of the audit event.
    - action: The action taken in the audit event.
    - target_object_id: The ID of the object targeted in the audit event.
    - user_id: The ID of the user who triggered the audit event.
    - user_email: The email address of the user who triggered the audit event.
    - hubspot_occured_at: When the event occurred in HubSpot. NOTE: the key
      is (mis)spelled "occured" on purpose here — downstream enrichment
      reads this exact key, so do not rename without migrating all readers.
    - ip_address: The IP address associated with the audit event.
    - country_code: The country code associated with the audit event.
    - region_code: The region code associated with the audit event.

    :param ev: The audit event to normalize.
    :return: A dictionary containing the normalized audit event fields.
    """
    actor = safe_get_actor(ev)
    return {"audit_id": ev.get("id"),
            "category": ev.get("category"),
            "sub_category": ev.get("subCategory"),
            "action": ev.get("action"),
            "target_object_id": ev.get("targetObjectId") or ev.get("objectId"),
            "user_id": actor.get("userId"),
            "user_email": actor.get("userEmail"),
            "hubspot_occured_at": ev.get("occurredAt") or ev.get("timestamp"),
            "ip_address": ev.get("ipAddress"),
            "country_code": ev.get("countryCode"),
            "region_code": ev.get("regionCode"),
            }
904
+
905
+
906
def enrich_audit_row_by_category(row: Dict,
                                 login_idx: Dict[Tuple[Optional[str], Optional[str]], List[Dict]],
                                 security_idx: Dict[Tuple[Optional[str], Optional[str]], List[Dict]],
                                 match_window_seconds: int = 300) -> Dict:
    """
    Enrich an audit row with network/user data from the closest matching
    login or critical-action event.

    Rows whose category is neither LOGIN nor CRITICAL_ACTION are returned
    untouched. Otherwise the (user_id, user_email) key and the row's
    "hubspot_occured_at" timestamp are used to find the nearest event in
    the matching index within ``match_window_seconds``.

    :param row: The audit event row to enrich (mutated in place).
    :param login_idx: Mapping of user keys to time-sorted login events.
    :param security_idx: Mapping of user keys to time-sorted critical-action events.
    :param match_window_seconds: Maximum match distance in seconds.
    :return: The (possibly enriched) audit event row.
    """
    category = (row.get("category") or "").upper()
    if category not in ("LOGIN", "CRITICAL_ACTION"):
        return row

    raw_uid = row.get("user_id")
    lookup_key = (str(raw_uid) if raw_uid is not None else None,
                  row.get("user_email"))
    occurred = parse_ts_dt(row.get("hubspot_occured_at"))

    source_index = login_idx if category == "LOGIN" else security_idx
    match = find_best_time_match(
        occurred, lookup_key, source_index, match_window_seconds)

    if match:
        fill_network_fields(row, match)
        # Optional backfill of user fields from the matched event.
        if row.get("user_email") in _MISSING and match.get("email") not in _MISSING:
            row["user_email"] = match.get("email")
        if row.get("user_id") in _MISSING and match.get("userId") not in _MISSING:
            row["user_id"] = match.get("userId")

    return row
python/load_hubspot_data.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unified orchestrator for HubSpot → Supabase pipelines with a timestamp cursor.
3
+
4
+ CLI:
5
+ # epoch ms
6
+ python hubspot_orchestrator.py 1754025600000
7
+ # ISO-8601
8
+ python hubspot_orchestrator.py 2025-08-01T09:30:00Z
9
+ # Back-compat date (floors to 00:00Z)
10
+ python hubspot_orchestrator.py 2025-08-01
11
+ # No arg: defaults to today@00:00Z
12
+ """
13
+
14
+ import sys
15
+ import re
16
+ import logging
17
+ import datetime
18
+
19
+ # Pipelines (each exposes main(since_ms: Optional[int]) and can fall back to env)
20
+ import hubspot_deals
21
+ import hubspot_emails
22
+ import hubspot_tickets
23
+ import hubspot_contacts
24
+ import hubspot_companies
25
+ import hubspot_billing
26
+ import hubspot_audit
27
+
28
# Log to a per-day file under logs/; the directory must already exist
# (logging raises FileNotFoundError at import time otherwise — TODO confirm
# a logs/ directory is created by deployment).
logging.basicConfig(
    filename=f"logs/hubspot_unified_orchestrator_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
    filemode="a",
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
34
+
35
+ # ---------------------------
36
+ # Time parsing helpers
37
+ # ---------------------------
38
+
39
+
40
+ def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
41
+ if dt.tzinfo is None:
42
+ dt = dt.replace(tzinfo=datetime.timezone.utc)
43
+ return dt.astimezone(datetime.timezone.utc)
44
+
45
+
46
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Return *dt* normalized to UTC and truncated to 00:00:00.000000."""
    normalized = _ensure_utc(dt)
    return normalized.replace(hour=0, minute=0, second=0, microsecond=0)
49
+
50
+
51
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """Parse an ISO-8601 string (a trailing 'Z' is accepted) into a UTC datetime."""
    text = value
    if isinstance(text, str) and text.endswith("Z"):
        # fromisoformat() on older Pythons rejects 'Z'; use the offset form.
        text = text[:-1] + "+00:00"
    return _ensure_utc(datetime.datetime.fromisoformat(text))
56
+
57
+
58
def to_epoch_ms(dt_or_str) -> int:
    """
    Convert an ISO-8601 string or datetime into epoch milliseconds (UTC).

    :param dt_or_str: ISO-8601 string or datetime (naive assumed UTC).
    :return: Milliseconds since the Unix epoch.
    :raises TypeError: For any other input type.
    """
    if isinstance(dt_or_str, datetime.datetime):
        as_utc = _ensure_utc(dt_or_str)
    elif isinstance(dt_or_str, str):
        as_utc = _parse_iso_like_to_dt(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    return int(as_utc.timestamp() * 1000)
66
+
67
+
68
def parse_since_arg_to_ms() -> int:
    """
    Parse the optional CLI cursor argument (sys.argv[1]) into epoch ms.

    Accepts:
      - integer epoch ms (or seconds; seconds auto *1000)
      - ISO-8601 (Z or offset)
      - YYYY-MM-DD (floors to 00:00Z)
    If not provided, defaults to today@00:00Z.

    :return: Cursor timestamp in epoch milliseconds.
    """
    if len(sys.argv) > 1:
        arg = sys.argv[1].strip()

        # epoch seconds or ms
        if re.fullmatch(r"\d{10,13}", arg):
            v = int(arg)
            # fix: the old threshold (1e13) was above every current epoch-ms
            # value (~1.7e12), so 13-digit ms inputs were wrongly multiplied
            # by 1000. Epoch seconds stay below 1e10 until year 2286, while
            # epoch ms have exceeded 1e10 since April 1970 — use that gap.
            if v < 10_000_000_000:  # seconds -> ms
                v *= 1000
            return v

        # YYYY-MM-DD
        if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
            d = datetime.datetime.strptime(
                arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
            return to_epoch_ms(floor_to_utc_midnight(d))

        # ISO-8601
        try:
            return to_epoch_ms(arg)
        except Exception:
            # fix: the argument is positional, not a --since flag.
            print("Invalid 'since' argument. Use epoch ms, ISO-8601, or YYYY-MM-DD.")
            sys.exit(1)

    # default: today@00:00Z
    today0 = floor_to_utc_midnight(
        datetime.datetime.now(datetime.timezone.utc))
    return to_epoch_ms(today0)
103
+
104
+ # ---------------------------
105
+ # Main
106
+ # ---------------------------
107
+
108
+
109
def main():
    """
    Runs pipelines in order with a shared timestamp cursor.
    Each pipeline may internally advance its own stored cursor.

    A failure in one pipeline is logged and does not stop the remaining
    pipelines from running.
    """
    since_ms = parse_since_arg_to_ms()
    print(f"=== Running HubSpot sync pipeline since_ms={since_ms} ===")

    # (label, module) pairs define the execution order.
    pipelines = [
        ("Companies", hubspot_companies),
        ("Contacts", hubspot_contacts),
        ("Deals", hubspot_deals),
        ("Tickets", hubspot_tickets),
        ("Emails", hubspot_emails),
        ("Billing", hubspot_billing),
        # fix: step 7 was mislabeled "Billing" and logged as the billing
        # pipeline even though it runs hubspot_audit.
        ("Audit", hubspot_audit),
    ]
    total = len(pipelines)
    for step, (label, module) in enumerate(pipelines, start=1):
        print(f"\n[{step}/{total}] {label}")
        try:
            module.main(since_ms=since_ms)
        except Exception as e:
            logging.exception("Error running %s pipeline: %s",
                              label.lower(), e)

    print("\n=== HubSpot sync complete ===")
160
+
161
+
162
# Script entry point: run the orchestrator only when invoked directly.
if __name__ == "__main__":
    main()
python/supabase_utils.py ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This script contains utility functions for working with Supabase.
3
+ """
4
+ import time
5
+ import datetime
6
+ import json
7
+ import logging
8
+ import httpx
9
+ from storage3.exceptions import StorageApiError
10
+ from typing import List, Dict, Any, Optional
11
+
12
+ try:
13
+ from postgrest import APIError # supabase-py v2
14
+ except Exception: # pragma: no cover
15
+ APIError = Exception
16
+ try:
17
+ # supabase-py v2
18
+ from postgrest.types import ReturnMethod # Returning.MINIMAL
19
+ _RETURN_MINIMAL = ReturnMethod.MINIMAL
20
+ except Exception:
21
+ # supabase-py v1 fallback
22
+ _RETURN_MINIMAL = "minimal"
23
+
24
+
25
+ def insert_into_supabase_table(
26
+ supabase_client: Any,
27
+ table_name: str,
28
+ rows: List[Dict[str, Any]],
29
+ on_conflict: Optional[List[str]] = None,
30
+ *,
31
+ max_retries: int = 5,
32
+ backoff_base_seconds: float = 1.0,
33
+ use_returning_minimal: bool = True,
34
+ ) -> None:
35
+ """
36
+ Insert a list of rows into a Supabase table with optional conflict handling.
37
+
38
+ :param supabase_client: Supabase client
39
+ :param table_name: Name of the table to upsert into
40
+ :param rows: List of dictionaries to insert; each dictionary becomes a row
41
+ :param on_conflict: List of column names to use as conflict target
42
+ :param max_retries: Maximum number of times to retry on failure
43
+ :param backoff_base_seconds: Base amount of time to sleep between retries
44
+ :param use_returning_minimal: Whether to use returning=MINIMAL (faster but no inserted IDs)
45
+ :return: None. If no rows are provided, returns None immediately without attempting an insert.
46
+
47
+ This function will retry up to `max_retries` times on failure with an exponential
48
+ backoff schedule. If `on_conflict` is provided, it will use the given columns as the
49
+ conflict target. If `use_returning_minimal`, it will use the `returning=MINIMAL` parameter
50
+ to reduce response size (but won't get inserted IDs back).
51
+
52
+ NOTE: This function does not support transactions; it is intended for use in simple
53
+ data pipelines where retrying on failure is sufficient. If you need stronger
54
+ consistency guarantees, use transactions or another approach.
55
+ """
56
+
57
+ if not rows:
58
+ logging.info("No rows to insert for table %s.", table_name)
59
+ return None
60
+
61
+ q = supabase_client.table(table_name)
62
+ conflict_target = ",".join(on_conflict) if on_conflict else None
63
+
64
+ if conflict_target and use_returning_minimal:
65
+ query = q.upsert(rows, on_conflict=conflict_target,
66
+ returning=_RETURN_MINIMAL)
67
+ elif conflict_target:
68
+ query = q.upsert(rows, on_conflict=conflict_target)
69
+ elif use_returning_minimal:
70
+ query = q.upsert(rows, returning=_RETURN_MINIMAL)
71
+ else:
72
+ query = q.upsert(rows)
73
+
74
+ attempt = 0
75
+ while True:
76
+ try:
77
+ query.execute()
78
+ logging.info("Inserted %s records into %s.", len(rows), table_name)
79
+ return True
80
+
81
+ except APIError as e:
82
+ code = getattr(e, "code", None)
83
+ attempt += 1
84
+
85
+ if attempt > max_retries:
86
+ logging.error(
87
+ "Permanent failure upserting into %s after %d attempts: %s",
88
+ table_name, attempt, e
89
+ )
90
+ return False
91
+
92
+ sleep_for = backoff_base_seconds * (2 ** (attempt - 1))
93
+ if code == "57014":
94
+ logging.warning(
95
+ "Timeout (57014) upserting into %s (attempt %d/%d). Retrying in %.1fs.",
96
+ table_name, attempt, max_retries, sleep_for
97
+ )
98
+ else:
99
+ logging.warning(
100
+ "APIError upserting into %s (attempt %d/%d): %s. Retrying in %.1fs.",
101
+ table_name, attempt, max_retries, e, sleep_for
102
+ )
103
+ time.sleep(sleep_for)
104
+
105
+ except (httpx.RequestError, httpx.HTTPStatusError) as e:
106
+ attempt += 1
107
+ if attempt > max_retries:
108
+ logging.error(
109
+ "Permanent HTTP failure upserting into %s after %d attempts: %s",
110
+ table_name, attempt, e
111
+ )
112
+ return False
113
+
114
+ sleep_for = backoff_base_seconds * (2 ** (attempt - 1))
115
+ logging.warning(
116
+ "HTTP error upserting into %s (attempt %d/%d): %s. Retrying in %.1fs.",
117
+ table_name, attempt, max_retries, e, sleep_for
118
+ )
119
+ time.sleep(sleep_for)
120
+
121
+ except Exception as e:
122
+ attempt += 1
123
+ if attempt > max_retries:
124
+ logging.error(
125
+ "Permanent unexpected failure upserting into %s after %d attempts: %s",
126
+ table_name, attempt, e
127
+ )
128
+ return False
129
+
130
+ sleep_for = backoff_base_seconds * (2 ** (attempt - 1))
131
+ logging.warning(
132
+ "Unexpected error upserting into %s (attempt %d/%d): %s. Retrying in %.1fs.",
133
+ table_name, attempt, max_retries, e, sleep_for
134
+ )
135
+ time.sleep(sleep_for)
136
+
137
+
138
def batched_insert(
    supabase_client,
    table_name: str,
    rows: List[Dict[str, Any]],
    batch_size: int = 500,
    on_conflict: Optional[List[str]] = None,
    max_retries: int = 4,
    backoff_base_seconds: float = 1.0,
    min_batch_size: int = 50,
) -> list:
    """
    Upsert `rows` into a Supabase table in batches, with retries and adaptive sizing.

    :param supabase_client: Supabase client
    :param table_name: Name of the table to upsert into
    :param rows: List of dictionaries to insert; each dictionary becomes a row
    :param batch_size: Maximum number of rows to send in a single upsert request
    :param on_conflict: List of column names to use as the conflict target
    :param max_retries: Maximum number of retries per batch before re-raising
    :param backoff_base_seconds: Base sleep time for exponential backoff
    :param min_batch_size: Floor for the adaptive batch size; on a Postgres
        statement timeout (code 57014) the batch size is halved down to this floor
    :return: List of responses from Supabase (empty if no rows to insert)

    Each batch is retried up to `max_retries` times with exponential backoff.
    A statement timeout shrinks the working batch size (permanently, for all
    remaining batches) and retries the same window without consuming a retry
    attempt. If a batch still fails after all retries, the last exception is
    re-raised. An empty `rows` list returns [] without any requests.
    """

    if not rows:
        return []

    # Hoisted out of the retry loop: `on_conflict` in supabase-py v2 expects a
    # comma-separated string, and the target never changes between attempts.
    conflict_target = ",".join(on_conflict) if on_conflict else None

    results = []
    n = len(rows)
    i = 0

    while i < n:
        current_batch_size = min(batch_size, n - i)
        batch = rows[i: i + current_batch_size]

        # Attempt to insert this batch with retries.
        attempt = 0
        while True:
            try:
                # Build a fresh query each attempt; request builders are not
                # safely reusable across requests.
                q = supabase_client.table(table_name)
                if conflict_target:
                    resp = q.upsert(batch, on_conflict=conflict_target,
                                    returning=_RETURN_MINIMAL).execute()
                else:
                    resp = q.upsert(batch, returning=_RETURN_MINIMAL).execute()

                results.append(resp)
                logging.info(
                    f"Inserted batch rows {i}–{i + len(batch)} into {table_name}")
                i += len(batch)  # advance window
                break  # batch succeeded, move to next

            except APIError as e:
                if getattr(e, "code", None) == "57014" and current_batch_size > min_batch_size:
                    # Statement timeout: halve the batch (not below the floor)
                    # and retry the same window. This intentionally does not
                    # consume a retry attempt; `attempt` resets on re-entry.
                    half = max(min_batch_size, current_batch_size // 2)
                    logging.warning(
                        f"Timeout on {table_name} rows {i}–{i + current_batch_size}. "
                        f"Shrinking batch to {half} and retrying."
                    )
                    batch_size = half
                    time.sleep(backoff_base_seconds)
                    break

                # Other API errors, or batch already at min size: retry the
                # same window with exponential backoff.
                attempt += 1
                if attempt > max_retries:
                    logging.error(
                        f"Failed inserting batch rows {i}–{i + current_batch_size} into {table_name} "
                        f"after {max_retries} retries: {e}"
                    )
                    raise

                sleep_for = backoff_base_seconds * (2 ** (attempt - 1))
                logging.warning(
                    f"Error inserting batch rows {i}–{i + current_batch_size} into {table_name} "
                    f"(attempt {attempt}/{max_retries}): {e}. Retrying in {sleep_for:.1f}s."
                )
                time.sleep(sleep_for)

            except Exception as e:
                # Non-API errors (network, serialization, ...): same backoff.
                attempt += 1
                if attempt > max_retries:
                    logging.error(
                        f"Failed inserting batch rows {i}–{i + current_batch_size} into {table_name} "
                        f"after {max_retries} retries: {e}"
                    )
                    raise

                sleep_for = backoff_base_seconds * (2 ** (attempt - 1))
                logging.warning(
                    f"Unexpected error inserting batch rows {i}–{i + current_batch_size} into {table_name} "
                    f"(attempt {attempt}/{max_retries}): {e}. Retrying in {sleep_for:.1f}s."
                )
                time.sleep(sleep_for)

    return results
250
+
251
+
252
def upload_raw_json_to_supabase(supabase_client, json_data, object_type="contacts"):
    """
    Uploads raw JSON data to Supabase storage under a specified object type directory.

    :param supabase_client: The Supabase client object.
    :param json_data: The JSON data to be uploaded (must be JSON-serializable;
        non-serializable values are stringified via ``default=str``).
    :param object_type: Object type, used as the directory prefix of the storage path.
    :return: The path where the JSON file is stored in Supabase. Note: the path
        is returned even if the upload fails; failures are only logged.
    """

    now_str = datetime.datetime.now(
        datetime.timezone.utc).strftime("%Y%m%d_%H%M%S")
    path = f"{object_type}/{now_str}.json"
    file_bytes = json.dumps(json_data, indent=2, default=str).encode("utf-8")

    bucket = supabase_client.storage.from_("hubspot-raw-data")

    # Best-effort pre-delete so a same-second re-run can overwrite. A failure
    # here (object missing, transient network error) must not abort the upload,
    # which the previous unguarded call did.
    try:
        bucket.remove([path])
    except Exception as e:
        logging.debug("Pre-delete of %s failed (ignored): %s", path, e)

    try:
        bucket.upload(
            path, file=file_bytes, file_options={
                "content-type": "application/json"}
        )
    except StorageApiError as e:
        # NOTE(review): assumes StorageApiError exposes `status_code` — confirm
        # against the installed supabase/storage3 version.
        if e.status_code == 413:
            logging.warning(
                "Upload failed: payload too large for Supabase Storage.")
        else:
            logging.error("Storage API error during upload: %s", e)
    except httpx.RequestError as e:
        logging.error("Upload error: %s", e)
    return path
282
+
283
+
284
def get_last_sync_time(supabase_client, object_type):
    """
    Look up the most recent sync timestamp recorded for an object type.

    :param supabase_client: The Supabase client object.
    :param object_type: Object type whose last sync time should be fetched
        from the hubspot_sync_metadata table.
    :return: The stored ``last_sync_time`` parsed into a datetime, or None
        when no prior sync has been recorded (or the stored value is empty).
    """

    response = (
        supabase_client.table("hubspot_sync_metadata")
        .select("last_sync_time")
        .eq("object_type", object_type)
        .execute()
    )

    rows = response.data
    if not rows:
        return None

    raw_value = rows[0]["last_sync_time"]
    if not raw_value:
        return None

    return datetime.datetime.fromisoformat(raw_value)
302
+
303
+
304
def update_sync_metadata(supabase_client, object_type, sync_time):
    """
    Record the latest sync time for an object type in hubspot_sync_metadata.

    :param supabase_client: The Supabase client object.
    :param object_type: Object type whose sync metadata is being updated.
    :param sync_time: The last sync time to persist.
    :return: The result of the underlying ``insert_into_supabase_table`` upsert.
    """

    stamped_at = datetime.datetime.now(datetime.timezone.utc).isoformat()
    record = {
        "object_type": object_type,
        "last_sync_time": sync_time,
        "updated_at": stamped_at,
    }

    return insert_into_supabase_table(
        supabase_client, "hubspot_sync_metadata", [record])
321
+
322
+
323
def get_existing_email_content_ids(supabase_client):
    """
    Fetch the set of email_id values already present in hubspot_email_contents.

    :param supabase_client: Supabase client
    :return: A set of email_id values already in the database

    Pages through the table 1000 rows at a time, explicitly ordered by
    email_id: without an ORDER BY, Postgres gives no row-order guarantee,
    so offset pagination could silently skip or duplicate rows between pages.
    """
    existing_ids = set()
    page_size = 1000
    offset = 0

    while True:
        res = (
            supabase_client.table("hubspot_email_contents")
            .select("email_id")
            .order("email_id")
            .range(offset, offset + page_size - 1)
            .execute()
        )
        if not res.data:
            break
        existing_ids.update(row["email_id"] for row in res.data)
        if len(res.data) < page_size:
            # A short page means we've reached the end; skip the extra round trip.
            break
        offset += page_size

    return existing_ids
346
+
347
+
348
def fetch_supabase_table(
    supabase_client, table_name="hubspot_contacts",
    id_column="contact_id"
):
    """
    Fetches all rows from a Supabase table and returns them as a dict.

    :param supabase_client: Supabase client instance
    :param table_name: Name of the table to fetch from
    :param id_column: Unique ID column used as the key of the returned dict
        (keys are stringified) and as the pagination sort key
    :return: A dict mapping str(row[id_column]) -> full row

    Pages through the table 1000 rows at a time, explicitly ordered by
    `id_column`: without an ORDER BY, offset pagination has no stable row
    order in Postgres and could skip or duplicate rows between pages.
    """

    all_rows = {}
    page = 0
    page_size = 1000

    while True:
        res = (
            supabase_client.table(table_name)
            .select("*")
            .order(id_column)
            .range(page * page_size, (page + 1) * page_size - 1)
            .execute()
        )
        if not res.data:
            break
        for row in res.data:
            all_rows[str(row[id_column])] = row
        if len(res.data) < page_size:
            # Short page: last page reached, avoid one extra round trip.
            break
        page += 1

    return all_rows
377
+
378
+
379
def enrich_supabase_row(base_row: dict) -> dict:
    """
    Add the duplicate-detection bookkeeping columns to a Supabase row.

    Mutates `base_row` in place and also returns it for convenience.

    :param base_row: The row to enrich.
    :return: The same dict, with the duplicate_* columns reset to None,
        is_primary set to True, and updated_at stamped with the current UTC time.
    """

    stamp = datetime.datetime.now(datetime.timezone.utc).isoformat()
    base_row["duplicate_id"] = None
    base_row["duplicate_status"] = None
    base_row["duplicate_action"] = None
    base_row["is_primary"] = True
    base_row["updated_at"] = stamp
    return base_row
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ python-dotenv==1.1.0
2
+ hubspot-api-client==12.0.0
3
+ supabase==2.16.0
4
+ httpx==0.28.1
5
+ pandas==2.2.2
6
+ openpyxl==3.1.5
7
+ phonenumbers
8
+ tldextract
9
+ rapidfuzz