Spaces:
Paused
Paused
niwayandm commited on
Commit ·
a16378e
1
Parent(s): 4f74457
Initial files commit
Browse files- .gitattributes +2 -0
- .gitignore +1 -0
- Dockerfile +115 -0
- README.md +6 -4
- docker-custom-entrypoint.sh +41 -0
- fonts/HanyiSentyPagoda_Regular.ttf +3 -0
- logs/.gitkeep +0 -0
- python/hubspot_audit.py +252 -0
- python/hubspot_billing.py +454 -0
- python/hubspot_companies.py +475 -0
- python/hubspot_contacts.py +415 -0
- python/hubspot_deals.py +445 -0
- python/hubspot_emails.py +472 -0
- python/hubspot_tickets.py +491 -0
- python/hubspot_utils.py +946 -0
- python/load_hubspot_data.py +163 -0
- python/supabase_utils.py +394 -0
- requirements.txt +9 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
fonts/HanyiSentyPagoda[[:space:]]Regular.ttf filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
fonts/HanyiSentyPagoda_Regular.ttf filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
.env
|
Dockerfile
ADDED
|
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM node:24-alpine
|
| 2 |
+
|
| 3 |
+
# -------------------------
|
| 4 |
+
# Base + build args
|
| 5 |
+
# -------------------------
|
| 6 |
+
USER root
|
| 7 |
+
ARG N8N_PATH=/usr/local/lib/node_modules/n8n
|
| 8 |
+
ARG BASE_PATH=/root/.n8n
|
| 9 |
+
ARG DATABASE_PATH=$BASE_PATH/database
|
| 10 |
+
ARG CONFIG_PATH=$BASE_PATH/config
|
| 11 |
+
ARG WORKFLOWS_PATH=$BASE_PATH/workflows
|
| 12 |
+
ARG LOGS_PATH=$BASE_PATH/logs
|
| 13 |
+
|
| 14 |
+
# Optional args passed at build time (no secrets here)
|
| 15 |
+
ARG N8N_ENFORCE_SETTINGS_FILE_PERMISSIONS=$N8N_ENFORCE_SETTINGS_FILE_PERMISSIONS
|
| 16 |
+
ARG N8N_HOST=$N8N_HOST
|
| 17 |
+
ARG N8N_PORT=$N8N_PORT
|
| 18 |
+
ARG N8N_PROTOCOL=https
|
| 19 |
+
ARG N8N_EDITOR_BASE_URL=$N8N_EDITOR_BASE_URL
|
| 20 |
+
ARG WEBHOOK_URL=$WEBHOOK_URL
|
| 21 |
+
ARG GENERIC_TIMEZONE=$GENERIC_TIMEZONE
|
| 22 |
+
ARG TZ=$TZ
|
| 23 |
+
ARG N8N_ENCRYPTION_KEY=$N8N_ENCRYPTION_KEY
|
| 24 |
+
ARG DB_TYPE=$DB_TYPE
|
| 25 |
+
ARG DB_POSTGRESDB_SCHEMA=$DB_POSTGRESDB_SCHEMA
|
| 26 |
+
ARG DB_POSTGRESDB_HOST=$DB_POSTGRESDB_HOST
|
| 27 |
+
ARG DB_POSTGRESDB_DATABASE=$DB_POSTGRESDB_DATABASE
|
| 28 |
+
ARG DB_POSTGRESDB_PORT=$DB_POSTGRESDB_PORT
|
| 29 |
+
ARG DB_POSTGRESDB_USER=$DB_POSTGRESDB_USER
|
| 30 |
+
ARG DB_POSTGRESDB_PASSWORD=$DB_POSTGRESDB_PASSWORD
|
| 31 |
+
|
| 32 |
+
# -------------------------
|
| 33 |
+
# System deps
|
| 34 |
+
# -------------------------
|
| 35 |
+
RUN apk add --no-cache \
|
| 36 |
+
bash \
|
| 37 |
+
git \
|
| 38 |
+
python3 \
|
| 39 |
+
py3-pip \
|
| 40 |
+
make \
|
| 41 |
+
g++ \
|
| 42 |
+
build-base \
|
| 43 |
+
cairo-dev \
|
| 44 |
+
pango-dev \
|
| 45 |
+
chromium \
|
| 46 |
+
postgresql-client \
|
| 47 |
+
ffmpeg \
|
| 48 |
+
yt-dlp
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
# Tell Puppeteer to skip installing Chrome.
|
| 52 |
+
ENV PUPPETEER_SKIP_DOWNLOAD=true
|
| 53 |
+
ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser
|
| 54 |
+
|
| 55 |
+
RUN yarn add puppeteer@24.27.0
|
| 56 |
+
|
| 57 |
+
# Install n8n-nodes-puppeteer in a permanent location
|
| 58 |
+
RUN mkdir -p /opt/n8n-custom-nodes && \
|
| 59 |
+
cd /opt/n8n-custom-nodes && \
|
| 60 |
+
npm install n8n-nodes-puppeteer && \
|
| 61 |
+
chown -R node:node /opt/n8n-custom-nodes
|
| 62 |
+
|
| 63 |
+
# Copy our custom entrypoint
|
| 64 |
+
COPY docker-custom-entrypoint.sh /docker-custom-entrypoint.sh
|
| 65 |
+
RUN chmod +x /docker-custom-entrypoint.sh && \
|
| 66 |
+
chown node:node /docker-custom-entrypoint.sh
|
| 67 |
+
|
| 68 |
+
# -------------------------
|
| 69 |
+
# n8n
|
| 70 |
+
# -------------------------
|
| 71 |
+
RUN npm install -g n8n@1.118.1
|
| 72 |
+
|
| 73 |
+
# -------------------------
|
| 74 |
+
# Python venv + requirements
|
| 75 |
+
# -------------------------
|
| 76 |
+
RUN python3 -m venv /opt/venv
|
| 77 |
+
ENV PATH="/opt/venv/bin:$PATH"
|
| 78 |
+
|
| 79 |
+
# Python dependencies
|
| 80 |
+
COPY requirements.txt /tmp/requirements.txt
|
| 81 |
+
RUN pip install --no-cache-dir -r /tmp/requirements.txt
|
| 82 |
+
|
| 83 |
+
# Python code
|
| 84 |
+
COPY python/ /app/python/
|
| 85 |
+
|
| 86 |
+
# -------------------------
|
| 87 |
+
# Runtime layout
|
| 88 |
+
# -------------------------
|
| 89 |
+
# Persist n8n data under /data (enable “Persistent storage” in Space settings)
|
| 90 |
+
ENV N8N_USER_FOLDER="/data" \
|
| 91 |
+
N8N_LISTEN_ADDRESS="0.0.0.0" \
|
| 92 |
+
N8N_PORT=7860 \
|
| 93 |
+
N8N_PROTOCOL="http"
|
| 94 |
+
|
| 95 |
+
# Create standard dirs + logs
|
| 96 |
+
RUN mkdir -p $DATABASE_PATH $CONFIG_PATH $WORKFLOWS_PATH $LOGS_PATH \
|
| 97 |
+
&& mkdir -p /data/logs /app/logs \
|
| 98 |
+
&& chmod -R 777 $BASE_PATH /data /app
|
| 99 |
+
|
| 100 |
+
COPY logs/ /data/logs/
|
| 101 |
+
|
| 102 |
+
# -------------------------
|
| 103 |
+
# Non-root user for Chromium
|
| 104 |
+
# -------------------------
|
| 105 |
+
RUN addgroup -S pptruser && adduser -S -G pptruser pptruser \
|
| 106 |
+
&& mkdir -p /home/pptruser/Downloads \
|
| 107 |
+
&& chown -R pptruser:pptruser /home/pptruser /data /app
|
| 108 |
+
|
| 109 |
+
USER pptruser
|
| 110 |
+
|
| 111 |
+
WORKDIR /data
|
| 112 |
+
EXPOSE 7860
|
| 113 |
+
|
| 114 |
+
# Start n8n
|
| 115 |
+
CMD ["n8n", "start"]
|
README.md
CHANGED
|
@@ -1,10 +1,12 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
|
|
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: N8N
|
| 3 |
+
emoji: ⚡
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: gray
|
| 6 |
sdk: docker
|
| 7 |
pinned: false
|
| 8 |
+
license: mit
|
| 9 |
+
short_description: Free n8n with Supabase
|
| 10 |
---
|
| 11 |
|
| 12 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
docker-custom-entrypoint.sh
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/sh
|
| 2 |
+
|
| 3 |
+
print_banner() {
|
| 4 |
+
echo "----------------------------------------"
|
| 5 |
+
echo "n8n Puppeteer Node - Environment Details"
|
| 6 |
+
echo "----------------------------------------"
|
| 7 |
+
echo "Node.js version: $(node -v)"
|
| 8 |
+
echo "n8n version: $(n8n --version)"
|
| 9 |
+
|
| 10 |
+
# Get Chromium version specifically from the path we're using for Puppeteer
|
| 11 |
+
CHROME_VERSION=$("$PUPPETEER_EXECUTABLE_PATH" --version 2>/dev/null || echo "Chromium not found")
|
| 12 |
+
echo "Chromium version: $CHROME_VERSION"
|
| 13 |
+
|
| 14 |
+
# Get Puppeteer version if installed
|
| 15 |
+
PUPPETEER_PATH="/opt/n8n-custom-nodes/node_modules/n8n-nodes-puppeteer"
|
| 16 |
+
if [ -f "$PUPPETEER_PATH/package.json" ]; then
|
| 17 |
+
PUPPETEER_VERSION=$(node -p "require('$PUPPETEER_PATH/package.json').version")
|
| 18 |
+
echo "n8n-nodes-puppeteer version: $PUPPETEER_VERSION"
|
| 19 |
+
|
| 20 |
+
# Try to resolve puppeteer package from the n8n-nodes-puppeteer directory
|
| 21 |
+
CORE_PUPPETEER_VERSION=$(cd "$PUPPETEER_PATH" && node -e "try { const version = require('puppeteer/package.json').version; console.log(version); } catch(e) { console.log('not found'); }")
|
| 22 |
+
echo "Puppeteer core version: $CORE_PUPPETEER_VERSION"
|
| 23 |
+
else
|
| 24 |
+
echo "n8n-nodes-puppeteer: not installed"
|
| 25 |
+
fi
|
| 26 |
+
|
| 27 |
+
echo "Puppeteer executable path: $PUPPETEER_EXECUTABLE_PATH"
|
| 28 |
+
echo "----------------------------------------"
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
+
# Add custom nodes to the NODE_PATH
|
| 32 |
+
if [ -n "$N8N_CUSTOM_EXTENSIONS" ]; then
|
| 33 |
+
export N8N_CUSTOM_EXTENSIONS="/opt/n8n-custom-nodes:${N8N_CUSTOM_EXTENSIONS}"
|
| 34 |
+
else
|
| 35 |
+
export N8N_CUSTOM_EXTENSIONS="/opt/n8n-custom-nodes"
|
| 36 |
+
fi
|
| 37 |
+
|
| 38 |
+
print_banner
|
| 39 |
+
|
| 40 |
+
# Execute the original n8n entrypoint script
|
| 41 |
+
exec /docker-entrypoint.sh "$@"
|
fonts/HanyiSentyPagoda_Regular.ttf
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:754eb3539249898f9daffc23b0068561bb9a618a79d517708513b5119fd7e6e5
|
| 3 |
+
size 9982284
|
logs/.gitkeep
ADDED
|
File without changes
|
python/hubspot_audit.py
ADDED
|
@@ -0,0 +1,252 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HubSpot Audit → Supabase (incremental since a millisecond cursor)
|
| 3 |
+
|
| 4 |
+
Usage from orchestrator:
|
| 5 |
+
import load_hubspot_companies
|
| 6 |
+
load_hubspot_companies.main(since_ms=<int milliseconds since epoch UTC>)
|
| 7 |
+
|
| 8 |
+
Direct CLI:
|
| 9 |
+
# epoch ms
|
| 10 |
+
python load_hubspot_companies.py 1754025600000
|
| 11 |
+
# ISO-8601
|
| 12 |
+
python load_hubspot_companies.py 2025-08-01T09:30:00Z
|
| 13 |
+
# Back-compat date (floors to 00:00Z)
|
| 14 |
+
python load_hubspot_companies.py 2025-08-01
|
| 15 |
+
"""
|
| 16 |
+
import os
|
| 17 |
+
import logging
|
| 18 |
+
import datetime
|
| 19 |
+
from typing import Dict, List, Optional, Union
|
| 20 |
+
import re
|
| 21 |
+
from dotenv import load_dotenv
|
| 22 |
+
from supabase import create_client
|
| 23 |
+
from supabase_utils import batched_insert, update_sync_metadata
|
| 24 |
+
from hubspot_utils import (
|
| 25 |
+
to_epoch_ms_from_utc_iso,
|
| 26 |
+
page_account_activity,
|
| 27 |
+
build_login_index,
|
| 28 |
+
build_security_index,
|
| 29 |
+
normalize_audit_event,
|
| 30 |
+
enrich_audit_row_by_category,
|
| 31 |
+
deduplicate_by_key,
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
# Logging
|
| 35 |
+
logging.basicConfig(
|
| 36 |
+
filename=f"logs/hubspot_audit_logs_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
|
| 37 |
+
filemode="a",
|
| 38 |
+
level=logging.INFO,
|
| 39 |
+
format="%(asctime)s [%(levelname)s] %(message)s",
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
# Environment
|
| 43 |
+
load_dotenv()
|
| 44 |
+
HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
|
| 45 |
+
SUPABASE_URL = os.getenv("SUPABASE_URL")
|
| 46 |
+
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
|
| 47 |
+
|
| 48 |
+
if not HUBSPOT_TOKEN:
|
| 49 |
+
raise RuntimeError("HUBSPOT_TOKEN is required")
|
| 50 |
+
if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
|
| 51 |
+
raise RuntimeError(
|
| 52 |
+
"SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY are required")
|
| 53 |
+
|
| 54 |
+
supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
|
| 55 |
+
|
| 56 |
+
# Config
|
| 57 |
+
AUDITLOG_TABLE = os.getenv("HUBSPOT_AUDITLOG_TABLE", "hubspot_audits")
|
| 58 |
+
HUBSPOT_LIMIT = int(os.getenv("HUBSPOT_AUDITLOG_LIMIT", "100"))
|
| 59 |
+
HUBSPOT_MAX_PAGES = int(os.getenv("HUBSPOT_AUDITLOG_MAX_PAGES", "10000"))
|
| 60 |
+
MATCH_WINDOW_SECONDS = int(os.getenv("HUBSPOT_MATCH_WINDOW_SECONDS", "300"))
|
| 61 |
+
|
| 62 |
+
INITIAL_BACKOFF_SECONDS = float(os.getenv("HUBSPOT_BACKOFF_START", "1.0"))
|
| 63 |
+
MAX_BACKOFF_SECONDS = float(os.getenv("HUBSPOT_BACKOFF_MAX", "16.0"))
|
| 64 |
+
|
| 65 |
+
BOOTSTRAP_SINCE_MS_ENV = os.getenv(
|
| 66 |
+
"BOOTSTRAP_SINCE_MS_ENV") # may be epoch ms or ISO-8601
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def _today_midnight_ms_utc() -> int:
|
| 70 |
+
now = datetime.datetime.now(datetime.timezone.utc)
|
| 71 |
+
dt = now.replace(hour=0, minute=0, second=0, microsecond=0)
|
| 72 |
+
# hubspot_utils.to_epoch_ms_from_utc_iso expects ISO; format accordingly
|
| 73 |
+
return to_epoch_ms_from_utc_iso(dt.isoformat().replace("+00:00", "Z"))
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
|
| 77 |
+
if dt.tzinfo is None:
|
| 78 |
+
dt = dt.replace(tzinfo=datetime.timezone.utc)
|
| 79 |
+
return dt.astimezone(datetime.timezone.utc)
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
|
| 83 |
+
dt = _ensure_utc(dt)
|
| 84 |
+
return dt.replace(hour=0, minute=0, second=0, microsecond=0)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
|
| 88 |
+
if value.endswith("Z"):
|
| 89 |
+
value = value[:-1] + "+00:00"
|
| 90 |
+
dt = datetime.datetime.fromisoformat(value)
|
| 91 |
+
return _ensure_utc(dt)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
|
| 95 |
+
if isinstance(dt_or_str, str):
|
| 96 |
+
dt = _parse_iso_like_to_dt(dt_or_str)
|
| 97 |
+
elif isinstance(dt_or_str, datetime.datetime):
|
| 98 |
+
dt = _ensure_utc(dt_or_str)
|
| 99 |
+
else:
|
| 100 |
+
raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
|
| 101 |
+
return int(dt.timestamp() * 1000)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
BASE_AUDIT_URL = "https://api.hubapi.com/account-info/v3/activity/audit-logs"
|
| 105 |
+
BASE_SECURITY_URL = "https://api.hubspot.com/account-info/v3/activity/security".replace(
|
| 106 |
+
"hubspot.com", "hubapi.com") # ensure hubapi
|
| 107 |
+
BASE_LOGIN_URL = "https://api.hubapi.com/account-info/v3/activity/login"
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def fetch_streams(occurred_after_ms: int) -> Dict[str, List[dict]]:
|
| 111 |
+
"""
|
| 112 |
+
Fetch the three streams from HubSpot account-info API:
|
| 113 |
+
- Audit logs
|
| 114 |
+
- Login activity
|
| 115 |
+
- Security activity
|
| 116 |
+
|
| 117 |
+
We pass occurred_after_ms and omit occurred_before_ms (None).
|
| 118 |
+
"""
|
| 119 |
+
audit = page_account_activity(
|
| 120 |
+
base_url=BASE_AUDIT_URL,
|
| 121 |
+
token=HUBSPOT_TOKEN,
|
| 122 |
+
occurred_after_ms=occurred_after_ms,
|
| 123 |
+
occurred_before_ms=None,
|
| 124 |
+
limit=HUBSPOT_LIMIT,
|
| 125 |
+
max_pages=HUBSPOT_MAX_PAGES,
|
| 126 |
+
)
|
| 127 |
+
login = page_account_activity(
|
| 128 |
+
base_url=BASE_LOGIN_URL,
|
| 129 |
+
token=HUBSPOT_TOKEN,
|
| 130 |
+
occurred_after_ms=occurred_after_ms,
|
| 131 |
+
occurred_before_ms=None,
|
| 132 |
+
limit=HUBSPOT_LIMIT,
|
| 133 |
+
max_pages=HUBSPOT_MAX_PAGES,
|
| 134 |
+
)
|
| 135 |
+
security = page_account_activity(
|
| 136 |
+
base_url=BASE_SECURITY_URL,
|
| 137 |
+
token=HUBSPOT_TOKEN,
|
| 138 |
+
occurred_after_ms=occurred_after_ms,
|
| 139 |
+
occurred_before_ms=None,
|
| 140 |
+
limit=HUBSPOT_LIMIT,
|
| 141 |
+
max_pages=HUBSPOT_MAX_PAGES,
|
| 142 |
+
)
|
| 143 |
+
logging.info("Fetched counts: audit=%d login=%d security=%d",
|
| 144 |
+
len(audit), len(login), len(security))
|
| 145 |
+
return {"audit": audit, "login": login, "security": security}
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
def build_indices(login_events: List[dict], security_events: List[dict]):
|
| 149 |
+
login_idx = build_login_index(login_events)
|
| 150 |
+
security_idx = build_security_index(security_events)
|
| 151 |
+
return login_idx, security_idx
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
def normalize_and_enrich(audit_events: List[dict], login_idx, security_idx) -> List[dict]:
|
| 155 |
+
rows: List[dict] = []
|
| 156 |
+
for ev in audit_events:
|
| 157 |
+
row = normalize_audit_event(ev)
|
| 158 |
+
if not row.get("audit_id"):
|
| 159 |
+
continue
|
| 160 |
+
row = enrich_audit_row_by_category(
|
| 161 |
+
row, login_idx, security_idx, match_window_seconds=MATCH_WINDOW_SECONDS)
|
| 162 |
+
rows.append(row)
|
| 163 |
+
rows = deduplicate_by_key(rows, "audit_id")
|
| 164 |
+
return rows
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def upsert(rows: List[dict]) -> None:
|
| 168 |
+
if not rows:
|
| 169 |
+
logging.info("No rows to upsert.")
|
| 170 |
+
sync_time = datetime.datetime.now(datetime.timezone.utc)
|
| 171 |
+
update_sync_metadata(
|
| 172 |
+
supabase_client, AUDITLOG_TABLE, sync_time.isoformat())
|
| 173 |
+
return
|
| 174 |
+
|
| 175 |
+
batched_insert(
|
| 176 |
+
supabase_client,
|
| 177 |
+
AUDITLOG_TABLE,
|
| 178 |
+
rows,
|
| 179 |
+
batch_size=500,
|
| 180 |
+
on_conflict=["audit_id"],
|
| 181 |
+
)
|
| 182 |
+
logging.info("Upserted %d rows into %s", len(rows), AUDITLOG_TABLE)
|
| 183 |
+
|
| 184 |
+
sync_time = datetime.datetime.now(datetime.timezone.utc)
|
| 185 |
+
update_sync_metadata(supabase_client, AUDITLOG_TABLE,
|
| 186 |
+
sync_time.isoformat())
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
def main(since_ms: Optional[int] = None):
|
| 190 |
+
logging.info(
|
| 191 |
+
"Starting HubSpot audit sync (occurredAfter_ms=%s, occurredBefore=SKIPPED).", since_ms)
|
| 192 |
+
|
| 193 |
+
if since_ms is None and BOOTSTRAP_SINCE_MS_ENV:
|
| 194 |
+
try:
|
| 195 |
+
since_ms = int(BOOTSTRAP_SINCE_MS_ENV)
|
| 196 |
+
except ValueError:
|
| 197 |
+
raise RuntimeError(
|
| 198 |
+
"HUBSPOT_BILLING_SINCE_MS must be an integer (ms) if set.")
|
| 199 |
+
|
| 200 |
+
if since_ms is None:
|
| 201 |
+
# Default: today@00:00:00Z for first run
|
| 202 |
+
today0 = floor_to_utc_midnight(
|
| 203 |
+
datetime.datetime.now(datetime.timezone.utc))
|
| 204 |
+
since_ms = to_epoch_ms(today0)
|
| 205 |
+
|
| 206 |
+
print(f"Fetching HubSpot audit logs occurredAfter > {since_ms} ...")
|
| 207 |
+
streams = fetch_streams(occurred_after_ms=since_ms)
|
| 208 |
+
login_idx, security_idx = build_indices(
|
| 209 |
+
streams["login"], streams["security"])
|
| 210 |
+
rows = normalize_and_enrich(streams["audit"], login_idx, security_idx)
|
| 211 |
+
|
| 212 |
+
print("Upserting into Supabase...")
|
| 213 |
+
upsert(rows)
|
| 214 |
+
print(f"Synced {len(rows)} audit rows into '{AUDITLOG_TABLE}'.")
|
| 215 |
+
print("Audit logs sync complete.")
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def _parse_cli_arg_to_ms(arg: str) -> int:
|
| 219 |
+
"""
|
| 220 |
+
Accept:
|
| 221 |
+
- integer epoch ms
|
| 222 |
+
- ISO-8601 (Z or offset)
|
| 223 |
+
- YYYY-MM-DD (floors to 00:00Z)
|
| 224 |
+
"""
|
| 225 |
+
# epoch ms or seconds
|
| 226 |
+
if re.fullmatch(r"\d{10,13}", arg):
|
| 227 |
+
v = int(arg)
|
| 228 |
+
if v < 10_000_000_000_000: # seconds -> ms
|
| 229 |
+
v *= 1000
|
| 230 |
+
return v
|
| 231 |
+
|
| 232 |
+
# YYYY-MM-DD
|
| 233 |
+
if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
|
| 234 |
+
d = datetime.datetime.strptime(
|
| 235 |
+
arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
|
| 236 |
+
return to_epoch_ms(floor_to_utc_midnight(d))
|
| 237 |
+
|
| 238 |
+
# ISO-8601
|
| 239 |
+
return to_epoch_ms(arg)
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
if __name__ == "__main__":
|
| 243 |
+
import sys
|
| 244 |
+
if len(sys.argv) > 1:
|
| 245 |
+
try:
|
| 246 |
+
main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
|
| 247 |
+
except Exception as e:
|
| 248 |
+
print(
|
| 249 |
+
f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
|
| 250 |
+
sys.exit(1)
|
| 251 |
+
else:
|
| 252 |
+
main()
|
python/hubspot_billing.py
ADDED
|
@@ -0,0 +1,454 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HubSpot Billing → Supabase (incremental since a millisecond cursor)
|
| 3 |
+
|
| 4 |
+
Usage from orchestrator:
|
| 5 |
+
import load_hubspot_billing
|
| 6 |
+
load_hubspot_billing.main(since_ms=<int milliseconds since epoch UTC>)
|
| 7 |
+
|
| 8 |
+
Direct CLI:
|
| 9 |
+
# epoch ms
|
| 10 |
+
python load_hubspot_billing.py 1754025600000
|
| 11 |
+
# ISO-8601
|
| 12 |
+
python load_hubspot_billing.py 2025-08-01T09:30:00Z
|
| 13 |
+
# Back-compat date (floors to 00:00Z)
|
| 14 |
+
python load_hubspot_billing.py 2025-08-01
|
| 15 |
+
"""
|
| 16 |
+
import os
|
| 17 |
+
import re
|
| 18 |
+
import time
|
| 19 |
+
import logging
|
| 20 |
+
import datetime
|
| 21 |
+
from typing import List, Dict, Optional, Tuple, Union
|
| 22 |
+
|
| 23 |
+
import httpx
|
| 24 |
+
import hubspot
|
| 25 |
+
from dotenv import load_dotenv
|
| 26 |
+
from supabase import create_client
|
| 27 |
+
|
| 28 |
+
from hubspot_utils import (
|
| 29 |
+
parse_ts, try_parse_int, deduplicate_by_key,
|
| 30 |
+
)
|
| 31 |
+
from supabase_utils import (
|
| 32 |
+
batched_insert, update_sync_metadata,
|
| 33 |
+
)
|
| 34 |
+
|
| 35 |
+
# -----------------------------------------------------------------------------
|
| 36 |
+
# Logging
|
| 37 |
+
# -----------------------------------------------------------------------------
|
| 38 |
+
logging.basicConfig(
|
| 39 |
+
filename=f"logs/hubspot_billing_pipeline_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
|
| 40 |
+
filemode="a",
|
| 41 |
+
level=logging.INFO,
|
| 42 |
+
format="%(asctime)s [%(levelname)s] %(message)s",
|
| 43 |
+
)
|
| 44 |
+
|
| 45 |
+
# -----------------------------------------------------------------------------
|
| 46 |
+
# Environment
|
| 47 |
+
# -----------------------------------------------------------------------------
|
| 48 |
+
load_dotenv()
|
| 49 |
+
HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
|
| 50 |
+
SUPABASE_URL = os.getenv("SUPABASE_URL")
|
| 51 |
+
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
|
| 52 |
+
# Optional bootstrap cursor if orchestrator doesn't provide one
|
| 53 |
+
BOOTSTRAP_SINCE_MS_ENV = os.getenv("HUBSPOT_BILLING_SINCE_MS")
|
| 54 |
+
|
| 55 |
+
if not HUBSPOT_TOKEN:
|
| 56 |
+
raise RuntimeError("HUBSPOT_TOKEN is not set")
|
| 57 |
+
if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
|
| 58 |
+
raise RuntimeError("Supabase env vars are not set")
|
| 59 |
+
|
| 60 |
+
hubspot_client = hubspot.Client.create(access_token=HUBSPOT_TOKEN)
|
| 61 |
+
supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
|
| 62 |
+
|
| 63 |
+
# -----------------------------------------------------------------------------
|
| 64 |
+
# Config
|
| 65 |
+
# -----------------------------------------------------------------------------
|
| 66 |
+
|
| 67 |
+
# Custom object type ID for Billing Services
|
| 68 |
+
BILLING_OBJECT_TYPE = "2-38060359"
|
| 69 |
+
|
| 70 |
+
BILLING_PROPERTIES = [
|
| 71 |
+
"cli",
|
| 72 |
+
"account_code",
|
| 73 |
+
"service_name",
|
| 74 |
+
"status",
|
| 75 |
+
"type",
|
| 76 |
+
"tariff_name",
|
| 77 |
+
"contract_renewal_date",
|
| 78 |
+
"supplier_network",
|
| 79 |
+
"network_name",
|
| 80 |
+
"product_type_name",
|
| 81 |
+
"hs_created_by_user_id",
|
| 82 |
+
"hs_createdate",
|
| 83 |
+
# "hs_lastmodifieddate", # optionally include if needed later
|
| 84 |
+
]
|
| 85 |
+
|
| 86 |
+
PROPERTY_RENAME = {
|
| 87 |
+
"hs_created_by_user_id": "hubspot_created_by",
|
| 88 |
+
"hs_createdate": "hubspot_created_at",
|
| 89 |
+
}
|
| 90 |
+
|
| 91 |
+
# -----------------------------------------------------------------------------
|
| 92 |
+
# Time helpers (mirroring hubspot_tickets)
|
| 93 |
+
# -----------------------------------------------------------------------------
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
|
| 97 |
+
if dt.tzinfo is None:
|
| 98 |
+
dt = dt.replace(tzinfo=datetime.timezone.utc)
|
| 99 |
+
return dt.astimezone(datetime.timezone.utc)
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
|
| 103 |
+
dt = _ensure_utc(dt)
|
| 104 |
+
return dt.replace(hour=0, minute=0, second=0, microsecond=0)
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
|
| 108 |
+
if value.endswith("Z"):
|
| 109 |
+
value = value[:-1] + "+00:00"
|
| 110 |
+
dt = datetime.datetime.fromisoformat(value)
|
| 111 |
+
return _ensure_utc(dt)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
|
| 115 |
+
if isinstance(dt_or_str, str):
|
| 116 |
+
dt = _parse_iso_like_to_dt(dt_or_str)
|
| 117 |
+
elif isinstance(dt_or_str, datetime.datetime):
|
| 118 |
+
dt = _ensure_utc(dt_or_str)
|
| 119 |
+
else:
|
| 120 |
+
raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
|
| 121 |
+
return int(dt.timestamp() * 1000)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
def parse_any_ts_ms(value: Optional[Union[str, int, float]]) -> Optional[int]:
|
| 125 |
+
"""
|
| 126 |
+
Accepts:
|
| 127 |
+
- ms-epoch as str/int/float
|
| 128 |
+
- seconds-epoch as str/int/float (auto *1000)
|
| 129 |
+
- ISO-8601 string
|
| 130 |
+
Returns ms since epoch or None.
|
| 131 |
+
"""
|
| 132 |
+
if value is None:
|
| 133 |
+
return None
|
| 134 |
+
|
| 135 |
+
# numeric-ish string or number
|
| 136 |
+
try:
|
| 137 |
+
v = int(float(value)) # handles numeric strings
|
| 138 |
+
# heuristics: treat 10-digit as seconds
|
| 139 |
+
if v < 10_000_000_000_000:
|
| 140 |
+
v *= 1000
|
| 141 |
+
return v
|
| 142 |
+
except Exception:
|
| 143 |
+
pass
|
| 144 |
+
|
| 145 |
+
# ISO-8601
|
| 146 |
+
try:
|
| 147 |
+
return to_epoch_ms(str(value))
|
| 148 |
+
except Exception:
|
| 149 |
+
return None
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
# -----------------------------------------------------------------------------
|
| 153 |
+
# Search (by timestamp cursor)
|
| 154 |
+
# -----------------------------------------------------------------------------
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def _search_billing_ids_from(since_ms: int, prop: str) -> List[str]:
|
| 158 |
+
"""
|
| 159 |
+
Search billing custom object IDs where {prop} > since_ms.
|
| 160 |
+
Sort ascending so we can advance cursor monotonically.
|
| 161 |
+
"""
|
| 162 |
+
url = f"https://api.hubapi.com/crm/v3/objects/{BILLING_OBJECT_TYPE}/search"
|
| 163 |
+
headers = {
|
| 164 |
+
"Authorization": f"Bearer {HUBSPOT_TOKEN}",
|
| 165 |
+
"Content-Type": "application/json",
|
| 166 |
+
"Accept": "application/json",
|
| 167 |
+
}
|
| 168 |
+
payload = {
|
| 169 |
+
"filterGroups": [{
|
| 170 |
+
"filters": [
|
| 171 |
+
{"propertyName": prop, "operator": "GT", "value": str(since_ms)},
|
| 172 |
+
]
|
| 173 |
+
}],
|
| 174 |
+
"limit": 100,
|
| 175 |
+
"sorts": [{"propertyName": prop, "direction": "ASCENDING"}],
|
| 176 |
+
"properties": ["hs_object_id"],
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
ids: List[str] = []
|
| 180 |
+
after: Optional[str] = None
|
| 181 |
+
with httpx.Client(timeout=30.0) as client:
|
| 182 |
+
while True:
|
| 183 |
+
body = dict(payload)
|
| 184 |
+
if after:
|
| 185 |
+
body["after"] = after
|
| 186 |
+
|
| 187 |
+
resp = client.post(url, headers=headers, json=body)
|
| 188 |
+
if resp.status_code >= 400:
|
| 189 |
+
try:
|
| 190 |
+
logging.error(
|
| 191 |
+
"Billing search error for prop '%s': %s", prop, resp.json())
|
| 192 |
+
except Exception:
|
| 193 |
+
logging.error(
|
| 194 |
+
"Billing search error for prop '%s': %s", prop, resp.text)
|
| 195 |
+
resp.raise_for_status()
|
| 196 |
+
|
| 197 |
+
data = resp.json()
|
| 198 |
+
ids.extend([obj["id"] for obj in data.get("results", []) or []])
|
| 199 |
+
|
| 200 |
+
after = (data.get("paging") or {}).get("next", {}).get("after")
|
| 201 |
+
if not after:
|
| 202 |
+
break
|
| 203 |
+
time.sleep(0.1)
|
| 204 |
+
|
| 205 |
+
return ids
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
def search_billing_ids_after_ms(since_ms: int) -> Tuple[List[str], str]:
    """Search billing IDs whose cursor property is strictly greater than
    ``since_ms``, sorted ascending, and return ``(ids, property_used)``.

    Only ``hs_createdate`` is currently attempted; the loop structure keeps
    room for additional fallback properties.
    """
    candidate_props = ["hs_createdate"]
    last_failure: Optional[httpx.HTTPStatusError] = None

    for candidate in candidate_props:
        try:
            found = _search_billing_ids_from(since_ms, candidate)
        except httpx.HTTPStatusError as exc:
            last_failure = exc
            continue
        logging.info(
            "Billing search with '%s' returned %d IDs.", candidate, len(found))
        return found, candidate

    if last_failure:
        raise last_failure
    return [], "hs_createdate"
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
# -----------------------------------------------------------------------------
|
| 234 |
+
# Read-by-ID (with associations)
|
| 235 |
+
# -----------------------------------------------------------------------------
|
| 236 |
+
|
| 237 |
+
|
| 238 |
+
def read_billing_by_ids(
    billing_ids: List[str],
    cursor_prop: str,
) -> Tuple[List[Dict], List[Dict], List[Dict], Optional[int]]:
    """Fetch billing services one-by-one with company/deal associations.

    Returns ``(services, company_links, deal_links, max_ts_ms)`` where
    ``max_ts_ms`` is the largest ``cursor_prop`` value observed (epoch ms),
    used by the caller to advance the sync cursor.
    """
    if not billing_ids:
        return [], [], [], None

    services: List[Dict] = []
    company_links: List[Dict] = []
    deal_links: List[Dict] = []
    max_ts_ms: Optional[int] = None

    for position, billing_id in enumerate(billing_ids, start=1):
        try:
            record = hubspot_client.crm.objects.basic_api.get_by_id(
                object_type=BILLING_OBJECT_TYPE,
                object_id=billing_id,
                properties=BILLING_PROPERTIES,
                associations=["companies", "deals"],
                archived=False,
            )

            props = record.properties or {}

            # Advance the running max cursor timestamp if this record is newer.
            ts_ms = parse_any_ts_ms(props.get(cursor_prop))
            if ts_ms is not None and (max_ts_ms is None or ts_ms > max_ts_ms):
                max_ts_ms = ts_ms

            services.append({
                "billing_id": str(record.id),
                "cli": props.get("cli"),
                "account_code": props.get("account_code"),
                "service_name": props.get("service_name"),
                "status": props.get("status"),
                "type": props.get("type"),
                "contract_renewal_date": parse_ts(props.get("contract_renewal_date")),
                "supplier_network": props.get("supplier_network"),
                "network_name": props.get("network_name"),
                "product_type_name": props.get("product_type_name"),
                "hubspot_created_by": try_parse_int(props.get("hs_created_by_user_id")),
                "hubspot_created_at": parse_ts(props.get("hs_createdate")),
            })

            # Collect association link rows for each related object type.
            associations = record.associations or {}
            for assoc_name, sink, fk_column in (
                ("companies", company_links, "company_id"),
                ("deals", deal_links, "deal_id"),
            ):
                assoc_page = associations.get(assoc_name)
                if assoc_page and getattr(assoc_page, "results", None):
                    for assoc in assoc_page.results:
                        if assoc.id:
                            sink.append({
                                "billing_id": str(record.id),
                                fk_column: str(assoc.id),
                            })

            if position % 200 == 0:
                logging.info("Read %d billing services...", position)

            time.sleep(0.05)  # throttle individual API reads

        except httpx.HTTPStatusError as exc:
            logging.error("HTTP error reading billing %s: %s", billing_id, exc)
        except Exception as exc:
            logging.error("Error reading billing %s: %s", billing_id, exc)

    return services, company_links, deal_links, max_ts_ms
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
# -----------------------------------------------------------------------------
|
| 325 |
+
# Upsert
|
| 326 |
+
# -----------------------------------------------------------------------------
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
def upsert_billing(
    services: List[Dict],
    company_links: List[Dict],
    deal_links: List[Dict],
) -> None:
    """Dedupe and upsert billing services plus their association link rows.

    Each non-empty input list is deduplicated on its natural key and pushed
    to Supabase in batches of 1000 with matching on_conflict columns.
    """
    upsert_jobs = (
        (services, "hubspot_billing_services",
         "billing_id", ["billing_id"], "billing services"),
        (company_links, "hubspot_billing_companies",
         ("billing_id", "company_id"), ["billing_id", "company_id"],
         "billing-company associations"),
        (deal_links, "hubspot_billing_deals",
         ("billing_id", "deal_id"), ["billing_id", "deal_id"],
         "billing-deal associations"),
    )

    for rows, table_name, dedupe_key, conflict_cols, label in upsert_jobs:
        if not rows:
            continue
        unique_rows = deduplicate_by_key(rows, key=dedupe_key)
        batched_insert(
            supabase_client, table_name, unique_rows,
            batch_size=1000, on_conflict=conflict_cols,
        )
        print(f"Upserted {len(unique_rows)} {label}.")
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
# -----------------------------------------------------------------------------
|
| 362 |
+
# Main (timestamp cursor)
|
| 363 |
+
# -----------------------------------------------------------------------------
|
| 364 |
+
|
| 365 |
+
|
| 366 |
+
def main(since_ms: Optional[int] = None):
    """Run one incremental billing sync pass.

    1) Resolve the starting cursor (argument, env bootstrap, or today@00:00Z).
    2) Search billing IDs whose cursor property > since_ms.
    3) Read full records with associations, tracking the max timestamp seen.
    4) Upsert into Supabase and record the sync time.

    NOTE(review): the computed ``new_cursor_ms`` is only printed;
    update_sync_metadata is called with the wall-clock ISO time, so the
    millisecond cursor value itself is not persisted here — confirm the
    orchestrator derives the next since_ms some other way.
    """
    # Resolve the cursor: explicit argument wins, then env bootstrap.
    if since_ms is None and BOOTSTRAP_SINCE_MS_ENV:
        try:
            since_ms = int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            raise RuntimeError(
                "HUBSPOT_BILLING_SINCE_MS must be an integer (ms) if set.")

    if since_ms is None:
        # First run with no configuration: default to today at 00:00:00Z.
        since_ms = to_epoch_ms(floor_to_utc_midnight(
            datetime.datetime.now(datetime.timezone.utc)))

    print(f"Searching billing services with timestamp > {since_ms} ...")
    ids, cursor_prop = search_billing_ids_after_ms(since_ms)
    print(f"Search property: {cursor_prop}. Found {len(ids)} billing IDs.")

    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()

    if not ids:
        print("No billing services beyond the cursor. Updating sync metadata and exiting.")
        update_sync_metadata(supabase_client, "hubspot_billing_services", now_iso)
        return

    print("Reading billing services (with associations)...")
    services, company_links, deal_links, max_ts_ms = read_billing_by_ids(
        ids, cursor_prop)

    print("Upserting into Supabase...")
    upsert_billing(services, company_links, deal_links)

    # Fall back to the input cursor when no record yielded a parseable timestamp.
    new_cursor_ms = since_ms if max_ts_ms is None else max_ts_ms

    update_sync_metadata(supabase_client, "hubspot_billing_services", now_iso)

    print(
        f"Billing sync complete. Advanced cursor to {new_cursor_ms} using prop '{cursor_prop}'.")
|
| 413 |
+
|
| 414 |
+
|
| 415 |
+
# -----------------------------------------------------------------------------
|
| 416 |
+
# CLI
|
| 417 |
+
# -----------------------------------------------------------------------------
|
| 418 |
+
|
| 419 |
+
|
| 420 |
+
def _parse_cli_arg_to_ms(arg: str) -> int:
|
| 421 |
+
"""
|
| 422 |
+
Accept:
|
| 423 |
+
- integer epoch ms
|
| 424 |
+
- ISO-8601 (Z or offset)
|
| 425 |
+
- YYYY-MM-DD (floors to 00:00Z)
|
| 426 |
+
"""
|
| 427 |
+
# epoch ms or seconds
|
| 428 |
+
if re.fullmatch(r"\d{10,13}", arg):
|
| 429 |
+
v = int(arg)
|
| 430 |
+
if v < 10_000_000_000_000: # seconds -> ms
|
| 431 |
+
v *= 1000
|
| 432 |
+
return v
|
| 433 |
+
|
| 434 |
+
# YYYY-MM-DD
|
| 435 |
+
if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
|
| 436 |
+
d = datetime.datetime.strptime(
|
| 437 |
+
arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
|
| 438 |
+
return to_epoch_ms(floor_to_utc_midnight(d))
|
| 439 |
+
|
| 440 |
+
# ISO-8601
|
| 441 |
+
return to_epoch_ms(arg)
|
| 442 |
+
|
| 443 |
+
|
| 444 |
+
if __name__ == "__main__":
    import sys

    if len(sys.argv) <= 1:
        main()
    else:
        try:
            main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
        except Exception as e:
            print(
                f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
            raise SystemExit(1)
|
python/hubspot_companies.py
ADDED
|
@@ -0,0 +1,475 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HubSpot Companies → Supabase (incremental since a millisecond cursor)
|
| 3 |
+
|
| 4 |
+
Usage from orchestrator:
|
| 5 |
+
import load_hubspot_companies
|
| 6 |
+
load_hubspot_companies.main(since_ms=<int milliseconds since epoch UTC>)
|
| 7 |
+
|
| 8 |
+
Direct CLI:
|
| 9 |
+
# epoch ms
|
| 10 |
+
python load_hubspot_companies.py 1754025600000
|
| 11 |
+
# ISO-8601
|
| 12 |
+
python load_hubspot_companies.py 2025-08-01T09:30:00Z
|
| 13 |
+
# Back-compat date (floors to 00:00Z)
|
| 14 |
+
python load_hubspot_companies.py 2025-08-01
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import os
|
| 18 |
+
import re
|
| 19 |
+
import time
|
| 20 |
+
import logging
|
| 21 |
+
import datetime
|
| 22 |
+
from typing import List, Dict, Optional, Tuple, Union
|
| 23 |
+
|
| 24 |
+
import httpx
|
| 25 |
+
import hubspot
|
| 26 |
+
from dotenv import load_dotenv
|
| 27 |
+
from supabase import create_client
|
| 28 |
+
from hubspot.crm.companies import ApiException as CompaniesApiException
|
| 29 |
+
|
| 30 |
+
from hubspot_utils import (
|
| 31 |
+
try_parse_int, parse_ts, get_property_label_mapping,
|
| 32 |
+
)
|
| 33 |
+
from supabase_utils import (
|
| 34 |
+
update_sync_metadata, enrich_supabase_row, upload_raw_json_to_supabase,
|
| 35 |
+
batched_insert, fetch_supabase_table,
|
| 36 |
+
)
|
| 37 |
+
|
| 38 |
+
# -----------------------------------------------------------------------------
|
| 39 |
+
# Logging
|
| 40 |
+
# -----------------------------------------------------------------------------
|
| 41 |
+
logging.basicConfig(
|
| 42 |
+
filename=f"logs/hubspot_company_pipeline_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
|
| 43 |
+
filemode="a",
|
| 44 |
+
level=logging.INFO,
|
| 45 |
+
format="%(asctime)s [%(levelname)s] %(message)s",
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
# -----------------------------------------------------------------------------
|
| 49 |
+
# Environment
|
| 50 |
+
# -----------------------------------------------------------------------------
|
| 51 |
+
load_dotenv()
|
| 52 |
+
HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
|
| 53 |
+
SUPABASE_URL = os.getenv("SUPABASE_URL")
|
| 54 |
+
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
|
| 55 |
+
# Optional bootstrap cursor if orchestrator doesn't provide one
|
| 56 |
+
BOOTSTRAP_SINCE_MS_ENV = os.getenv("HUBSPOT_COMPANIES_SINCE_MS")
|
| 57 |
+
|
| 58 |
+
if not HUBSPOT_TOKEN:
|
| 59 |
+
raise RuntimeError("HUBSPOT_TOKEN is not set")
|
| 60 |
+
if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
|
| 61 |
+
raise RuntimeError("Supabase env vars are not set")
|
| 62 |
+
|
| 63 |
+
hubspot_client = hubspot.Client.create(access_token=HUBSPOT_TOKEN)
|
| 64 |
+
supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
|
| 65 |
+
|
| 66 |
+
# -----------------------------------------------------------------------------
|
| 67 |
+
# Config
|
| 68 |
+
# -----------------------------------------------------------------------------
|
| 69 |
+
COMPANY_PROPERTIES = [
|
| 70 |
+
"name",
|
| 71 |
+
"city",
|
| 72 |
+
"company_email",
|
| 73 |
+
"address",
|
| 74 |
+
"address2",
|
| 75 |
+
"domain",
|
| 76 |
+
"number_of_active__cli_s",
|
| 77 |
+
"number_of__mobile___cloned_",
|
| 78 |
+
"createdate",
|
| 79 |
+
"hs_lastmodifieddate",
|
| 80 |
+
"lastmodifieddate",
|
| 81 |
+
"review_date_updated_by___clone",
|
| 82 |
+
"industry",
|
| 83 |
+
"macro_industry_grouping",
|
| 84 |
+
"closest_review_date___clone",
|
| 85 |
+
]
|
| 86 |
+
|
| 87 |
+
# -----------------------------------------------------------------------------
|
| 88 |
+
# Time helpers
|
| 89 |
+
# -----------------------------------------------------------------------------
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
|
| 93 |
+
if dt.tzinfo is None:
|
| 94 |
+
dt = dt.replace(tzinfo=datetime.timezone.utc)
|
| 95 |
+
return dt.astimezone(datetime.timezone.utc)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Truncate *dt* to 00:00:00.000 UTC of its (UTC) calendar day."""
    utc_dt = _ensure_utc(dt)
    return utc_dt.replace(hour=0, minute=0, second=0, microsecond=0)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """Parse an ISO-8601 string into a UTC datetime.

    A trailing 'Z' is rewritten to '+00:00' because
    datetime.fromisoformat (pre-3.11) rejects the 'Z' suffix.
    """
    text = value
    if isinstance(text, str) and text.endswith("Z"):
        text = text[:-1] + "+00:00"
    return _ensure_utc(datetime.datetime.fromisoformat(text))
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
    """Convert an ISO-8601 string or datetime into epoch milliseconds (UTC).

    Raises:
        TypeError: for any input that is neither str nor datetime.
    """
    if isinstance(dt_or_str, str):
        moment = _parse_iso_like_to_dt(dt_or_str)
    elif isinstance(dt_or_str, datetime.datetime):
        moment = _ensure_utc(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    return int(moment.timestamp() * 1000)
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def parse_any_ts_ms(value: Optional[Union[str, int, float]]) -> Optional[int]:
|
| 121 |
+
if value is None:
|
| 122 |
+
return None
|
| 123 |
+
try:
|
| 124 |
+
v = int(str(value))
|
| 125 |
+
if v < 10_000_000_000_000: # seconds → ms
|
| 126 |
+
v *= 1000
|
| 127 |
+
return v
|
| 128 |
+
except ValueError:
|
| 129 |
+
pass
|
| 130 |
+
try:
|
| 131 |
+
return to_epoch_ms(str(value))
|
| 132 |
+
except Exception:
|
| 133 |
+
logging.warning("Could not parse timestamp value=%r", value)
|
| 134 |
+
return None
|
| 135 |
+
|
| 136 |
+
# -----------------------------------------------------------------------------
|
| 137 |
+
# Search IDs (ts > since_ms) with property fallback
|
| 138 |
+
# -----------------------------------------------------------------------------
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def _search_company_ids_from(since_ms: int, prop: str) -> List[str]:
    """Return IDs of companies whose ``prop`` is strictly greater than
    ``since_ms`` (epoch milliseconds).

    Results are requested sorted ascending on ``prop`` so the caller can
    advance its sync cursor monotonically. Pages through the HubSpot CRM
    search endpoint 100 records at a time.
    """
    search_url = "https://api.hubapi.com/crm/v3/objects/companies/search"
    request_headers = {
        "Authorization": f"Bearer {HUBSPOT_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    base_payload = {
        "filterGroups": [{
            "filters": [
                {"propertyName": prop, "operator": "GT", "value": str(since_ms)},
            ]
        }],
        "limit": 100,
        "sorts": [{"propertyName": prop, "direction": "ASCENDING"}],
    }

    collected: List[str] = []
    page_cursor: Optional[str] = None
    with httpx.Client(timeout=30.0) as client:
        while True:
            request_body = dict(base_payload)
            if page_cursor:
                request_body["after"] = page_cursor

            resp = client.post(search_url, headers=request_headers, json=request_body)
            if resp.status_code >= 400:
                # Prefer the structured error body; fall back to raw text.
                try:
                    logging.error(
                        "Company search error for prop '%s': %s", prop, resp.json())
                except Exception:
                    logging.error(
                        "Company search error for prop '%s': %s", prop, resp.text)
                resp.raise_for_status()

            data = resp.json()
            collected.extend(obj["id"] for obj in data.get("results", []) or [])

            page_cursor = (data.get("paging") or {}).get("next", {}).get("after")
            if not page_cursor:
                break
            time.sleep(0.1)  # gentle pacing between result pages

    return collected
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def search_company_ids_after_ms(since_ms: int) -> Tuple[List[str], str]:
    """Search company IDs whose cursor property is strictly greater than
    ``since_ms``, sorted ascending; return ``(ids, property_used)``.

    Only ``createdate`` is currently attempted. The loop structure keeps
    room for fallback properties (e.g. hs_lastmodifieddate /
    lastmodifieddate) should they be re-enabled.

    Fix: the docstring previously claimed three fallback properties while
    the code tried only ``createdate``, and the (unreachable) default
    returned "hs_lastmodifieddate"; both are now consistent with the
    property actually searched.
    """
    props_to_try = ["createdate"]
    last_err: Optional[httpx.HTTPStatusError] = None

    for prop in props_to_try:
        try:
            ids = _search_company_ids_from(since_ms, prop)
        except httpx.HTTPStatusError as e:
            last_err = e
            continue
        logging.info(
            "Company search with '%s' returned %d IDs.", prop, len(ids))
        return ids, prop

    if last_err:
        raise last_err
    # Unreachable while props_to_try is non-empty; keep the default aligned
    # with the property actually searched.
    return [], "createdate"
|
| 215 |
+
|
| 216 |
+
# -----------------------------------------------------------------------------
|
| 217 |
+
# Read-by-ID (with associations) → enrich & track max cursor ts
|
| 218 |
+
# -----------------------------------------------------------------------------
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
def _enrich_company_data_from_record(
    record,
    industry_map: Dict[str, str],
    macro_industry_map: Dict[str, str],
) -> Dict:
    """Flatten a HubSpot company record into a plain dict.

    Copies the configured COMPANY_PROPERTIES, counts distinct associated
    contacts/deals, and replaces industry option codes with their display
    labels (codes with no known label become None).
    """
    props = record.properties or {}
    company_data: Dict[str, Optional[str]] = {"id": record.id}
    company_data.update((name, props.get(name)) for name in COMPANY_PROPERTIES)

    def _distinct_assoc_count(assoc_name: str) -> int:
        # Count unique association IDs, tolerating missing pages/results.
        associations = getattr(record, "associations", None)
        if not associations:
            return 0
        page = associations.get(assoc_name)
        if not (page and getattr(page, "results", None)):
            return 0
        return len({a.id for a in page.results if getattr(a, "id", None)})

    company_data["number_of_associated_contacts"] = _distinct_assoc_count("contacts")
    company_data["number_of_associated_deals"] = _distinct_assoc_count("deals")

    # Map option codes to labels; .get() yields None for unknown codes,
    # which matches the original "if code in map else None" behavior.
    company_data["industry"] = industry_map.get(company_data.get("industry"))
    company_data["macro_industry_grouping"] = macro_industry_map.get(
        company_data.get("macro_industry_grouping"))

    return company_data
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
def read_companies_by_ids(company_ids: List[str], cursor_prop: str) -> Tuple[List[Dict], Optional[int]]:
    """Fetch companies one-by-one with contact/deal associations.

    Returns ``(companies, max_ts_ms)`` where ``max_ts_ms`` is the largest
    ``cursor_prop`` value observed (epoch ms), used by the caller to
    advance the sync cursor.
    """
    if not company_ids:
        return [], None

    # Option-label maps are fetched once up front; failures degrade to
    # empty maps rather than aborting the whole read.
    try:
        industry_map = get_property_label_mapping(
            hubspot_client, "companies", "industry")
    except Exception as exc:
        logging.warning("Failed to fetch industry map: %s", exc)
        industry_map = {}

    try:
        macro_industry_map = get_property_label_mapping(
            hubspot_client, "companies", "macro_industry_grouping")
    except Exception as exc:
        logging.warning("Failed to fetch macro_industry_grouping map: %s", exc)
        macro_industry_map = {}

    companies: List[Dict] = []
    max_ts_ms: Optional[int] = None

    for position, company_id in enumerate(company_ids, start=1):
        try:
            record = hubspot_client.crm.companies.basic_api.get_by_id(
                company_id=company_id,
                properties=COMPANY_PROPERTIES,
                associations=["contacts", "deals"],
                archived=False,
            )

            # Advance the running max cursor timestamp if this record is newer.
            ts_ms = parse_any_ts_ms((record.properties or {}).get(cursor_prop))
            if ts_ms is not None and (max_ts_ms is None or ts_ms > max_ts_ms):
                max_ts_ms = ts_ms

            companies.append(_enrich_company_data_from_record(
                record, industry_map, macro_industry_map))

            if position % 200 == 0:
                logging.info("Read %d companies...", position)

            time.sleep(0.05)  # throttle individual API reads

        except httpx.HTTPStatusError as exc:
            logging.error("HTTP error reading company %s: %s", company_id, exc)
        except (CompaniesApiException, httpx.HTTPError) as exc:
            logging.error("Error reading company %s: %s", company_id, exc)

    return companies, max_ts_ms
|
| 309 |
+
|
| 310 |
+
# -----------------------------------------------------------------------------
|
| 311 |
+
# Map → Supabase rows and diff
|
| 312 |
+
# -----------------------------------------------------------------------------
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
def map_company_data_for_db(companies: List[Dict]) -> List[Dict]:
    """Translate enriched HubSpot company dicts into Supabase row dicts.

    Renames HubSpot property names to table column names, parses timestamp
    and integer fields, and passes each row through enrich_supabase_row.
    """
    rows: List[Dict] = []
    for company in companies:
        row = {
            "company_id": try_parse_int(company["id"]),
            "company_name": company.get("name"),
            "company_email": company.get("company_email"),
            "city": company.get("city"),
            "domain": company.get("domain"),
            "street_address": company.get("address"),
            "street_address2": company.get("address2"),
            "hubspot_create_date": parse_ts(company.get("createdate")),
            "hubspot_modified_date": parse_ts(
                company.get("hs_lastmodifieddate") or company.get("lastmodifieddate")),
            "review_date_updated_by": company.get("review_date_updated_by___clone") or None,
            "number_of_active_clis": try_parse_int(company.get("number_of_active__cli_s")),
            "number_of_clis": try_parse_int(company.get("number_of__mobile___cloned_")),
            "number_of_associated_contacts": company.get("number_of_associated_contacts", 0),
            "number_of_associated_deals": company.get("number_of_associated_deals", 0),
            "industry": company.get("industry"),
            "macro_industry_grouping": company.get("macro_industry_grouping"),
            "closest_review_date": parse_ts(company.get("closest_review_date___clone")),
        }
        rows.append(enrich_supabase_row(row))
    return rows
|
| 339 |
+
|
| 340 |
+
|
| 341 |
+
def companies_are_different(new_row: Dict, old_row: Dict) -> bool:
    """Return True when any tracked field differs between the two rows.

    Values are compared via str() so that e.g. 5 and "5" count as equal,
    matching how values round-trip through the database.
    """
    tracked_fields = (
        "company_name", "company_email", "city", "domain", "street_address",
        "street_address2", "hubspot_modified_date", "review_date_updated_by",
        "number_of_active_clis", "number_of_clis",
        "number_of_associated_contacts", "number_of_associated_deals",
        "industry", "macro_industry_grouping", "closest_review_date",
    )
    return any(
        str(new_row.get(field)) != str(old_row.get(field))
        for field in tracked_fields
    )
|
| 353 |
+
|
| 354 |
+
# -----------------------------------------------------------------------------
|
| 355 |
+
# Upsert
|
| 356 |
+
# -----------------------------------------------------------------------------
|
| 357 |
+
|
| 358 |
+
|
| 359 |
+
def upsert_companies(companies: List[Dict]) -> None:
    """Diff mapped company rows against Supabase and upsert only changes.

    Rows without a company_id are skipped; unchanged rows (per
    companies_are_different) are filtered out before the batched insert.
    """
    if not companies:
        print("No companies to upsert.")
        return

    existing = fetch_supabase_table(
        supabase_client, "hubspot_companies", "company_id")
    mapped = map_company_data_for_db(companies)

    rows_to_upsert: List[Dict] = []
    for row in mapped:
        company_id = row.get("company_id")
        if not company_id:
            continue  # cannot key the upsert without an ID
        previous = existing.get(str(company_id))
        if not previous or companies_are_different(row, previous):
            rows_to_upsert.append(row)

    print(f"{len(rows_to_upsert)} companies to insert/update (out of {len(companies)} read).")

    if rows_to_upsert:
        # upload_raw_json_to_supabase(supabase_client, rows_to_upsert, object_type="companies")
        batched_insert(supabase_client, "hubspot_companies",
                       rows_to_upsert, batch_size=1000)
|
| 383 |
+
|
| 384 |
+
# -----------------------------------------------------------------------------
|
| 385 |
+
# Main (timestamp cursor)
|
| 386 |
+
# -----------------------------------------------------------------------------
|
| 387 |
+
|
| 388 |
+
|
| 389 |
+
def main(since_ms: Optional[int] = None):
    """Run one incremental companies sync pass.

    1) Resolve the starting cursor (argument, env bootstrap, or today@00:00Z).
    2) Search company IDs whose cursor property > since_ms.
    3) Read full companies with associations, tracking the max timestamp seen.
    4) Upsert into Supabase and record the sync time.

    NOTE(review): the computed ``new_cursor_ms`` is only printed;
    update_sync_metadata is called with the wall-clock ISO time, so the
    millisecond cursor value itself is not persisted here — confirm the
    orchestrator derives the next since_ms some other way.
    """
    # Resolve the cursor: explicit argument wins, then env bootstrap.
    if since_ms is None and BOOTSTRAP_SINCE_MS_ENV:
        try:
            since_ms = int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            raise RuntimeError(
                "HUBSPOT_COMPANIES_SINCE_MS must be an integer (ms) if set.")

    if since_ms is None:
        # First run with no configuration: default to today at 00:00:00Z.
        since_ms = to_epoch_ms(floor_to_utc_midnight(
            datetime.datetime.now(datetime.timezone.utc)))

    print(f"Searching companies with timestamp > {since_ms} ...")
    ids, cursor_prop = search_company_ids_after_ms(since_ms)
    print(f"Search property: {cursor_prop}. Found {len(ids)} company IDs.")

    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()

    if not ids:
        print("No companies beyond the cursor. Updating sync metadata and exiting.")
        update_sync_metadata(supabase_client, "companies", now_iso)
        return

    print("Reading companies (with associations)...")
    companies, max_ts_ms = read_companies_by_ids(ids, cursor_prop)

    print("Upserting into Supabase...")
    upsert_companies(companies)

    # Fall back to the input cursor when no record yielded a parseable timestamp.
    new_cursor_ms = since_ms if max_ts_ms is None else max_ts_ms

    update_sync_metadata(supabase_client, "companies", now_iso)

    print(
        f"Companies sync complete. Advanced cursor to {new_cursor_ms} using prop '{cursor_prop}'.")
|
| 435 |
+
|
| 436 |
+
# -----------------------------------------------------------------------------
|
| 437 |
+
# CLI
|
| 438 |
+
# -----------------------------------------------------------------------------
|
| 439 |
+
|
| 440 |
+
|
| 441 |
+
def _parse_cli_arg_to_ms(arg: str) -> int:
|
| 442 |
+
"""
|
| 443 |
+
Accept:
|
| 444 |
+
- integer epoch ms
|
| 445 |
+
- ISO-8601 (Z or offset)
|
| 446 |
+
- YYYY-MM-DD (floors to 00:00Z)
|
| 447 |
+
"""
|
| 448 |
+
# epoch ms or seconds
|
| 449 |
+
if re.fullmatch(r"\d{10,13}", arg):
|
| 450 |
+
v = int(arg)
|
| 451 |
+
if v < 10_000_000_000_000: # seconds -> ms
|
| 452 |
+
v *= 1000
|
| 453 |
+
return v
|
| 454 |
+
|
| 455 |
+
# YYYY-MM-DD
|
| 456 |
+
if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
|
| 457 |
+
d = datetime.datetime.strptime(
|
| 458 |
+
arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
|
| 459 |
+
return to_epoch_ms(floor_to_utc_midnight(d))
|
| 460 |
+
|
| 461 |
+
# ISO-8601
|
| 462 |
+
return to_epoch_ms(arg)
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
if __name__ == "__main__":
    import sys

    if len(sys.argv) <= 1:
        main()
    else:
        try:
            main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
        except Exception as e:
            print(
                f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
            sys.exit(1)
|
python/hubspot_contacts.py
ADDED
|
@@ -0,0 +1,415 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HubSpot Contacts → Supabase (incremental since a millisecond cursor)
|
| 3 |
+
|
| 4 |
+
Usage from orchestrator:
|
| 5 |
+
import load_hubspot_contacts
|
| 6 |
+
load_hubspot_contacts.main(since_ms=<int milliseconds since epoch UTC>)
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import os
|
| 10 |
+
import re
|
| 11 |
+
import time
|
| 12 |
+
import logging
|
| 13 |
+
import datetime
|
| 14 |
+
from typing import List, Dict, Tuple, Optional, Union
|
| 15 |
+
|
| 16 |
+
import httpx
|
| 17 |
+
import hubspot
|
| 18 |
+
from dotenv import load_dotenv
|
| 19 |
+
from supabase import create_client
|
| 20 |
+
from hubspot.crm.contacts import ApiException as ContactsApiException
|
| 21 |
+
|
| 22 |
+
from hubspot_utils import (
|
| 23 |
+
try_parse_int, parse_ts, get_property_label_mapping,
|
| 24 |
+
)
|
| 25 |
+
from supabase_utils import (
|
| 26 |
+
fetch_supabase_table, update_sync_metadata, enrich_supabase_row,
|
| 27 |
+
upload_raw_json_to_supabase, batched_insert,
|
| 28 |
+
)
|
| 29 |
+
|
| 30 |
+
# -----------------------------------------------------------------------------
|
| 31 |
+
# Logging
|
| 32 |
+
# -----------------------------------------------------------------------------
|
| 33 |
+
# Append-mode log file, one file per calendar day (date baked into the
# filename at import time). The repo ships logs/.gitkeep so the logs/
# directory exists when this runs.
logging.basicConfig(
    filename=f"logs/hubspot_contact_pipeline_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
    filemode="a",
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
|
| 39 |
+
|
| 40 |
+
# -----------------------------------------------------------------------------
|
| 41 |
+
# Environment
|
| 42 |
+
# -----------------------------------------------------------------------------
|
| 43 |
+
# Load .env, read credentials, and fail fast at import time when anything
# mandatory is missing.
load_dotenv()
HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
# Optional bootstrap cursor (epoch ms) if orchestrator doesn't provide one
BOOTSTRAP_SINCE_MS_ENV = os.getenv("HUBSPOT_CONTACTS_SINCE_MS")

if not HUBSPOT_TOKEN:
    raise RuntimeError("HUBSPOT_TOKEN is not set")
if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
    raise RuntimeError("Supabase env vars are not set")

# Module-level API clients shared by every function below.
hubspot_client = hubspot.Client.create(access_token=HUBSPOT_TOKEN)
supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
|
| 57 |
+
|
| 58 |
+
# -----------------------------------------------------------------------------
|
| 59 |
+
# Config
|
| 60 |
+
# -----------------------------------------------------------------------------
|
| 61 |
+
# HubSpot contact properties requested on every read. These names feed
# map_contact_data_for_db(), the full-name fallback, and the modified-date
# cursor fallback (lastmodifieddate / hs_lastmodifieddate).
CONTACT_PROPERTIES = [
    "full_name",
    "firstname",
    "lastname",
    "email",
    "phone",
    "job_title_level",
    "createdate",
    "lastmodifieddate",
    "hs_lastmodifieddate",
    "notes_last_updated",
    "associatedcompanyid",
]
|
| 74 |
+
|
| 75 |
+
# -----------------------------------------------------------------------------
|
| 76 |
+
# Time helpers
|
| 77 |
+
# -----------------------------------------------------------------------------
|
| 78 |
+
def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
|
| 79 |
+
if dt.tzinfo is None:
|
| 80 |
+
dt = dt.replace(tzinfo=datetime.timezone.utc)
|
| 81 |
+
return dt.astimezone(datetime.timezone.utc)
|
| 82 |
+
|
| 83 |
+
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Truncate *dt* (normalised to UTC) to 00:00:00.000000 on its UTC day."""
    utc_dt = _ensure_utc(dt)
    return datetime.datetime.combine(
        utc_dt.date(), datetime.time(0, tzinfo=datetime.timezone.utc)
    )
|
| 86 |
+
|
| 87 |
+
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """Parse an ISO-8601 string (trailing 'Z' accepted) into an aware UTC datetime."""
    text = value
    if isinstance(text, str) and text.endswith("Z"):
        text = f"{text[:-1]}+00:00"
    return _ensure_utc(datetime.datetime.fromisoformat(text))
|
| 92 |
+
|
| 93 |
+
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
    """Convert an ISO-8601 string or datetime to milliseconds since the Unix epoch."""
    if isinstance(dt_or_str, datetime.datetime):
        as_dt = _ensure_utc(dt_or_str)
    elif isinstance(dt_or_str, str):
        as_dt = _parse_iso_like_to_dt(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    return int(as_dt.timestamp() * 1000)
|
| 101 |
+
|
| 102 |
+
def parse_any_ts_ms(value: Optional[Union[str, int, float]]) -> Optional[int]:
    """
    Normalise a timestamp to milliseconds since the Unix epoch.

    Accepts epoch milliseconds, epoch seconds, or an ISO-8601 string.
    Returns None when *value* is None or unparseable (logged as a warning).
    """
    if value is None:
        return None
    try:
        v = int(str(value))
        # Heuristic: epoch-seconds stay below 1e12 until year ~33658, and
        # every epoch-ms value from Sep 2001 onward exceeds 1e12. The old
        # bound of 1e13 wrongly classified all modern millisecond
        # timestamps (~1.7e12) as seconds and multiplied them by 1000.
        if v < 1_000_000_000_000:  # seconds → ms
            v *= 1000
        return v
    except ValueError:
        pass
    try:
        return to_epoch_ms(str(value))
    except Exception:
        logging.warning("Could not parse timestamp value=%r", value)
        return None
|
| 120 |
+
|
| 121 |
+
# -----------------------------------------------------------------------------
|
| 122 |
+
# Mapping & helpers
|
| 123 |
+
# -----------------------------------------------------------------------------
|
| 124 |
+
def map_contact_data_for_db(contacts: List[Dict]) -> List[Dict]:
    """Translate raw HubSpot contact dicts into hubspot_contacts table rows."""
    rows: List[Dict] = []
    for contact in contacts:
        # Prefer portal-local lastmodifieddate, falling back to the hs_ variant.
        modified_raw = contact.get("lastmodifieddate") or contact.get("hs_lastmodifieddate")
        row = {
            "contact_id": try_parse_int(contact["id"]),
            "full_name": contact.get("full_name"),
            "first_name": contact.get("firstname"),
            "last_name": contact.get("lastname"),
            "email": contact.get("email"),
            "phone_number": contact.get("phone"),
            "job_title_level": contact.get("job_title_level"),
            "hubspot_create_date": parse_ts(contact.get("createdate")) or None,
            "hubspot_modified_date": parse_ts(modified_raw) or None,
            "hubspot_last_activity_date": parse_ts(contact.get("notes_last_updated")) or None,
            "number_of_associated_deals": contact.get("number_of_associated_deals", 0),
            "associated_company_id": try_parse_int(contact.get("associatedcompanyid")),
        }
        rows.append(enrich_supabase_row(row))
    return rows
|
| 143 |
+
|
| 144 |
+
def contacts_are_different(new_row: Dict, old_row: Dict) -> bool:
    """Return True when any tracked field differs (compared as strings)."""
    tracked = (
        "full_name", "first_name", "last_name", "email", "phone_number",
        "hubspot_create_date", "hubspot_modified_date",
        "hubspot_last_activity_date", "number_of_associated_deals",
        "associated_company_id",
    )
    # str() normalises int-vs-string values from Supabase; missing keys
    # compare equal on both sides ("None" == "None").
    return any(str(new_row.get(k)) != str(old_row.get(k)) for k in tracked)
|
| 155 |
+
|
| 156 |
+
# -----------------------------------------------------------------------------
|
| 157 |
+
# Search IDs (ts > since_ms) with property fallback
|
| 158 |
+
# -----------------------------------------------------------------------------
|
| 159 |
+
def _search_contact_ids_from(since_ms: int, prop: str) -> List[str]:
    """
    Return IDs of all contacts whose {prop} is strictly greater than
    *since_ms* (epoch-ms), paging through the CRM v3 search endpoint.
    Results are requested in ascending {prop} order so the caller can
    advance its cursor monotonically.
    """
    url = "https://api.hubapi.com/crm/v3/objects/contacts/search"
    headers = {
        "Authorization": f"Bearer {HUBSPOT_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    base_payload = {
        "filterGroups": [{
            "filters": [
                {"propertyName": prop, "operator": "GT", "value": str(since_ms)}
            ]
        }],
        "limit": 100,
        "sorts": [{"propertyName": prop, "direction": "ASCENDING"}],
    }

    found: List[str] = []
    cursor: Optional[str] = None
    with httpx.Client(timeout=30.0) as client:
        while True:
            request_body = {**base_payload}
            if cursor:
                request_body["after"] = cursor

            resp = client.post(url, headers=headers, json=request_body)
            if resp.status_code >= 400:
                # Log the response body (JSON if possible) before raising.
                try:
                    logging.error("Contacts search error for prop '%s': %s", prop, resp.json())
                except Exception:
                    logging.error("Contacts search error for prop '%s': %s", prop, resp.text)
                resp.raise_for_status()

            page = resp.json()
            for obj in page.get("results", []) or []:
                found.append(obj["id"])

            cursor = (page.get("paging") or {}).get("next", {}).get("after")
            if not cursor:
                return found
            time.sleep(0.1)  # light throttle between pages
|
| 205 |
+
|
| 206 |
+
def search_contact_ids_after_ms(since_ms: int) -> Tuple[List[str], str]:
    """
    Find contact IDs with timestamp > since_ms.

    Try these properties in order; return (ids, prop_used) for the first
    successful search:
      1) hs_lastmodifieddate
      2) lastmodifieddate
      3) createdate

    Fix: the code previously searched only on "createdate", contradicting
    this documented chain and silently missing contacts that were *updated*
    (but not created) after the cursor. Portals where a property is not
    searchable return an HTTP error, which falls through to the next one.
    """
    props_to_try = ["hs_lastmodifieddate", "lastmodifieddate", "createdate"]
    last_err: Optional[httpx.HTTPStatusError] = None

    for prop in props_to_try:
        try:
            ids = _search_contact_ids_from(since_ms, prop)
            logging.info("Contacts search with '%s' returned %d IDs.", prop, len(ids))
            return ids, prop
        except httpx.HTTPStatusError as e:
            # Property rejected by this portal — try the next candidate.
            last_err = e
            continue

    if last_err:
        raise last_err
    # Unreachable with a non-empty props list; kept as a safe default.
    return [], "hs_lastmodifieddate"
|
| 228 |
+
|
| 229 |
+
# -----------------------------------------------------------------------------
|
| 230 |
+
# Read-by-ID (with associations) → enrich & track max cursor ts
|
| 231 |
+
# -----------------------------------------------------------------------------
|
| 232 |
+
def _enrich_contact_data_from_record(record, job_level_map: Optional[Dict[str, str]]) -> Dict:
    """Flatten a HubSpot SDK contact record into a plain dict of properties."""
    props = record.properties or {}

    contact_data: Dict[str, Optional[str]] = {"id": record.id}
    contact_data.update({name: props.get(name) for name in CONTACT_PROPERTIES})

    # Derive full_name from first/last when it is missing or blank.
    full_name = contact_data.get("full_name")
    if not full_name or not str(full_name).strip():
        first = (contact_data.get("firstname") or "").strip()
        last = (contact_data.get("lastname") or "").strip()
        contact_data["full_name"] = f"{first} {last}".strip()

    # Count distinct associated deal IDs when an association bucket exists.
    deal_count = 0
    associations = getattr(record, "associations", None)
    if associations and associations.get("deals"):
        bucket = associations["deals"]
        results = getattr(bucket, "results", None)
        if results:
            deal_count = len({assoc.id for assoc in results if getattr(assoc, "id", None)})
    contact_data["number_of_associated_deals"] = deal_count

    # Replace the raw job_title_level code with its human label (None if
    # the code is unknown to the mapping).
    if job_level_map:
        contact_data["job_title_level"] = job_level_map.get(contact_data.get("job_title_level"))

    return contact_data
|
| 260 |
+
|
| 261 |
+
def read_contacts_by_ids(contact_ids: List[str], cursor_prop: str) -> Tuple[List[Dict], Optional[int]]:
    """
    Fetch full contact records (with deal associations) one at a time.

    Returns (contacts, max_ts_ms) where max_ts_ms is the largest value of
    *cursor_prop* observed across the fetched records (epoch-ms), or None if
    none parsed. Individual read failures are logged and skipped, so the
    result may contain fewer rows than `contact_ids`.
    """
    if not contact_ids:
        return [], None

    contacts: List[Dict] = []
    assoc_types = ["deals"]

    # Fetch property label map once (best effort — None disables mapping)
    try:
        job_level_map = get_property_label_mapping(hubspot_client, "contacts", "job_title_level")
    except Exception as e:
        logging.warning("Failed to fetch job_title_level map: %s", e)
        job_level_map = None

    max_ts_ms: Optional[int] = None

    for i, cid in enumerate(contact_ids, start=1):
        try:
            record = hubspot_client.crm.contacts.basic_api.get_by_id(
                contact_id=cid,
                properties=CONTACT_PROPERTIES,
                associations=assoc_types,
                archived=False,
            )

            # Track max timestamp for the chosen cursor property
            cursor_val = (record.properties or {}).get(cursor_prop)
            ts_ms = parse_any_ts_ms(cursor_val)
            if ts_ms is not None and (max_ts_ms is None or ts_ms > max_ts_ms):
                max_ts_ms = ts_ms

            contacts.append(_enrich_contact_data_from_record(record, job_level_map))

            if i % 200 == 0:
                logging.info("Read %d contacts...", i)

            time.sleep(0.05)  # gentle pacing

        except httpx.HTTPStatusError as e:
            logging.error("HTTP error reading contact %s: %s", cid, e)
        except (ContactsApiException, httpx.HTTPError) as e:
            # NOTE(review): httpx.HTTPStatusError is a subclass of
            # httpx.HTTPError, so the branch above always wins for it; this
            # branch handles SDK errors and other transport failures.
            logging.error("Error reading contact %s: %s", cid, e)

    return contacts, max_ts_ms
|
| 305 |
+
|
| 306 |
+
# -----------------------------------------------------------------------------
|
| 307 |
+
# Upsert flow
|
| 308 |
+
# -----------------------------------------------------------------------------
|
| 309 |
+
def upsert_contacts(contacts: List[Dict]) -> None:
    """Insert new or changed contact rows into the hubspot_contacts table."""
    if not contacts:
        print("No contacts to upsert.")
        return

    # Snapshot of what Supabase already holds, keyed by contact_id (string).
    existing = fetch_supabase_table(supabase_client, "hubspot_contacts", "contact_id")

    rows_to_upsert: List[Dict] = []
    for raw in contacts:
        cid = try_parse_int(raw.get("id"))
        if not cid:
            continue

        candidate = map_contact_data_for_db([raw])[0]
        current = existing.get(str(cid))

        # New contact, or any tracked field changed → queue for upsert.
        if not current or contacts_are_different(candidate, current):
            rows_to_upsert.append(candidate)

    print(f"{len(rows_to_upsert)} contacts to insert/update (out of {len(contacts)} read).")

    if rows_to_upsert:
        # upload_raw_json_to_supabase(supabase_client, rows_to_upsert, object_type="contacts")
        batched_insert(supabase_client, "hubspot_contacts", rows_to_upsert, batch_size=1000)
|
| 333 |
+
|
| 334 |
+
# -----------------------------------------------------------------------------
|
| 335 |
+
# Main (timestamp cursor)
|
| 336 |
+
# -----------------------------------------------------------------------------
|
| 337 |
+
def main(since_ms: Optional[int] = None):
    """
    Orchestrates:
      1) Search contact IDs with <cursor_prop> > since_ms (property fallback)
      2) Read full contacts (track max timestamp for <cursor_prop>)
      3) Upsert into Supabase
      4) Update sync metadata with { last_sync_metadata, last_sync_time, cursor_prop }

    since_ms: epoch milliseconds (UTC). Resolution order: explicit argument,
    HUBSPOT_CONTACTS_SINCE_MS env var, else today at 00:00Z.
    """
    # Resolve since_ms
    if since_ms is None and BOOTSTRAP_SINCE_MS_ENV:
        try:
            since_ms = int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            raise RuntimeError("HUBSPOT_CONTACTS_SINCE_MS must be an integer (ms) if set.")

    if since_ms is None:
        # Default: today@00:00:00Z for first run
        today0 = floor_to_utc_midnight(datetime.datetime.now(datetime.timezone.utc))
        since_ms = to_epoch_ms(today0)

    print(f"Searching contacts with timestamp > {since_ms} ...")
    ids, cursor_prop = search_contact_ids_after_ms(since_ms)
    print(f"Search property: {cursor_prop}. Found {len(ids)} contact IDs.")

    # Wall-clock sync time recorded in metadata (not the data cursor).
    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()

    if not ids:
        print("No contacts beyond the cursor. Updating sync metadata and exiting.")
        update_sync_metadata(supabase_client, "contacts", now_iso)
        return

    print("Reading contacts (with associations)...")
    contacts, max_ts_ms = read_contacts_by_ids(ids, cursor_prop)

    print("Upserting into Supabase...")
    upsert_contacts(contacts)

    # Advance cursor to max timestamp we actually ingested for the chosen property
    new_cursor_ms = max_ts_ms if max_ts_ms is not None else since_ms

    # NOTE(review): new_cursor_ms is only printed below — update_sync_metadata
    # receives now_iso, so the advanced cursor value itself is not persisted
    # here. Confirm the orchestrator derives the next since_ms elsewhere.
    update_sync_metadata(supabase_client, "contacts", now_iso)

    print(f"Contacts sync complete. Advanced cursor to {new_cursor_ms} using prop '{cursor_prop}'.")
|
| 380 |
+
|
| 381 |
+
# -----------------------------------------------------------------------------
|
| 382 |
+
# CLI
|
| 383 |
+
# -----------------------------------------------------------------------------
|
| 384 |
+
def _parse_cli_arg_to_ms(arg: str) -> int:
|
| 385 |
+
"""
|
| 386 |
+
Accept:
|
| 387 |
+
- integer epoch ms
|
| 388 |
+
- ISO-8601 (Z or offset)
|
| 389 |
+
- YYYY-MM-DD (floors to 00:00Z)
|
| 390 |
+
"""
|
| 391 |
+
# epoch ms or seconds
|
| 392 |
+
if re.fullmatch(r"\d{10,13}", arg):
|
| 393 |
+
v = int(arg)
|
| 394 |
+
if v < 10_000_000_000_000: # seconds -> ms
|
| 395 |
+
v *= 1000
|
| 396 |
+
return v
|
| 397 |
+
|
| 398 |
+
# YYYY-MM-DD
|
| 399 |
+
if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
|
| 400 |
+
d = datetime.datetime.strptime(arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
|
| 401 |
+
return to_epoch_ms(floor_to_utc_midnight(d))
|
| 402 |
+
|
| 403 |
+
# ISO-8601
|
| 404 |
+
return to_epoch_ms(arg)
|
| 405 |
+
|
| 406 |
+
if __name__ == "__main__":
    import sys
    # Optional single CLI argument: the starting cursor, accepted as epoch
    # ms/seconds, ISO-8601, or YYYY-MM-DD. With no argument, main() resolves
    # the cursor itself (env bootstrap or default).
    if len(sys.argv) > 1:
        try:
            main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
        except Exception as e:
            # Exit non-zero so a calling scheduler sees the failure.
            print(f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
            sys.exit(1)
    else:
        main()
|
python/hubspot_deals.py
ADDED
|
@@ -0,0 +1,445 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HubSpot Deals → Supabase (incremental since a millisecond cursor)
|
| 3 |
+
|
| 4 |
+
Usage from orchestrator:
|
| 5 |
+
import load_hubspot_deals
|
| 6 |
+
load_hubspot_deals.main(since_ms=<int milliseconds since epoch UTC>)
|
| 7 |
+
|
| 8 |
+
Direct CLI:
|
| 9 |
+
# epoch ms
|
| 10 |
+
python load_hubspot_deals.py 1754025600000
|
| 11 |
+
# ISO-8601
|
| 12 |
+
python load_hubspot_deals.py 2025-08-01T09:30:00Z
|
| 13 |
+
# Back-compat date (floors to 00:00Z)
|
| 14 |
+
python load_hubspot_deals.py 2025-08-01
|
| 15 |
+
"""
|
| 16 |
+
import os
|
| 17 |
+
import re
|
| 18 |
+
import time
|
| 19 |
+
import logging
|
| 20 |
+
import datetime
|
| 21 |
+
from typing import List, Dict, Optional, Tuple, Union
|
| 22 |
+
|
| 23 |
+
import httpx
|
| 24 |
+
import hubspot
|
| 25 |
+
from dotenv import load_dotenv
|
| 26 |
+
from supabase import create_client
|
| 27 |
+
from hubspot.crm.deals import ApiException as DealsApiException
|
| 28 |
+
|
| 29 |
+
from hubspot_utils import (
|
| 30 |
+
parse_ts, try_parse_int, try_parse_float, deduplicate_by_key
|
| 31 |
+
)
|
| 32 |
+
from supabase_utils import (
|
| 33 |
+
insert_into_supabase_table, update_sync_metadata
|
| 34 |
+
)
|
| 35 |
+
|
| 36 |
+
# -----------------------------------------------------------------------------
|
| 37 |
+
# Constants
|
| 38 |
+
# -----------------------------------------------------------------------------
|
| 39 |
+
# Association type consulted when hs_primary_associated_company is absent
# (see _extract_primary_company_id).
DEAL_TO_COMPANY_ASSOC_TYPE = "deal_to_company"

# HubSpot team ID → display name used downstream when mapping deals.
HUBSPOT_TEAM_MAP = {
    "1322863": "Fulfilment Team",
    "1322864": "Customer Services Team",
    "1322865": "Sales Team",
    "1448134": "Accounts & Billing Team",
    "3557793": "Marketing Team",
    "57348939": "MDR Team",
}

# Deal properties requested on every read-by-ID call.
DEAL_PROPERTIES = [
    "dealname",
    "hubspot_owner_id",
    "pipeline",
    "dealstage",
    "createdate",  # use createdate; hs_createdate is also seen in some portals
    "hs_createdate",
    "closedate",
    "contract_signed_date",
    "contract_end_date",
    "notes_last_updated",
    "num_contacted_notes",
    "hubspot_team_id",
    "hs_analytics_source",
    "number_of_cli",
    "amount",
    "hs_acv",
    "hs_tcv",
    "margin",
    "source_of_deal_2___migration",
    "hs_primary_associated_company",
    "hs_lastmodifieddate",  # for mapping/debugging
    "lastmodifieddate",
]
|
| 74 |
+
|
| 75 |
+
# -----------------------------------------------------------------------------
|
| 76 |
+
# Logging
|
| 77 |
+
# -----------------------------------------------------------------------------
|
| 78 |
+
# Append-mode log file, one file per calendar day (date baked into the
# filename at import time). The repo ships logs/.gitkeep so the logs/
# directory exists when this runs.
logging.basicConfig(
    filename=f"logs/hubspot_deals_pipeline_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
    filemode="a",
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
|
| 84 |
+
|
| 85 |
+
# -----------------------------------------------------------------------------
|
| 86 |
+
# Environment
|
| 87 |
+
# -----------------------------------------------------------------------------
|
| 88 |
+
# Load .env, read credentials, and fail fast at import time when anything
# mandatory is missing.
load_dotenv()
HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
# Optional bootstrap cursor (epoch ms) if orchestrator doesn't provide one
BOOTSTRAP_SINCE_MS_ENV = os.getenv("HUBSPOT_DEALS_SINCE_MS")

if not HUBSPOT_TOKEN:
    raise RuntimeError("HUBSPOT_TOKEN is not set")
if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
    raise RuntimeError("Supabase env vars are not set")

# Module-level API clients shared by every function below.
hubspot_client = hubspot.Client.create(access_token=HUBSPOT_TOKEN)
supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
|
| 102 |
+
|
| 103 |
+
# -----------------------------------------------------------------------------
|
| 104 |
+
# Time helpers
|
| 105 |
+
# -----------------------------------------------------------------------------
|
| 106 |
+
def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
|
| 107 |
+
if dt.tzinfo is None:
|
| 108 |
+
dt = dt.replace(tzinfo=datetime.timezone.utc)
|
| 109 |
+
return dt.astimezone(datetime.timezone.utc)
|
| 110 |
+
|
| 111 |
+
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Zero out the time-of-day of *dt* after normalising it to UTC."""
    as_utc = _ensure_utc(dt)
    return as_utc.replace(hour=0, minute=0, second=0, microsecond=0)
|
| 114 |
+
|
| 115 |
+
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """Parse ISO-8601 text (a trailing 'Z' is accepted) to an aware UTC datetime."""
    normalised = value
    if isinstance(normalised, str) and normalised.endswith("Z"):
        normalised = normalised[:-1] + "+00:00"
    return _ensure_utc(datetime.datetime.fromisoformat(normalised))
|
| 120 |
+
|
| 121 |
+
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
    """Milliseconds since the Unix epoch for an ISO-8601 string or datetime."""
    if isinstance(dt_or_str, datetime.datetime):
        as_dt = _ensure_utc(dt_or_str)
    elif isinstance(dt_or_str, str):
        as_dt = _parse_iso_like_to_dt(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    return int(as_dt.timestamp() * 1000)
|
| 129 |
+
|
| 130 |
+
def parse_any_ts_ms(value: Optional[Union[str, int, float]]) -> Optional[int]:
    """
    Normalise a timestamp to milliseconds since the Unix epoch.

    Accepts epoch milliseconds, epoch seconds, or an ISO-8601 string.
    Returns None when *value* is None or unparseable (logged as a warning).
    """
    if value is None:
        return None
    try:
        v = int(str(value))
        # Heuristic: epoch-seconds stay below 1e12 until year ~33658, and
        # every epoch-ms value from Sep 2001 onward exceeds 1e12. The old
        # bound of 1e13 wrongly classified all modern millisecond
        # timestamps (~1.7e12) as seconds and multiplied them by 1000.
        if v < 1_000_000_000_000:  # seconds → ms
            v *= 1000
        return v
    except ValueError:
        pass
    try:
        return to_epoch_ms(str(value))
    except Exception:
        logging.warning("Could not parse timestamp value=%r", value)
        return None
|
| 148 |
+
|
| 149 |
+
# -----------------------------------------------------------------------------
|
| 150 |
+
# Pipeline / Stage labels
|
| 151 |
+
# -----------------------------------------------------------------------------
|
| 152 |
+
def get_pipeline_and_stage_mappings() -> Tuple[Dict[str, str], Dict[str, str]]:
    """Fetch {pipeline_id: label} and {stage_id: label} maps for deal pipelines."""
    pipelines: Dict[str, str] = {}
    stages: Dict[str, str] = {}
    try:
        resp = hubspot_client.crm.pipelines.pipelines_api.get_all(object_type="deals")
        for pipeline in resp.results:
            pipelines[pipeline.id] = pipeline.label
            for stage in pipeline.stages:
                stages[stage.id] = stage.label
    except Exception as e:
        # Best effort: callers get empty maps and fall back to raw IDs.
        logging.error("Failed to fetch pipeline/stage mappings: %s", e)
        return {}, {}
    return pipelines, stages
|
| 166 |
+
|
| 167 |
+
# -----------------------------------------------------------------------------
|
| 168 |
+
# Search IDs (ts > since_ms) with property fallback
|
| 169 |
+
# -----------------------------------------------------------------------------
|
| 170 |
+
def _search_deal_ids_from(since_ms: int, prop: str) -> List[str]:
    """
    Return IDs of all deals whose {prop} is strictly greater than *since_ms*
    (epoch-ms), paging through the CRM v3 search endpoint. Results are
    requested in ascending {prop} order so the caller can advance its cursor
    monotonically.
    """
    url = "https://api.hubapi.com/crm/v3/objects/deals/search"
    headers = {
        "Authorization": f"Bearer {HUBSPOT_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    base_payload = {
        "filterGroups": [{
            "filters": [
                {"propertyName": prop, "operator": "GT", "value": str(since_ms)},
            ]
        }],
        "limit": 100,
        "sorts": [{"propertyName": prop, "direction": "ASCENDING"}],
    }

    found: List[str] = []
    cursor: Optional[str] = None
    with httpx.Client(timeout=30.0) as client:
        while True:
            request_body = {**base_payload}
            if cursor:
                request_body["after"] = cursor

            resp = client.post(url, headers=headers, json=request_body)
            if resp.status_code >= 400:
                # Log the response body (JSON if possible) before raising.
                try:
                    logging.error("Deal search error for prop '%s': %s", prop, resp.json())
                except Exception:
                    logging.error("Deal search error for prop '%s': %s", prop, resp.text)
                resp.raise_for_status()

            page = resp.json()
            for obj in page.get("results", []) or []:
                found.append(obj["id"])

            cursor = (page.get("paging") or {}).get("next", {}).get("after")
            if not cursor:
                return found
            time.sleep(0.1)  # light throttle between pages
|
| 216 |
+
|
| 217 |
+
def search_deal_ids_after_ms(since_ms: int) -> Tuple[List[str], str]:
    """
    Search for deal IDs newer than ``since_ms``, falling back across
    candidate timestamp properties.

    Properties are tried in order; the first search that succeeds wins:
      1) createdate
      2) hs_createdate

    Returns:
        (ids, prop_used): the matching deal IDs and the property the
        successful search filtered/sorted on. The caller reuses
        ``prop_used`` as the cursor property when reading deals back.

    Raises:
        httpx.HTTPStatusError: if every candidate property fails.
    """
    # NOTE: the previous docstring also listed hs_lastmodifieddate /
    # lastmodifieddate, but only the properties below are actually tried.
    props_to_try = ["createdate", "hs_createdate"]
    last_err: Optional[httpx.HTTPStatusError] = None

    for prop in props_to_try:
        try:
            ids = _search_deal_ids_from(since_ms, prop)
            logging.info("Deal search with '%s' returned %d IDs.", prop, len(ids))
            return ids, prop
        except httpx.HTTPStatusError as e:
            # Remember the failure and try the next candidate property.
            last_err = e
            continue

    if last_err:
        raise last_err
    # Unreachable while props_to_try is non-empty; kept as a safe default
    # that at least names a property we actually search on.
    return [], props_to_try[0]
|
| 240 |
+
|
| 241 |
+
# -----------------------------------------------------------------------------
|
| 242 |
+
# Read-by-ID (with associations) → map rows and track max cursor ts
|
| 243 |
+
# -----------------------------------------------------------------------------
|
| 244 |
+
def _extract_primary_company_id(record, props: Dict) -> Optional[int]:
    """
    Resolve the deal's primary company ID.

    The ``hs_primary_associated_company`` property wins when present;
    otherwise the record's company associations are scanned and the first
    suitable entry is used.
    """
    primary_prop = props.get("hs_primary_associated_company")
    if primary_prop:
        return try_parse_int(primary_prop)

    associations = getattr(record, "associations", None)
    if not associations:
        return None
    company_bucket = associations.get("companies")
    if not company_bucket:
        return None
    results = getattr(company_bucket, "results", None)
    if not results:
        return None

    for assoc in results:
        # Accept an explicit deal→company association type, or any entry
        # that carries no type attribute at all.
        if getattr(assoc, "type", None) == DEAL_TO_COMPANY_ASSOC_TYPE or not hasattr(assoc, "type"):
            return try_parse_int(assoc.id)
    return None
|
| 259 |
+
|
| 260 |
+
def read_deals_by_ids(
    deal_ids: List[str],
    cursor_prop: str,
) -> Tuple[List[Dict], List[Dict], Optional[int]]:
    """
    Read deals by ID including contacts/companies associations.

    Args:
        deal_ids: HubSpot deal object IDs to fetch one by one.
        cursor_prop: the timestamp property the search used; its maximum
            value across the fetched deals is tracked for the caller's cursor.

    Returns: (all_deals, deal_contact_links, max_ts_ms_for_cursor_prop)
        max_ts_ms is None when no timestamp could be parsed or no IDs given.

    Individual read failures are logged and skipped; they do not abort the batch.
    """
    if not deal_ids:
        return [], [], None

    all_deals: List[Dict] = []
    deal_contact_links: List[Dict] = []

    # Associations requested alongside each deal read.
    assoc_types = ["contacts", "companies"]
    # Label lookups for pipeline / stage internal IDs (fetched once, reused per deal).
    pipeline_map, stage_map = get_pipeline_and_stage_mappings()

    max_ts_ms: Optional[int] = None

    for i, did in enumerate(deal_ids, start=1):
        try:
            record = hubspot_client.crm.deals.basic_api.get_by_id(
                deal_id=did, properties=DEAL_PROPERTIES, associations=assoc_types, archived=False
            )
            p = record.properties or {}

            # Track max timestamp for the cursor property we used in the search
            cursor_val = p.get(cursor_prop)
            ts_ms = parse_any_ts_ms(cursor_val)
            if ts_ms is not None and (max_ts_ms is None or ts_ms > max_ts_ms):
                max_ts_ms = ts_ms

            # Contacts: collect one link row per numeric associated contact ID.
            if getattr(record, "associations", None) and record.associations.get("contacts"):
                bucket = record.associations["contacts"]
                if getattr(bucket, "results", None):
                    for a in bucket.results:
                        if a.id and a.id.isdigit():
                            deal_contact_links.append({
                                "deal_id": try_parse_int(record.id),
                                "contact_id": try_parse_int(a.id),
                            })

            # Company: property first, association fallback (see helper).
            company_id = _extract_primary_company_id(record, p)

            # Created date: accept either createdate or hs_createdate
            created_iso = p.get("createdate") or p.get("hs_createdate")

            # Map CRM properties onto the Supabase row shape.
            all_deals.append({
                "deal_id": try_parse_int(record.id),
                "dealname": p.get("dealname"),
                "hubspot_owner_id": try_parse_int(p.get("hubspot_owner_id")),
                "pipeline_id": try_parse_int(p.get("pipeline")),
                "pipeline_label": pipeline_map.get(p.get("pipeline"), ""),
                "dealstage": p.get("dealstage"),
                "dealstage_label": stage_map.get(p.get("dealstage"), ""),
                "hubspot_createdate": parse_ts(created_iso),
                "closedate": parse_ts(p.get("closedate")) or None,
                "contract_signed_date": p.get("contract_signed_date") or None,
                "contract_end_date": p.get("contract_end_date") or None,
                "hubspot_last_activity_date": parse_ts(p.get("notes_last_updated")),
                "num_contacted_notes": p.get("num_contacted_notes"),
                "hubspot_team_id": try_parse_int(p.get("hubspot_team_id")),
                "hubspot_team_label": HUBSPOT_TEAM_MAP.get(p.get("hubspot_team_id"), ""),
                "hs_analytics_source": p.get("hs_analytics_source"),
                "number_of_cli": try_parse_int(p.get("number_of_cli")),
                "amount": try_parse_int(p.get("amount")),
                "annual_contract_value": try_parse_float(p.get("hs_acv")),
                "total_contract_value": try_parse_float(p.get("hs_tcv")),
                "margin": try_parse_float(p.get("margin")),
                "source_of_deal": p.get("source_of_deal_2___migration"),
                "company_id": company_id,
            })

            if i % 200 == 0:
                logging.info("Read %d deals...", i)
            # Small per-deal pause; presumably to stay under HubSpot rate limits.
            time.sleep(0.05)

        except httpx.HTTPStatusError as e:
            logging.error("HTTP error reading deal %s: %s", did, e)
        except (DealsApiException, httpx.HTTPError) as e:
            logging.error("Error reading deal %s: %s", did, e)

    return all_deals, deal_contact_links, max_ts_ms
|
| 345 |
+
|
| 346 |
+
# -----------------------------------------------------------------------------
|
| 347 |
+
# Upsert
|
| 348 |
+
# -----------------------------------------------------------------------------
|
| 349 |
+
def upsert_deals(deals: List[Dict], deal_contact_links: List[Dict]) -> None:
    """Persist deal rows, then de-duplicated deal↔contact link rows."""
    if deals:
        insert_into_supabase_table(supabase_client, "hubspot_deals", deals)
        print(f"Upserted {len(deals)} deals.")

    if not deal_contact_links:
        return
    # Collapse duplicate (deal_id, contact_id) pairs before upserting.
    unique_links = deduplicate_by_key(deal_contact_links, key=("deal_id", "contact_id"))
    insert_into_supabase_table(
        supabase_client,
        "hubspot_deal_contacts",
        unique_links,
        on_conflict=["deal_id", "contact_id"],
    )
    print(f"Upserted {len(unique_links)} deal-contact associations.")
|
| 363 |
+
|
| 364 |
+
# -----------------------------------------------------------------------------
|
| 365 |
+
# Main (timestamp cursor)
|
| 366 |
+
# -----------------------------------------------------------------------------
|
| 367 |
+
def main(since_ms: Optional[int] = None):
    """
    Orchestrates:
      1) Search deal IDs with <cursor_prop> > since_ms (property fallback)
      2) Read full deals with associations (track max timestamp for <cursor_prop>)
      3) Upsert into Supabase
      4) Record last_sync_time in the sync metadata table

    NOTE(review): new_cursor_ms is computed below but never persisted —
    update_sync_metadata is only given now_iso, so the next run cannot
    resume from the ingested max timestamp. Confirm whether the cursor
    should be stored as well.
    """
    # Resolve since_ms: explicit arg > env bootstrap > today's UTC midnight.
    if since_ms is None and BOOTSTRAP_SINCE_MS_ENV:
        try:
            since_ms = int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            raise RuntimeError("HUBSPOT_DEALS_SINCE_MS must be an integer (ms) if set.")

    if since_ms is None:
        # Default: today@00:00:00Z for first run
        today0 = floor_to_utc_midnight(datetime.datetime.now(datetime.timezone.utc))
        since_ms = to_epoch_ms(today0)

    print(f"Searching deals with timestamp > {since_ms} ...")
    ids, cursor_prop = search_deal_ids_after_ms(since_ms)
    print(f"Search property: {cursor_prop}. Found {len(ids)} deal IDs.")

    # Timestamp recorded as last_sync_time (taken before the read phase).
    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()

    if not ids:
        print("No deals beyond the cursor. Updating sync metadata and exiting.")
        update_sync_metadata(supabase_client, "hubspot_deals", now_iso)
        return

    print("Reading deals (with associations)...")
    deals, deal_contact_links, max_ts_ms = read_deals_by_ids(ids, cursor_prop)

    print("Upserting into Supabase...")
    upsert_deals(deals, deal_contact_links)

    # Advance cursor to max timestamp we actually ingested for the chosen property
    new_cursor_ms = max_ts_ms if max_ts_ms is not None else since_ms

    update_sync_metadata(supabase_client, "hubspot_deals", now_iso)

    print(f"Deals sync complete. Advanced cursor to {new_cursor_ms} using prop '{cursor_prop}'.")
|
| 410 |
+
|
| 411 |
+
# -----------------------------------------------------------------------------
|
| 412 |
+
# CLI
|
| 413 |
+
# -----------------------------------------------------------------------------
|
| 414 |
+
def _parse_cli_arg_to_ms(arg: str) -> int:
|
| 415 |
+
"""
|
| 416 |
+
Accept:
|
| 417 |
+
- integer epoch ms
|
| 418 |
+
- ISO-8601 (Z or offset)
|
| 419 |
+
- YYYY-MM-DD (floors to 00:00Z)
|
| 420 |
+
"""
|
| 421 |
+
# epoch ms or seconds
|
| 422 |
+
if re.fullmatch(r"\d{10,13}", arg):
|
| 423 |
+
v = int(arg)
|
| 424 |
+
if v < 10_000_000_000_000: # seconds -> ms
|
| 425 |
+
v *= 1000
|
| 426 |
+
return v
|
| 427 |
+
|
| 428 |
+
# YYYY-MM-DD
|
| 429 |
+
if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
|
| 430 |
+
d = datetime.datetime.strptime(arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
|
| 431 |
+
return to_epoch_ms(floor_to_utc_midnight(d))
|
| 432 |
+
|
| 433 |
+
# ISO-8601
|
| 434 |
+
return to_epoch_ms(arg)
|
| 435 |
+
|
| 436 |
+
if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1:
        try:
            # Optional first argument: the "since" cursor (epoch ms, ISO-8601,
            # or YYYY-MM-DD); parsed to epoch milliseconds before running.
            main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
        except Exception as e:
            # NOTE(review): this except also wraps main(), so runtime sync
            # failures are reported as "Invalid timestamp" — confirm intent.
            print(f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
            sys.exit(1)
    else:
        # No argument: main() resolves the cursor from env or its default.
        main()
|
python/hubspot_emails.py
ADDED
|
@@ -0,0 +1,472 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HubSpot Emails → Supabase (incremental since a millisecond cursor)
|
| 3 |
+
|
| 4 |
+
Usage from orchestrator:
|
| 5 |
+
import load_hubspot_emails
|
| 6 |
+
load_hubspot_emails.main(since_ms=<int milliseconds since epoch UTC>)
|
| 7 |
+
|
| 8 |
+
Direct CLI:
|
| 9 |
+
# epoch ms
|
| 10 |
+
python load_hubspot_emails.py 1754025600000
|
| 11 |
+
|
| 12 |
+
# ISO-8601 (Z or offset)
|
| 13 |
+
python load_hubspot_emails.py 2025-08-01T09:30:00Z
|
| 14 |
+
|
| 15 |
+
# Back-compat date (floors to 00:00:00Z)
|
| 16 |
+
python load_hubspot_emails.py 2025-08-01
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
import os
|
| 20 |
+
import time
|
| 21 |
+
import logging
|
| 22 |
+
import datetime
|
| 23 |
+
import re
|
| 24 |
+
from typing import List, Dict, Optional, Tuple, Union
|
| 25 |
+
|
| 26 |
+
import httpx
|
| 27 |
+
import hubspot
|
| 28 |
+
from dotenv import load_dotenv
|
| 29 |
+
from supabase import create_client
|
| 30 |
+
from hubspot.crm.objects.emails import ApiException as EmailApiException
|
| 31 |
+
|
| 32 |
+
from hubspot_utils import (
|
| 33 |
+
clean_text,
|
| 34 |
+
)
|
| 35 |
+
from supabase_utils import (
|
| 36 |
+
batched_insert, update_sync_metadata,
|
| 37 |
+
)
|
| 38 |
+
|
| 39 |
+
# -----------------------------------------------------------------------------
|
| 40 |
+
# Logging
|
| 41 |
+
# -----------------------------------------------------------------------------
|
| 42 |
+
# One log file per day (date stamped with local time), appended across runs.
# NOTE(review): assumes a "logs/" directory exists relative to the CWD —
# the commit ships logs/.gitkeep, so this holds when run from the repo root.
logging.basicConfig(
    filename=f"logs/hubspot_email_pipeline_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
    filemode="a",
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
|
| 48 |
+
|
| 49 |
+
# -----------------------------------------------------------------------------
|
| 50 |
+
# Environment
|
| 51 |
+
# -----------------------------------------------------------------------------
|
| 52 |
+
# Load .env so local runs pick up credentials without exporting them.
load_dotenv()
HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")  # HubSpot API access token (Bearer)
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
# Optional bootstrap cursor if orchestrator doesn't provide one
BOOTSTRAP_SINCE_MS_ENV = os.getenv("HUBSPOT_EMAILS_SINCE_MS")

# Fail fast at import time when required credentials are missing.
if not HUBSPOT_TOKEN:
    raise RuntimeError("HUBSPOT_TOKEN is not set")
if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
    raise RuntimeError("Supabase env vars are not set")

# Module-level clients shared by all functions below.
hubspot_client = hubspot.Client.create(access_token=HUBSPOT_TOKEN)
supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
|
| 66 |
+
|
| 67 |
+
# -----------------------------------------------------------------------------
|
| 68 |
+
# Config
|
| 69 |
+
# -----------------------------------------------------------------------------
|
| 70 |
+
# CRM properties requested for each email record in read_emails_by_ids.
EMAIL_PROPERTIES = [
    "hs_email_from_email",
    "hs_email_to_email",
    "hs_email_subject",
    "hs_email_direction",
    "hs_email_text",
    "hs_timestamp",
]
|
| 78 |
+
|
| 79 |
+
# -----------------------------------------------------------------------------
|
| 80 |
+
# Email parsing
|
| 81 |
+
# -----------------------------------------------------------------------------
|
| 82 |
+
EMAIL_RE = re.compile(r'[\w\.\+\-]+@[\w\.\-]+\.\w+')


def parse_emails(raw: Optional[object]) -> List[str]:
    """
    Extract a sorted, de-duplicated, lower-cased list of e-mail addresses
    from a string, a list of strings, or any other value (stringified).

    Strings may contain several addresses separated by commas/semicolons.
    Non-string items inside a list input are silently skipped.
    """
    if raw is None:
        return []

    def _scan(text: str) -> List[str]:
        # Split on common separators first, then regex-match each chunk.
        found: List[str] = []
        for piece in re.split(r'[;,]', text):
            found.extend(EMAIL_RE.findall(piece))
        return found

    if isinstance(raw, list):
        hits = [m for item in raw or [] if item and isinstance(item, str)
                for m in _scan(item)]
    elif isinstance(raw, str):
        hits = _scan(raw)
    else:
        hits = EMAIL_RE.findall(str(raw))

    unique = {h.strip().lower() for h in hits if h and h.strip()}
    return sorted(unique)
|
| 102 |
+
|
| 103 |
+
# -----------------------------------------------------------------------------
|
| 104 |
+
# Time helpers
|
| 105 |
+
# -----------------------------------------------------------------------------
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
|
| 109 |
+
if dt.tzinfo is None:
|
| 110 |
+
dt = dt.replace(tzinfo=datetime.timezone.utc)
|
| 111 |
+
return dt.astimezone(datetime.timezone.utc)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
|
| 115 |
+
dt = _ensure_utc(dt)
|
| 116 |
+
return dt.replace(hour=0, minute=0, second=0, microsecond=0)
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
|
| 120 |
+
if value.endswith("Z"):
|
| 121 |
+
value = value[:-1] + "+00:00"
|
| 122 |
+
dt = datetime.datetime.fromisoformat(value)
|
| 123 |
+
return _ensure_utc(dt)
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
|
| 127 |
+
if isinstance(dt_or_str, str):
|
| 128 |
+
dt = _parse_iso_like_to_dt(dt_or_str)
|
| 129 |
+
elif isinstance(dt_or_str, datetime.datetime):
|
| 130 |
+
dt = _ensure_utc(dt_or_str)
|
| 131 |
+
else:
|
| 132 |
+
raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
|
| 133 |
+
return int(dt.timestamp() * 1000)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def parse_hs_timestamp_ms(value: Optional[Union[str, int, float]]) -> Optional[int]:
    """
    Normalize a HubSpot ``hs_timestamp`` property value to epoch milliseconds.

    HubSpot CRM datetime properties often come back as strings.
    Accepts:
      - ms epoch as str/int/float (floats are truncated to int)
      - ISO-8601 strings (rare for CRM objects, but robust to it)
    Returns ms since epoch or None when the value cannot be parsed.
    """
    if value is None:
        return None

    # Numeric fast path. int/float values are converted directly; note the
    # previous int(str(value)) failed on floats ("…​.0" is not int-parsable)
    # even though the annotation admits float.
    try:
        v = int(value) if isinstance(value, (int, float)) else int(str(value))
        # Values below 10^10 would be pre-2001 if read as ms, so treat
        # them as epoch *seconds* and up-convert.
        if v < 10_000_000_000:
            v = v * 1000
        return v
    except (ValueError, TypeError):
        pass

    # Fallback: ISO-8601 string.
    try:
        return to_epoch_ms(str(value))
    except Exception:
        logging.warning("Could not parse hs_timestamp=%r", value)
        return None
|
| 162 |
+
|
| 163 |
+
# -----------------------------------------------------------------------------
|
| 164 |
+
# Search IDs (ts > since_ms)
|
| 165 |
+
# -----------------------------------------------------------------------------
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def search_email_ids_after_ms(since_ms: int) -> List[str]:
    """
    Collect IDs of all emails whose ``hs_timestamp`` is strictly greater
    than ``since_ms``, paging through the CRM v3 Search API in ascending
    timestamp order (so the cursor advances monotonically).
    """
    endpoint = "https://api.hubapi.com/crm/v3/objects/emails/search"
    auth_headers = {
        "Authorization": f"Bearer {HUBSPOT_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    base_query = {
        "filterGroups": [{
            "filters": [
                {"propertyName": "hs_timestamp",
                 "operator": "GT", "value": str(since_ms)}
            ]
        }],
        "limit": 100,
        "sorts": [{"propertyName": "hs_timestamp", "direction": "ASCENDING"}],
        # We only need IDs here; we'll read full records later
    }

    found_ids: List[str] = []
    cursor: Optional[str] = None
    with httpx.Client(timeout=30.0) as session:
        while True:
            query = {**base_query}
            if cursor:
                query["after"] = cursor

            response = session.post(endpoint, headers=auth_headers, json=query)
            if response.status_code >= 400:
                # Log the server's error payload before raising.
                try:
                    logging.error("Email search error: %s", response.json())
                except Exception:
                    logging.error("Email search error: %s", response.text)
                response.raise_for_status()

            payload = response.json()
            found_ids.extend(obj["id"] for obj in payload.get("results", []) or [])

            cursor = (payload.get("paging") or {}).get("next", {}).get("after")
            if not cursor:
                break
            time.sleep(0.1)  # gentle pacing between pages

    logging.info("Found %d email IDs after %d.", len(found_ids), since_ms)
    return found_ids
|
| 218 |
+
|
| 219 |
+
# -----------------------------------------------------------------------------
|
| 220 |
+
# Read-by-ID (with associations)
|
| 221 |
+
# -----------------------------------------------------------------------------
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def read_emails_by_ids(
    email_ids: List[str]
) -> Tuple[List[Dict], List[Dict], List[Dict], List[Dict], List[Dict], Optional[int]]:
    """
    Read emails by ID with properties and associations
    (companies/contacts/deals/tickets).

    Args:
        email_ids: HubSpot email object IDs to fetch one by one.

    Returns:
        A 6-tuple:
          (email_metadata_data, email_company_links, email_contact_links,
           email_deal_links, email_ticket_links, max_ts_ms)
        where max_ts_ms is the largest hs_timestamp (epoch ms) seen, or
        None when no timestamp could be parsed / no IDs were given.

    Individual read failures are logged and skipped; they do not abort
    the batch.
    """
    if not email_ids:
        # Bug fix: this early return previously had only 5 elements while
        # the success path returns 6, which would break the caller's unpack.
        return [], [], [], [], [], None

    email_metadata_data: List[Dict] = []
    email_company_links: List[Dict] = []
    email_contact_links: List[Dict] = []
    email_deal_links: List[Dict] = []
    email_ticket_links: List[Dict] = []

    assoc_types = ["companies", "contacts", "deals", "tickets"]
    max_ts_ms: Optional[int] = None

    def _collect_links(assoc, bucket_name: str, out: List[Dict], key: str, eid) -> None:
        # Append {email_id, <key>} rows for every numeric associated ID.
        bucket = assoc.get(bucket_name)
        if bucket and getattr(bucket, "results", None):
            for a in bucket.results:
                if a.id and a.id.isdigit():
                    out.append({"email_id": eid, key: int(a.id)})

    for i, email_id in enumerate(email_ids, start=1):
        try:
            record = hubspot_client.crm.objects.emails.basic_api.get_by_id(
                email_id, properties=EMAIL_PROPERTIES, associations=assoc_types, archived=False
            )
            props = record.properties or {}

            cleaned = clean_text(props.get("hs_email_text") or "")

            # Robust timestamp handling: track the max for the caller's cursor
            # and derive the ISO sent_at when a timestamp is available.
            hs_ts_ms = parse_hs_timestamp_ms(props.get("hs_timestamp"))
            if hs_ts_ms is not None:
                if (max_ts_ms is None) or (hs_ts_ms > max_ts_ms):
                    max_ts_ms = hs_ts_ms
                sent_at_iso = datetime.datetime.fromtimestamp(
                    hs_ts_ms / 1000, tz=datetime.timezone.utc).isoformat()
            else:
                sent_at_iso = None

            email_metadata_data.append({
                "email_id": record.id,
                "subject": props.get("hs_email_subject"),
                "from_email": props.get("hs_email_from_email") or "",
                "to_emails": parse_emails(props.get("hs_email_to_email")),
                "sent_at": sent_at_iso,
                "direction": props.get("hs_email_direction"),
                "email_content": cleaned,
            })

            assoc = record.associations or {}
            _collect_links(assoc, "companies", email_company_links, "company_id", record.id)
            _collect_links(assoc, "contacts", email_contact_links, "contact_id", record.id)
            _collect_links(assoc, "deals", email_deal_links, "deal_id", record.id)
            _collect_links(assoc, "tickets", email_ticket_links, "ticket_id", record.id)

            if i % 200 == 0:
                logging.info("Read %d emails...", i)
            # Small per-email pause; presumably to stay under rate limits.
            time.sleep(0.05)

        except httpx.HTTPStatusError as e:
            logging.error("HTTP error reading email %s: %s", email_id, e)
        except (EmailApiException, httpx.HTTPError) as e:
            logging.error("Error reading email %s: %s", email_id, e)

    return (
        email_metadata_data, email_company_links, email_contact_links,
        email_deal_links, email_ticket_links, max_ts_ms
    )
|
| 308 |
+
|
| 309 |
+
# -----------------------------------------------------------------------------
|
| 310 |
+
# Upsert
|
| 311 |
+
# -----------------------------------------------------------------------------
|
| 312 |
+
|
| 313 |
+
|
| 314 |
+
def upsert_emails(
    email_metadata_data: List[Dict],
    email_company_links: List[Dict],
    email_contact_links: List[Dict],
    email_deal_links: List[Dict],
    email_ticket_links: List[Dict],
) -> None:
    """
    Write email metadata rows and their association link rows to Supabase,
    skipping any empty input list. Metadata is written first, then the
    link tables, preserving the original upsert order.
    """
    # (rows, target table, conflict columns or None, print label)
    jobs = [
        (email_metadata_data, "hubspot_emails", None, "email metadata rows"),
        # If your helper supports on_conflict for hubspot_emails, pass
        # ["email_id"] instead of None above.
        (email_company_links, "hubspot_email_companies",
         ["email_id", "company_id"], "email-company links"),
        (email_contact_links, "hubspot_email_contacts",
         ["email_id", "contact_id"], "email-contact links"),
        (email_deal_links, "hubspot_email_deals",
         ["email_id", "deal_id"], "email-deal links"),
        (email_ticket_links, "hubspot_email_tickets",
         ["email_id", "ticket_id"], "email-ticket links"),
    ]

    for rows, table, conflict_cols, label in jobs:
        if not rows:
            continue
        kwargs = {"batch_size": 1000}
        if conflict_cols is not None:
            kwargs["on_conflict"] = conflict_cols
        batched_insert(supabase_client, table, rows, **kwargs)
        print(f"Upserted {len(rows)} {label}.")
|
| 371 |
+
|
| 372 |
+
# -----------------------------------------------------------------------------
|
| 373 |
+
# Main (timestamp cursor)
|
| 374 |
+
# -----------------------------------------------------------------------------
|
| 375 |
+
|
| 376 |
+
|
| 377 |
+
def main(since_ms: Optional[int] = None):
    """
    Orchestrates:
      1) Search email IDs with hs_timestamp > since_ms
      2) Read full emails with associations (track max timestamp)
      3) Upsert into Supabase
      4) Record last_sync_time in the sync metadata table

    NOTE(review): new_cursor_ms is computed below but never persisted —
    update_sync_metadata is only given now_iso. Also the metadata key is
    "emails" here while the deals pipeline uses "hubspot_deals"; confirm
    the intended naming convention.
    """
    # Resolve since_ms: explicit arg > env bootstrap > today's UTC midnight.
    if since_ms is None and BOOTSTRAP_SINCE_MS_ENV:
        try:
            since_ms = int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            raise RuntimeError(
                "HUBSPOT_EMAILS_SINCE_MS must be an integer (ms) if set.")

    if since_ms is None:
        # Default: today@00:00:00Z for first run
        today0 = floor_to_utc_midnight(
            datetime.datetime.now(datetime.timezone.utc))
        since_ms = to_epoch_ms(today0)

    print(f"Searching emails with hs_timestamp > {since_ms} ...")
    ids = search_email_ids_after_ms(since_ms)
    print(f"Found {len(ids)} email IDs.")

    if not ids:
        print("No emails beyond the cursor. Updating sync metadata and exiting.")
        # Record only last_sync_time; helper sets updated_at
        now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()
        update_sync_metadata(supabase_client, "emails", now_iso)
        return

    print("Reading emails (with associations)...")
    (
        email_metadata_data, email_company_links, email_contact_links,
        email_deal_links, email_ticket_links, max_ts_ms
    ) = read_emails_by_ids(ids)

    print("Upserting into Supabase...")
    upsert_emails(
        email_metadata_data,
        email_company_links,
        email_contact_links,
        email_deal_links,
        email_ticket_links,
    )

    # Advance cursor to max timestamp we actually ingested
    new_cursor_ms = max_ts_ms if max_ts_ms is not None else since_ms
    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()

    update_sync_metadata(supabase_client, "emails", now_iso)

    print(f"Emails sync complete. Advanced cursor to {new_cursor_ms}.")
|
| 432 |
+
|
| 433 |
+
# -----------------------------------------------------------------------------
|
| 434 |
+
# CLI
|
| 435 |
+
# -----------------------------------------------------------------------------
|
| 436 |
+
|
| 437 |
+
|
| 438 |
+
def _parse_cli_arg_to_ms(arg: str) -> int:
|
| 439 |
+
"""
|
| 440 |
+
Accept:
|
| 441 |
+
- integer epoch ms
|
| 442 |
+
- ISO-8601 (Z or offset)
|
| 443 |
+
- YYYY-MM-DD (floors to 00:00Z for convenience/back-compat)
|
| 444 |
+
"""
|
| 445 |
+
# epoch ms
|
| 446 |
+
if re.fullmatch(r"\d{10,13}", arg):
|
| 447 |
+
v = int(arg)
|
| 448 |
+
if v < 10_000_000_000_000: # seconds -> ms
|
| 449 |
+
v *= 1000
|
| 450 |
+
return v
|
| 451 |
+
|
| 452 |
+
# YYYY-MM-DD
|
| 453 |
+
if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
|
| 454 |
+
d = datetime.datetime.strptime(
|
| 455 |
+
arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
|
| 456 |
+
return to_epoch_ms(floor_to_utc_midnight(d))
|
| 457 |
+
|
| 458 |
+
# ISO-8601
|
| 459 |
+
return to_epoch_ms(arg)
|
| 460 |
+
|
| 461 |
+
|
| 462 |
+
if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1:
        try:
            # Optional first argument: the "since" cursor (epoch ms, ISO-8601,
            # or YYYY-MM-DD); parsed to epoch milliseconds before running.
            main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
        except Exception as e:
            # NOTE(review): this except also wraps main(), so runtime sync
            # failures are reported as "Invalid timestamp" — confirm intent.
            print(
                f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
            sys.exit(1)
    else:
        # No argument: main() resolves the cursor from env or its default.
        main()
|
python/hubspot_tickets.py
ADDED
|
@@ -0,0 +1,491 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
HubSpot Tickets → Supabase (incremental since a millisecond cursor)
|
| 3 |
+
|
| 4 |
+
Usage from orchestrator:
|
| 5 |
+
import load_hubspot_tickets
|
| 6 |
+
load_hubspot_tickets.main(since_ms=<int milliseconds since epoch UTC>)
|
| 7 |
+
|
| 8 |
+
Direct CLI:
|
| 9 |
+
# epoch ms
|
| 10 |
+
python load_hubspot_tickets.py 1754025600000
|
| 11 |
+
# ISO-8601
|
| 12 |
+
python load_hubspot_tickets.py 2025-08-01T09:30:00Z
|
| 13 |
+
# Back-compat date (floors to 00:00Z)
|
| 14 |
+
python load_hubspot_tickets.py 2025-08-01
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import os
|
| 18 |
+
import re
|
| 19 |
+
import time
|
| 20 |
+
import logging
|
| 21 |
+
import datetime
|
| 22 |
+
from typing import List, Dict, Optional, Tuple, Union
|
| 23 |
+
|
| 24 |
+
import httpx
|
| 25 |
+
import hubspot
|
| 26 |
+
from dotenv import load_dotenv
|
| 27 |
+
from supabase import create_client
|
| 28 |
+
from hubspot.crm.tickets import ApiException as TicketsApiException
|
| 29 |
+
|
| 30 |
+
from hubspot_utils import (
|
| 31 |
+
parse_ts, try_parse_int, deduplicate_by_key,
|
| 32 |
+
)
|
| 33 |
+
from supabase_utils import (
|
| 34 |
+
batched_insert, update_sync_metadata,
|
| 35 |
+
)
|
| 36 |
+
|
| 37 |
+
# -----------------------------------------------------------------------------
|
| 38 |
+
# Logging
|
| 39 |
+
# -----------------------------------------------------------------------------
|
| 40 |
+
logging.basicConfig(
    # One log file per calendar day; filemode="a" lets repeated runs on the
    # same day append to the same file. Assumes a logs/ directory exists.
    filename=f"logs/hubspot_tickets_pipeline_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
    filemode="a",
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)

# -----------------------------------------------------------------------------
# Environment
# -----------------------------------------------------------------------------
load_dotenv()
# HubSpot private-app token: used both by the SDK client and the raw search calls.
HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
# Optional bootstrap cursor if orchestrator doesn't provide one
BOOTSTRAP_SINCE_MS_ENV = os.getenv("HUBSPOT_TICKETS_SINCE_MS")

# Fail fast at import time: the pipeline cannot do anything without credentials.
if not HUBSPOT_TOKEN:
    raise RuntimeError("HUBSPOT_TOKEN is not set")
if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
    raise RuntimeError("Supabase env vars are not set")

# Module-level clients shared by every function below.
hubspot_client = hubspot.Client.create(access_token=HUBSPOT_TOKEN)
supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
|
| 64 |
+
|
| 65 |
+
# -----------------------------------------------------------------------------
|
| 66 |
+
# Config
|
| 67 |
+
# -----------------------------------------------------------------------------
|
| 68 |
+
# Custom object association (CLI)
# HubSpot may key custom-object associations by the numeric object-type ID or
# by the portal-specific name key; read_tickets_by_ids checks both.
CLI_ASSOC_TYPE_ID = "2-25629083"
CLI_ASSOC_FALLBACK_KEY = "p8600202_cli_s"

# Ticket properties requested from HubSpot on every get_by_id read.
TICKET_PROPERTIES = [
    "closed_date",
    "content",
    "ticket_type",
    "createdate",
    "hs_created_by_user_id",
    "hs_lastmodifieddate",
    "hs_object_id",
    "hs_pipeline",
    "hs_pipeline_stage",
    "hs_ticket_priority",
    "subject",
    "hubspot_owner_id",
    "source_type",
]
|
| 87 |
+
|
| 88 |
+
# -----------------------------------------------------------------------------
|
| 89 |
+
# Time helpers
|
| 90 |
+
# -----------------------------------------------------------------------------
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
|
| 94 |
+
if dt.tzinfo is None:
|
| 95 |
+
dt = dt.replace(tzinfo=datetime.timezone.utc)
|
| 96 |
+
return dt.astimezone(datetime.timezone.utc)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Return *dt* converted to UTC with the time-of-day zeroed (UTC midnight)."""
    # Naive datetimes are treated as already being in UTC.
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=datetime.timezone.utc)
    utc_day = dt.astimezone(datetime.timezone.utc).date()
    midnight = datetime.time(0, 0, 0, 0, tzinfo=datetime.timezone.utc)
    return datetime.datetime.combine(utc_day, midnight)
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
|
| 105 |
+
if value.endswith("Z"):
|
| 106 |
+
value = value[:-1] + "+00:00"
|
| 107 |
+
dt = datetime.datetime.fromisoformat(value)
|
| 108 |
+
return _ensure_utc(dt)
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
    """
    Convert an ISO-8601 string or a datetime into integer milliseconds
    since the Unix epoch (UTC). Naive datetimes are assumed to be UTC.
    """
    if isinstance(dt_or_str, str):
        # Accept a trailing 'Z' by rewriting it as an explicit +00:00 offset.
        text = dt_or_str[:-1] + "+00:00" if dt_or_str.endswith("Z") else dt_or_str
        dt = datetime.datetime.fromisoformat(text)
    elif isinstance(dt_or_str, datetime.datetime):
        dt = dt_or_str
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=datetime.timezone.utc)
    return int(dt.astimezone(datetime.timezone.utc).timestamp() * 1000)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def parse_any_ts_ms(value: Optional[Union[str, int, float]]) -> Optional[int]:
    """
    Normalize a timestamp into milliseconds since the Unix epoch.

    Accepts:
      - ms-epoch as str/int/float
      - seconds-epoch as str/int/float (auto *1000)
      - ISO-8601 string
    Returns ms since epoch, or None when the value cannot be parsed.
    """
    if value is None:
        return None

    text = str(value)
    # Numeric path. BUGFIX: the original used int(str(value)) only, so any
    # float (e.g. 1700000000000.0) raised ValueError and fell through to the
    # ISO parser, which also failed — floats always came back None despite
    # the documented contract. Exact int parsing is tried first so large
    # integer strings keep full precision; float() is only a fallback.
    try:
        v: Optional[int] = int(text)
    except ValueError:
        try:
            v = int(float(text))
        except (ValueError, OverflowError):  # OverflowError: float("inf")
            v = None
    if v is not None:
        if v < 10_000_000_000:  # heuristic: epoch seconds -> ms
            v *= 1000
        return v

    # ISO-8601 fallback.
    try:
        return to_epoch_ms(text)
    except Exception:
        logging.warning("Could not parse timestamp value=%r", value)
        return None
|
| 145 |
+
|
| 146 |
+
# -----------------------------------------------------------------------------
|
| 147 |
+
# Pipeline & stage mappings
|
| 148 |
+
# -----------------------------------------------------------------------------
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def get_pipeline_and_stage_mappings() -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Fetch ticket pipelines from HubSpot.

    :return: ({pipeline_id: label}, {stage_id: label}); two empty dicts if
             the API call fails (the error is logged, never raised).
    """
    pipelines: Dict[str, str] = {}
    stages: Dict[str, str] = {}
    try:
        response = hubspot_client.crm.pipelines.pipelines_api.get_all(
            object_type="tickets")
        for pipeline in response.results:
            pipelines[pipeline.id] = pipeline.label
            stages.update({stage.id: stage.label for stage in pipeline.stages})
        return pipelines, stages
    except Exception as e:
        logging.error("Failed to fetch pipeline/stage mappings: %s", e)
        return {}, {}
|
| 165 |
+
|
| 166 |
+
# -----------------------------------------------------------------------------
|
| 167 |
+
# Search IDs (ts > since_ms) with property fallback
|
| 168 |
+
# -----------------------------------------------------------------------------
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
def _search_ticket_ids_from(since_ms: int, prop: str) -> List[str]:
    """
    Search tickets where {prop} > since_ms (strictly greater).
    Sort ascending so we can advance cursor monotonically.

    :param since_ms: epoch-millisecond cursor; only tickets strictly after it match
    :param prop: HubSpot datetime property to filter and sort on (e.g. "createdate")
    :return: list of ticket ID strings, in ascending {prop} order
    :raises httpx.HTTPStatusError: on any 4xx/5xx response from the search API
    """
    url = "https://api.hubapi.com/crm/v3/objects/tickets/search"
    headers = {
        "Authorization": f"Bearer {HUBSPOT_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    payload = {
        "filterGroups": [{
            "filters": [
                {"propertyName": prop, "operator": "GT",
                 "value": str(since_ms)},
            ]
        }],
        "limit": 100,  # HubSpot search page size
        "sorts": [{"propertyName": prop, "direction": "ASCENDING"}],
    }

    ids: List[str] = []
    after: Optional[str] = None  # HubSpot pagination cursor
    with httpx.Client(timeout=30.0) as client:
        while True:
            # Shallow copy so the pagination cursor never mutates the template payload.
            body = dict(payload)
            if after:
                body["after"] = after

            resp = client.post(url, headers=headers, json=body)
            if resp.status_code >= 400:
                # Log the structured error body when it is JSON, the raw text otherwise,
                # then surface the failure to the caller via raise_for_status().
                try:
                    logging.error(
                        "Ticket search error for prop '%s': %s", prop, resp.json())
                except Exception:
                    logging.error(
                        "Ticket search error for prop '%s': %s", prop, resp.text)
                resp.raise_for_status()

            data = resp.json()
            # "or []" guards against an explicit null "results" field.
            ids.extend([obj["id"] for obj in data.get("results", []) or []])

            after = (data.get("paging") or {}).get("next", {}).get("after")
            if not after:
                break
            time.sleep(0.1)  # light throttle between pages to respect rate limits

    return ids
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def search_ticket_ids_after_ms(since_ms: int) -> Tuple[List[str], str]:
    """
    Search ticket IDs where {prop} > since_ms (strictly greater),
    sorted ASC so the max timestamp at the end is monotonic.

    Tries each candidate cursor property in order and returns on the first
    success; re-raises the last HTTP error if every candidate fails.

    :return: (ticket_ids, prop_used)
    """
    candidate_props = ["createdate"]
    last_err: Optional[httpx.HTTPStatusError] = None

    for candidate in candidate_props:
        try:
            found = _search_ticket_ids_from(since_ms, candidate)
        except httpx.HTTPStatusError as exc:
            last_err = exc
            continue
        logging.info(
            "Ticket search with '%s' returned %d IDs.", candidate, len(found))
        return found, candidate

    if last_err is not None:
        raise last_err
    return [], "createdate"
|
| 245 |
+
|
| 246 |
+
# -----------------------------------------------------------------------------
|
| 247 |
+
# Read-by-ID (with associations)
|
| 248 |
+
# -----------------------------------------------------------------------------
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def read_tickets_by_ids(
    ticket_ids: List[str],
    cursor_prop: str,
) -> Tuple[List[Dict], List[Dict], List[Dict], List[Dict], Optional[int]]:
    """
    Read tickets by ID with properties and associations (CLI/deals/contacts).

    Per-ticket read errors are logged and skipped; the remaining tickets are
    still returned.

    :param ticket_ids: HubSpot ticket IDs to fetch one-by-one
    :param cursor_prop: property name whose maximum value is tracked for the cursor
    :return: tickets, ticket_cli_links, ticket_deal_links, ticket_contact_links,
             max_ts_ms_for_cursor_prop (None when no ticket had a parseable value)
    """
    if not ticket_ids:
        return [], [], [], [], None

    tickets: List[Dict] = []
    ticket_cli_links: List[Dict] = []
    ticket_deal_links: List[Dict] = []
    ticket_contact_links: List[Dict] = []

    # Associations requested per ticket: the CLI custom object plus deals/contacts.
    assoc_types = [CLI_ASSOC_TYPE_ID, "deals", "contacts"]
    # Fetched once up-front; empty dicts on failure, which yields "" labels below.
    pipeline_map, stage_map = get_pipeline_and_stage_mappings()

    max_ts_ms: Optional[int] = None

    for i, tid in enumerate(ticket_ids, start=1):
        try:
            record = hubspot_client.crm.tickets.basic_api.get_by_id(
                tid, properties=TICKET_PROPERTIES, associations=assoc_types, archived=False
            )
            p = record.properties or {}

            # Track max timestamp based on cursor_prop we searched on
            cursor_val = p.get(cursor_prop)
            ts_ms = parse_any_ts_ms(cursor_val)
            if ts_ms is not None and (max_ts_ms is None or ts_ms > max_ts_ms):
                max_ts_ms = ts_ms

            # Flatten the HubSpot record into the hubspot_tickets row shape.
            tickets.append({
                "ticket_id": try_parse_int(record.id),
                "subject": p.get("subject"),
                "content": p.get("content"),
                "ticket_type": p.get("ticket_type"),
                "closed_date": parse_ts(p.get("closed_date")),
                "hubspot_modified_date": parse_ts(p.get("hs_lastmodifieddate")),
                "pipeline_id": try_parse_int(p.get("hs_pipeline")),
                # Labels come from the mapping fetched above; unknown ids -> "".
                "pipeline_label": pipeline_map.get(p.get("hs_pipeline"), ""),
                "ticket_status_id": try_parse_int(p.get("hs_pipeline_stage")),
                "ticket_status_label": stage_map.get(p.get("hs_pipeline_stage"), ""),
                "ticket_priority": p.get("hs_ticket_priority"),
                "source_type": p.get("source_type"),
                "hubspot_owner_id": try_parse_int(p.get("hubspot_owner_id")),
                "hubspot_created_at": parse_ts(p.get("createdate")),
                "hubspot_created_by": try_parse_int(p.get("hs_created_by_user_id")),
            })

            # Associations
            assoc = record.associations or {}

            # CLI (custom object) key may be numeric ID or the name key
            cli_bucket = None
            if assoc.get(CLI_ASSOC_TYPE_ID):
                cli_bucket = assoc[CLI_ASSOC_TYPE_ID]
            elif assoc.get(CLI_ASSOC_FALLBACK_KEY):
                cli_bucket = assoc[CLI_ASSOC_FALLBACK_KEY]

            # isdigit() filters out non-numeric association ids before int-parsing.
            if cli_bucket and getattr(cli_bucket, "results", None):
                for a in cli_bucket.results:
                    if a.id and a.id.isdigit():
                        ticket_cli_links.append({
                            "ticket_id": try_parse_int(record.id),
                            "cli_id": try_parse_int(a.id),
                        })

            if assoc.get("deals") and getattr(assoc["deals"], "results", None):
                for a in assoc["deals"].results:
                    if a.id and a.id.isdigit():
                        ticket_deal_links.append({
                            "ticket_id": try_parse_int(record.id),
                            "deal_id": try_parse_int(a.id),
                        })

            if assoc.get("contacts") and getattr(assoc["contacts"], "results", None):
                for a in assoc["contacts"].results:
                    if a.id and a.id.isdigit():
                        ticket_contact_links.append({
                            "ticket_id": try_parse_int(record.id),
                            "contact_id": try_parse_int(a.id),
                        })

            # Progress heartbeat for long runs.
            if i % 200 == 0:
                logging.info("Read %d tickets...", i)

            # Light throttle between per-ticket reads to respect rate limits.
            time.sleep(0.05)

        # Failed tickets are dropped from this batch; they are NOT retried here.
        except httpx.HTTPStatusError as e:
            logging.error("HTTP error reading ticket %s: %s", tid, e)
        except (TicketsApiException, httpx.HTTPError) as e:
            logging.error("Error reading ticket %s: %s", tid, e)

    return tickets, ticket_cli_links, ticket_deal_links, ticket_contact_links, max_ts_ms
|
| 348 |
+
|
| 349 |
+
# -----------------------------------------------------------------------------
|
| 350 |
+
# Upsert
|
| 351 |
+
# -----------------------------------------------------------------------------
|
| 352 |
+
|
| 353 |
+
|
| 354 |
+
def upsert_tickets(
    tickets: List[Dict],
    ticket_cli_links: List[Dict],
    ticket_deal_links: List[Dict],
    ticket_contact_links: List[Dict],
) -> None:
    """
    Deduplicate each row set and upsert it into Supabase.

    Tickets conflict on ticket_id; each association table conflicts on its
    (ticket_id, <other>_id) pair. Empty inputs are skipped silently.
    """
    if tickets:
        unique_tickets = deduplicate_by_key(tickets, key="ticket_id")
        batched_insert(
            supabase_client, "hubspot_tickets", unique_tickets,
            batch_size=1000, on_conflict=["ticket_id"]
        )
        print(f"Upserted {len(unique_tickets)} tickets.")

    # (rows, other-id column, target table, label used in the progress message)
    link_specs = [
        (ticket_cli_links, "cli_id", "hubspot_ticket_clis", "ticket-cli"),
        (ticket_deal_links, "deal_id", "hubspot_ticket_deals", "ticket-deal"),
        (ticket_contact_links, "contact_id", "hubspot_ticket_contacts", "ticket-contact"),
    ]
    for rows, id_col, table, label in link_specs:
        if not rows:
            continue
        unique_rows = deduplicate_by_key(rows, key=("ticket_id", id_col))
        batched_insert(
            supabase_client, table, unique_rows,
            batch_size=1000, on_conflict=["ticket_id", id_col]
        )
        print(f"Upserted {len(unique_rows)} {label} associations.")
|
| 395 |
+
|
| 396 |
+
# -----------------------------------------------------------------------------
|
| 397 |
+
# Main (timestamp cursor)
|
| 398 |
+
# -----------------------------------------------------------------------------
|
| 399 |
+
|
| 400 |
+
|
| 401 |
+
def main(since_ms: Optional[int] = None):
    """
    Orchestrates:
      1) Search ticket IDs with <cursor_prop> > since_ms (property fallback)
      2) Read full tickets with associations (track max timestamp for <cursor_prop>)
      3) Upsert into Supabase
      4) Update sync metadata with the run's last_sync_time

    :param since_ms: epoch-ms cursor; falls back to the HUBSPOT_TICKETS_SINCE_MS
                     env var, then to today's UTC midnight for a first run.
    :raises RuntimeError: if HUBSPOT_TICKETS_SINCE_MS is set but not an integer
    """
    # Resolve since_ms: explicit argument > env bootstrap > today's UTC midnight.
    if since_ms is None and BOOTSTRAP_SINCE_MS_ENV:
        try:
            since_ms = int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            raise RuntimeError(
                "HUBSPOT_TICKETS_SINCE_MS must be an integer (ms) if set.")

    if since_ms is None:
        # Default: today@00:00:00Z for first run
        today0 = floor_to_utc_midnight(
            datetime.datetime.now(datetime.timezone.utc))
        since_ms = to_epoch_ms(today0)

    print(f"Searching tickets with timestamp > {since_ms} ...")
    ids, cursor_prop = search_ticket_ids_after_ms(since_ms)
    print(f"Search property: {cursor_prop}. Found {len(ids)} ticket IDs.")

    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()

    if not ids:
        print("No tickets beyond the cursor. Updating sync metadata and exiting.")
        # Only record last_sync_time (updated_at handled by helper)
        update_sync_metadata(supabase_client, "hubspot_tickets", now_iso)
        return

    print("Reading tickets (with associations)...")
    tickets, ticket_cli_links, ticket_deal_links, ticket_contact_links, max_ts_ms = read_tickets_by_ids(
        ids, cursor_prop)

    print("Upserting into Supabase...")
    upsert_tickets(tickets, ticket_cli_links,
                   ticket_deal_links, ticket_contact_links)

    # Advance cursor to max timestamp we actually ingested for the chosen property
    new_cursor_ms = max_ts_ms if max_ts_ms is not None else since_ms

    # Record last_sync_time only (updated_at handled by helper)
    # NOTE(review): new_cursor_ms is computed and printed but not persisted here —
    # only now_iso is stored; confirm the orchestrator derives the ms cursor elsewhere.
    update_sync_metadata(supabase_client, "hubspot_tickets", now_iso)

    print(
        f"Tickets sync complete. Advanced cursor to {new_cursor_ms} using prop '{cursor_prop}'.")
|
| 451 |
+
|
| 452 |
+
# -----------------------------------------------------------------------------
|
| 453 |
+
# CLI
|
| 454 |
+
# -----------------------------------------------------------------------------
|
| 455 |
+
|
| 456 |
+
|
| 457 |
+
def _parse_cli_arg_to_ms(arg: str) -> int:
|
| 458 |
+
"""
|
| 459 |
+
Accept:
|
| 460 |
+
- integer epoch ms
|
| 461 |
+
- ISO-8601 (Z or offset)
|
| 462 |
+
- YYYY-MM-DD (floors to 00:00Z)
|
| 463 |
+
"""
|
| 464 |
+
# epoch ms or seconds
|
| 465 |
+
if re.fullmatch(r"\d{10,13}", arg):
|
| 466 |
+
v = int(arg)
|
| 467 |
+
if v < 10_000_000_000_000: # seconds -> ms
|
| 468 |
+
v *= 1000
|
| 469 |
+
return v
|
| 470 |
+
|
| 471 |
+
# YYYY-MM-DD
|
| 472 |
+
if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
|
| 473 |
+
d = datetime.datetime.strptime(
|
| 474 |
+
arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
|
| 475 |
+
return to_epoch_ms(floor_to_utc_midnight(d))
|
| 476 |
+
|
| 477 |
+
# ISO-8601
|
| 478 |
+
return to_epoch_ms(arg)
|
| 479 |
+
|
| 480 |
+
|
| 481 |
+
if __name__ == "__main__":
    import sys
    # Optional single CLI argument: the "since" cursor as epoch ms,
    # ISO-8601, or YYYY-MM-DD (parsed by _parse_cli_arg_to_ms).
    if len(sys.argv) > 1:
        try:
            main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
        except Exception as e:
            # Any parse/run failure surfaces as a non-zero exit for the caller.
            print(
                f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
            sys.exit(1)
    else:
        # No argument: main() resolves its own default cursor.
        main()
|
python/hubspot_utils.py
ADDED
|
@@ -0,0 +1,946 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
This script contains utility functions for working with HubSpot data.
|
| 3 |
+
"""
|
| 4 |
+
import time
|
| 5 |
+
import datetime
|
| 6 |
+
import re
|
| 7 |
+
import httpx
|
| 8 |
+
import html
|
| 9 |
+
import requests
|
| 10 |
+
import logging
|
| 11 |
+
from typing import Dict, List, Optional, Tuple
|
| 12 |
+
from collections import defaultdict
|
| 13 |
+
import pandas as pd
|
| 14 |
+
from hubspot.crm.objects import ApiException as ObjectApiException
|
| 15 |
+
from hubspot.crm.contacts.exceptions import ApiException as ContactsApiException
|
| 16 |
+
from hubspot.crm.companies.exceptions import ApiException as CompaniesApiException
|
| 17 |
+
from hubspot.crm.contacts import (
|
| 18 |
+
PublicObjectSearchRequest, Filter, FilterGroup, BatchInputSimplePublicObjectId
|
| 19 |
+
)
|
| 20 |
+
from hubspot.crm.companies import (
|
| 21 |
+
PublicObjectSearchRequest as CompanySearchRequest,
|
| 22 |
+
Filter as CompanyFilter,
|
| 23 |
+
FilterGroup as CompanyFilterGroup
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
_MISSING = {None, ""}
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def serialize(obj):
    """
    Recursively convert an arbitrary object into JSON-friendly primitives.

    Handled, in priority order:
      1. objects exposing ``to_dict()`` (e.g. HubSpot API models)
      2. lists (element-wise)
      3. dicts (value-wise, keys preserved)
      4. ``datetime.datetime`` (rendered as ISO-8601 text)
      5. objects with a ``__dict__`` (``_``-prefixed attributes dropped)

    Anything else falls back to ``str(obj)``.

    :param obj: The object to serialize
    :return: A plain, nested structure of dicts/lists/strings
    """
    if hasattr(obj, "to_dict"):
        return serialize(obj.to_dict())
    if isinstance(obj, list):
        return [serialize(element) for element in obj]
    if isinstance(obj, dict):
        return {name: serialize(item) for name, item in obj.items()}
    if isinstance(obj, datetime.datetime):
        return obj.isoformat()
    if hasattr(obj, "__dict__"):
        return {
            name: serialize(item)
            for name, item in obj.__dict__.items()
            if not name.startswith("_")
        }
    return str(obj)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def try_parse_int(value):
    """
    Best-effort conversion of *value* to ``int``.

    :param value: Any value, possibly ``None``.
    :return: ``int(value)`` on success, otherwise ``None``.
    """
    if value is None:
        return None
    try:
        parsed = int(value)
    except (ValueError, TypeError):
        return None
    return parsed
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def try_parse_float(value):
    """
    Best-effort conversion of *value* to ``float``.

    :param value: Any value, possibly ``None``.
    :return: ``float(value)`` on success, otherwise ``None``.
    """
    if value is None:
        return None
    try:
        parsed = float(value)
    except (ValueError, TypeError):
        return None
    return parsed
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def parse_ts(ts):
    """
    Normalize a HubSpot timestamp into an ISO 8601 string.

    Accepts either an epoch value in *milliseconds* (int/float, or a
    digit-only string) or an ISO 8601 string (a trailing ``Z`` is treated as
    ``+00:00``). Falsy input yields ``None``; unparseable input is logged and
    yields ``None``.

    :param ts: Timestamp to normalize.
    :return: ISO 8601 string (UTC-aware for epoch input), or ``None``.
    """
    if not ts:
        return None
    try:
        raw = str(ts)
        if isinstance(ts, (int, float)) or raw.isdigit():
            # Epoch milliseconds -> aware UTC datetime.
            parsed = datetime.datetime.fromtimestamp(
                int(ts) / 1000.0, tz=datetime.timezone.utc)
        else:
            parsed = datetime.datetime.fromisoformat(
                raw.replace("Z", "+00:00"))
        return parsed.isoformat()
    except (ValueError, TypeError) as exc:
        logging.error("Failed to parse timestamp '%s': %s", ts, str(exc))
        return None
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def deduplicate_by_key(rows, key):
    """
    Remove duplicate dicts from *rows* based on one key or a tuple of keys.

    Rows whose deduplication value is missing (``None`` or ``""``) are always
    kept, since there is nothing meaningful to compare them by.

    BUG FIX: the previous check used plain truthiness (``if val and ...``),
    so valid falsy keys such as ``0`` were never deduplicated. The missing
    test is now explicit, so ``0`` (and other falsy-but-real values) dedupe
    correctly while ``None``/``""`` rows are still all retained.

    :param rows: List of dictionaries to deduplicate.
    :param key: A key name, or a tuple of key names whose values are combined
                into a tuple. With a tuple, each key must exist in every row
                (``row[k]`` is used); with a string, ``row.get(key)`` is used.
    :return: The unique rows, in first-seen order.
    """
    seen = set()
    unique_rows = []
    for row in rows:
        if isinstance(key, tuple):
            val = tuple(row[k] for k in key)
        else:
            val = row.get(key)
        # Rows without a usable key value are kept as-is.
        if val is None or val == "":
            unique_rows.append(row)
            continue
        if val not in seen:
            seen.add(val)
            unique_rows.append(row)
    return unique_rows
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def fix_surrogates(text: str) -> str:
    """
    Repair surrogate code points in a string.

    Email bodies occasionally carry surrogate characters that are invalid in
    UTF-8; round-tripping through UTF-16 with ``surrogatepass`` re-pairs them
    into valid characters.

    :param text: The string to fix.
    :return: A string safe to encode as UTF-8.
    """
    as_utf16 = text.encode("utf-16", "surrogatepass")
    return as_utf16.decode("utf-16")
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def strip_footer(text: str) -> str:
    """
    Truncate *text* at the first occurrence of a common email footer marker
    (confidentiality notices, company registration lines, and similar
    boilerplate).

    :param text: Email body text.
    :return: The text before the first matching marker, stripped of
             surrounding whitespace; the original text if nothing matched.
    """
    footer_cutoff_patterns = [
        # BUG FIX: this was r"...\\d+..." — in a raw string that is a literal
        # backslash followed by "d", which can never match real digits.
        # \d+ now matches an actual registration number.
        r"(?i)Registration Number \d+[A-Z]?",
        r"(?i)this e[- ]?mail message contains confidential information",
        r"(?i)this email and any attachments.*?confidential",
        r"(?i)if you are not the intended recipient",
        r"(?i)please notify us immediately by return e[- ]?mail",
        r"(?i)no liability is accepted for any damage",
        r"(?i)this message is intended only for the use of the individual",
        r"(?i)confidentiality notice",
        r"(?i)please consider the environment before printing",
        r"(?i)registered in England and Wales",
        r"(?i)the views expressed in this email are those of the sender",
        r"(?i)accepts no liability",
        r"(?i)has taken steps to ensure",
    ]

    for pattern in footer_cutoff_patterns:
        match = re.search(pattern, text)
        if match:
            return text[:match.start()].strip()
    return text
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def clean_text(raw_text: str) -> str:
    """
    Clean an email body by removing HTML, unescaping entities, and trimming
    quoted replies, forwarded blocks, footers, and signatures.

    Processing order: surrogate repair -> line-ending normalization ->
    HTML <br>/tag removal -> entity unescaping -> cut at the earliest
    quoted/forwarded marker -> per-line separator/footer filtering (stopping
    at a ``--`` signature delimiter) -> blank-line collapsing -> final
    footer strip.

    :param raw_text: Raw email body (may be HTML; ``None`` is treated as "").
    :return: Cleaned plain-text body.
    """
    # Defensive defaults
    if raw_text is None:
        raw_text = ""

    # Optional helper fallbacks
    try:
        fixed = fix_surrogates(raw_text)
    except NameError:
        # If fix_surrogates isn't available, just pass through
        fixed = raw_text

    # Start from the corrected source
    text = fixed

    # Normalize Windows/Mac line endings early (before HTML handling)
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Normalize common HTML line breaks to real newlines before removing tags
    text = re.sub(r"(?i)<br\s*/?>", "\n", text)

    # Strip HTML tags
    text = re.sub(r"<[^>]+>", "", text)

    # Unescape HTML entities
    text = html.unescape(text)

    # --- Trim quoted / forwarded blocks heuristics ---
    # "On <date>, <person> wrote:" reply headers
    on_wrote_pat = re.compile(r"(?im)^[>\s]*on\s+.+?wrote:")
    # Forwarded-mail header lines (From:/Sent:/Date:/Subject:/To:)
    fwd_hdr_pat = re.compile(
        r"(?im)^[>\s]*(?:from|sent|date|subject|to)\s*:\s.+$")
    # Explicit "original message" / "begin forwarded message" separators
    sep_pat = re.compile(
        r"(?im)^(?:[>\s]*-----\s*original message\s*-----|[>\s]*begin forwarded message)")

    # Cut at the EARLIEST of the three markers, if any matched.
    cut_idx = None
    for m in (on_wrote_pat.search(text), fwd_hdr_pat.search(text), sep_pat.search(text)):
        if m:
            idx = m.start()
            cut_idx = idx if cut_idx is None or idx < cut_idx else cut_idx

    if cut_idx is not None:
        text = text[:cut_idx].rstrip("\n")

    # --- Line-by-line cleanup, footer & signature handling ---
    lines = text.split("\n")
    cleaned = []

    footer_regex = re.compile(
        r"(?i)\b(confidential|privacy policy|unsubscribe|follow us|visit our website|"
        r"please consider the environment|registered office|copyright|"
        r"this e-?mail and any attachments|do not print this email)\b"
    )
    sig_sep = re.compile(r"^--\s?$")  # standard sig delimiter

    for line in lines:
        stripped = line.strip()

        # Skip pure separators
        if re.match(r"^[-=_*]{3,}\s*$", stripped):
            continue

        # Skip common footer lines
        if footer_regex.search(stripped):
            continue

        # Stop including anything after a signature separator
        if sig_sep.match(stripped):
            break

        # Keep empty lines (collapse later)
        if stripped == "":
            cleaned.append("")
            continue

        cleaned.append(stripped)

    out = "\n".join(cleaned)
    # Collapse runs of 3+ newlines into a single blank line.
    out = re.sub(r"\n{3,}", "\n\n", out).strip()

    # Optional final footer stripping
    try:
        out = strip_footer(out)
    except NameError:
        pass

    return out
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
def get_search_config(hubspot_client, object_type):
    """
    Bundle the filter/search classes and search API client for an object type.

    :param hubspot_client: Authenticated HubSpot client.
    :param object_type: Either ``"contacts"`` or ``"companies"``.
    :return: Dict with ``FilterCls``, ``FilterGroupCls``, ``SearchRequestCls``,
             ``search_api`` and ``modified_prop`` entries. Both object types
             filter on the ``createdate`` property.
    :raises ValueError: For any other object type.
    """
    if object_type == "contacts":
        filter_cls = Filter
        group_cls = FilterGroup
        request_cls = PublicObjectSearchRequest
        search_api = hubspot_client.crm.contacts.search_api
    elif object_type == "companies":
        filter_cls = CompanyFilter
        group_cls = CompanyFilterGroup
        request_cls = CompanySearchRequest
        search_api = hubspot_client.crm.companies.search_api
    else:
        raise ValueError(f"Unsupported object_type '{object_type}'")

    return {
        "FilterCls": filter_cls,
        "FilterGroupCls": group_cls,
        "SearchRequestCls": request_cls,
        "search_api": search_api,
        "modified_prop": "createdate",
    }
|
| 313 |
+
|
| 314 |
+
|
| 315 |
+
def get_property_label_mapping(hubspot_client, object_type: str, property_name: str) -> dict:
    """
    Map a HubSpot property's internal option values to human-readable labels.

    Failures (network errors, unknown property, malformed response) are
    logged at warning level and yield an empty mapping rather than raising,
    so callers can fall back to the raw internal values.

    :param hubspot_client: Authenticated HubSpot client.
    :param object_type: ``"contacts"`` or ``"companies"``.
    :param property_name: Internal property name, e.g. ``"industry"``.
    :return: ``{internal_value: label}``; empty dict on any failure.
    """
    try:
        prop = hubspot_client.crm.properties.core_api.get_by_name(
            object_type, property_name)
        return {option.value: option.label for option in prop.options}
    except Exception as exc:  # deliberate best-effort: never propagate
        logging.warning("Failed to fetch mapping for %s: %s",
                        property_name, str(exc))
        return {}
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
def build_filter_request(since, after):
    """
    Build a contact search request for records created at or after *since*.

    (Despite living next to the company helpers, the property list here is
    contact-oriented: names, email, phone, associated company.)

    :param since: Aware datetime lower bound applied to ``createdate``.
    :param after: Pagination cursor from a previous search page, or ``None``.
    :return: A ``PublicObjectSearchRequest`` with a 100-record page size.
    """
    created_after_ms = int(since.timestamp() * 1000)
    created_filter = Filter(
        property_name="createdate",
        operator="GTE",
        value=created_after_ms,
    )
    wanted_properties = [
        "full_name",
        "firstname",
        "lastname",
        "email",
        "phone",
        "createdate",
        "lastmodifieddate",
        "lastactivitydate",
        "associatedcompanyid",
    ]
    return PublicObjectSearchRequest(
        filter_groups=[FilterGroup(filters=[created_filter])],
        properties=wanted_properties,
        limit=100,
        after=after,
    )
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
def fetch_custom_object(hubspot_client, object_name="tickets", properties=None,
                        associations=None):
    """
    Fetch all records of a HubSpot object type as a list of dictionaries,
    following the ``after`` pagination cursor until exhausted.

    Each returned dict carries the record id under the key ``"billing_id"``
    (the key name presumably reflects this helper's original billing use —
    TODO confirm before renaming) plus the record's properties, minus
    ``hs_object_id``. ``hs_created_by_user_id`` is cast to int and
    ``hs_createdate``/``hs_lastmodifieddate`` are normalized via ``parse_ts``
    when present. Association ids are attached under ``"associations"`` when
    requested.

    :param hubspot_client: Authenticated HubSpot client.
    :param object_name: Object type to fetch (default ``"tickets"``).
    :param properties: Property names to include; ``None`` keeps everything
                       the API returns.
    :param associations: Association types to request; ``None`` requests none.
    :return: List of record dicts.
    :raises ObjectApiException: Never propagated — API errors are logged and
                                pagination stops, returning what was fetched.
    """
    all_objects = []
    after = None  # pagination cursor; None requests the first page

    while True:
        try:
            response = hubspot_client.crm.objects.basic_api.get_page(
                object_type=object_name,
                properties=properties,
                limit=100,
                after=after,
                archived=False,
                associations=associations
            )

            if not response.results:
                break

            for record in response.results:
                props = record.properties
                object_id = record.id

                # exclude hs_object_id and optionally filter properties
                if properties:
                    filtered_props = {k: props.get(
                        k) for k in properties if k != "hs_object_id"}
                else:
                    filtered_props = {
                        k: v for k, v in props.items() if k != "hs_object_id"}

                object_dict = {"billing_id": object_id, **filtered_props}

                # cast ints if present
                for key in ["hs_created_by_user_id"]:
                    if key in object_dict:
                        object_dict[key] = try_parse_int(object_dict[key])

                # parse timestamps if present
                for key in ["hs_createdate", "hs_lastmodifieddate"]:
                    if key in object_dict:
                        object_dict[key] = parse_ts(object_dict[key])

                # include associations if requested
                if associations and record.associations:
                    assoc_data = {}
                    for assoc_type, assoc_records in record.associations.items():
                        assoc_data[assoc_type] = [
                            ar.id for ar in assoc_records.results]
                    object_dict["associations"] = assoc_data

                all_objects.append(object_dict)

            if response.paging and response.paging.next:
                after = response.paging.next.after
            else:
                break

            # Light throttle between pages to stay under rate limits.
            time.sleep(0.1)

        except ObjectApiException as e:
            # Best-effort: log and return what we have so far.
            logging.error("Exception when fetching %s: %s", object_name, e)
            break

    return all_objects
|
| 440 |
+
|
| 441 |
+
|
| 442 |
+
def fetch_total_objects(
    hubspot_client,
    object_type: str,
    modified_after: Optional[datetime.datetime] = None,
    archived: bool = False
) -> int:
    """
    Count HubSpot objects of the given type.

    Archived records are counted by paginating the basic API (the search API
    does not cover them); live records use a single search-API call and read
    the reported ``total``, optionally filtered by ``createdate >=
    modified_after``.

    :param hubspot_client: Authenticated HubSpot client.
    :param object_type: ``"contacts"`` or ``"companies"``.
    :param modified_after: Lower bound applied to the object's create date
                           (only for the non-archived search path).
    :param archived: Whether to count archived records instead.
    :return: Total matching objects, or 0 when a handled error occurred.
    """
    try:
        if archived:
            total = 0
            after = None  # pagination cursor
            while True:
                if object_type == "contacts":
                    response = hubspot_client.crm.contacts.basic_api.get_page(
                        limit=100, archived=True, after=after
                    )
                elif object_type == "companies":
                    response = hubspot_client.crm.companies.basic_api.get_page(
                        limit=100, archived=True, after=after
                    )
                else:
                    raise ValueError(
                        f"Unsupported object_type '{object_type}' for archived=True")

                total += len(response.results)

                if response.paging and response.paging.next:
                    after = response.paging.next.after
                else:
                    break
                # Light throttle between pages.
                time.sleep(0.1)

            logging.info("Total %s (archived): %d", object_type, total)
            print(f"Total {object_type} (archived): {total}")
            return total

        # Non-archived path via search API
        config = get_search_config(hubspot_client, object_type)
        filters = []

        if modified_after:
            # Search API expects epoch milliseconds.
            value = int(modified_after.timestamp() * 1000)
            filters.append(config["FilterCls"](
                property_name=config["modified_prop"], operator="GTE", value=value
            ))

        # limit=1: we only need the response's `total`, not the records.
        request_body = config["SearchRequestCls"](
            filter_groups=[config["FilterGroupCls"](
                filters=filters)] if filters else None,
            properties=["id"],
            limit=1
        )
        response = config["search_api"].do_search(request_body)
        total = response.total

        logging.info("Total %s in HubSpot: %d", object_type, total)
        print(f"Total {object_type} in HubSpot: {total}")
        return total

    except (ContactsApiException, CompaniesApiException) as api_err:
        logging.error("API error occurred while fetching %s: %s",
                      object_type, str(api_err))
    except httpx.HTTPError as http_err:
        logging.error("HTTP error occurred while fetching %s: %s",
                      object_type, str(http_err))
    except ValueError as val_err:
        logging.error("ValueError while fetching %s: %s",
                      object_type, str(val_err))
    except BaseException as critical_err:
        # Catch-all (including KeyboardInterrupt/SystemExit) only to log at
        # critical level — the exception is re-raised, not swallowed.
        logging.critical("Unexpected error fetching %s: %s",
                         object_type, str(critical_err))
        raise

    return 0
|
| 525 |
+
|
| 526 |
+
|
| 527 |
+
def _coalesce(*vals):
|
| 528 |
+
"""
|
| 529 |
+
Returns the first non-missing value from the given arguments.
|
| 530 |
+
|
| 531 |
+
A value is considered "missing" if it is None or an empty string.
|
| 532 |
+
|
| 533 |
+
:param vals: Variable number of arguments to coalesce.
|
| 534 |
+
:return: The first non-missing value, or None if all values are missing.
|
| 535 |
+
"""
|
| 536 |
+
for v in vals:
|
| 537 |
+
if v not in _MISSING:
|
| 538 |
+
return v
|
| 539 |
+
return None
|
| 540 |
+
|
| 541 |
+
|
| 542 |
+
def parse_ts_dt(ts: Optional[str]) -> Optional[datetime.datetime]:
    """
    Parse an ISO 8601 timestamp string into an aware UTC datetime.

    A trailing ``Z`` is accepted as UTC. NOTE(review): naive inputs (no
    offset) are interpreted in the process's local timezone by
    ``astimezone`` — callers should pass offset-qualified strings. Falsy or
    unparseable input yields ``None``.

    BUG FIX: narrowed the former bare ``except Exception`` to
    ``(ValueError, TypeError)`` — the errors ``fromisoformat`` actually
    raises — so genuine programming errors are no longer silently swallowed.

    :param ts: Timestamp string (anything ``str()``-convertible).
    :return: UTC-normalized ``datetime``, or ``None``.
    """
    if not ts:
        return None
    try:
        parsed = datetime.datetime.fromisoformat(
            str(ts).replace("Z", "+00:00"))
    except (ValueError, TypeError):
        return None
    return parsed.astimezone(datetime.timezone.utc)
|
| 555 |
+
|
| 556 |
+
|
| 557 |
+
def to_epoch_ms_from_utc_iso(date_str: str) -> int:
    """
    Convert an ISO 8601 timestamp or bare ``YYYY-MM-DD`` date to epoch ms.

    Bare dates are pinned to midnight UTC. Full timestamps may carry an
    offset (``Z`` is accepted as UTC); NOTE(review): naive timestamps fall
    back to the local timezone via ``datetime.timestamp()`` — confirm
    callers always pass offsets.

    :param date_str: Date or datetime string; required.
    :return: Milliseconds since the Unix epoch.
    :raises ValueError: If *date_str* is empty/None or not parseable.
    """
    if not date_str:
        raise ValueError("date_str is required")

    looks_like_bare_date = (
        len(date_str) == 10 and date_str[4] == "-" and date_str[7] == "-"
    )
    if looks_like_bare_date:
        # YYYY-MM-DD -> midnight UTC
        parsed = datetime.datetime.strptime(date_str, "%Y-%m-%d")
        parsed = parsed.replace(tzinfo=datetime.timezone.utc)
    else:
        parsed = datetime.datetime.fromisoformat(
            date_str.replace("Z", "+00:00"))
    return int(parsed.timestamp() * 1000)
|
| 573 |
+
|
| 574 |
+
|
| 575 |
+
def request_with_retry(method: str,
                       url: str,
                       headers: Dict[str, str],
                       params: Dict[str, str],
                       initial_backoff: float = 1.0,
                       max_backoff: float = 16.0,
                       retries: int = 8) -> Dict:
    """
    Call a HubSpot REST endpoint with exponential-backoff retries.

    Retries — doubling the delay up to *max_backoff* — on rate limits and
    transient server errors (429, 500, 502, 503, 504) and on network
    connection/timeout errors. Any other HTTP error status is raised
    immediately via ``raise_for_status``. A successful response must decode
    as JSON; a non-JSON body is logged and re-raised.

    :param method: HTTP method (e.g. ``"GET"``, ``"POST"``).
    :param url: Absolute URL to call.
    :param headers: HTTP headers to send.
    :param params: Query-string parameters.
    :param initial_backoff: First retry delay in seconds.
    :param max_backoff: Upper bound for the retry delay in seconds.
    :param retries: Maximum number of attempts.
    :return: Decoded JSON body as a dict.
    :raises RuntimeError: When all *retries* attempts are exhausted.
    """
    backoff = initial_backoff
    for attempt in range(1, retries + 1):
        try:
            resp = requests.request(
                method, url, headers=headers, params=params, timeout=60)
            if resp.status_code < 400:
                try:
                    return resp.json()
                except Exception as e:
                    # A success status with a non-JSON body is unexpected:
                    # surface it rather than retrying.
                    logging.error(
                        "Failed to decode JSON response from %s: %s", url, e)
                    raise

            # HubSpot rate limits or transient server error
            if resp.status_code in (429, 500, 502, 503, 504):
                logging.warning(
                    "HubSpot API %s (%d). Retrying in %.1fs (attempt %d/%d)...",
                    url, resp.status_code, backoff, attempt, retries
                )
                time.sleep(backoff)
                backoff = min(max_backoff, backoff * 2)
                continue

            # Permanent error
            logging.error("HubSpot API error %s: %s",
                          resp.status_code, resp.text)
            resp.raise_for_status()

        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout) as e:
            # Transient network failures get the same backoff treatment.
            logging.warning(
                "Network error on attempt %d/%d (%s). Retrying in %.1fs...",
                attempt, retries, str(e), backoff
            )
            time.sleep(backoff)
            backoff = min(max_backoff, backoff * 2)
            continue

        except Exception as e:
            logging.error("Unexpected error during HubSpot request: %s", e)
            raise

    # If we exhausted retries
    raise RuntimeError(f"Failed after {retries} attempts calling {url}")
|
| 643 |
+
|
| 644 |
+
|
| 645 |
+
def page_account_activity(base_url: str,
                          token: str,
                          occurred_after_ms: int,
                          occurred_before_ms: Optional[int],
                          limit: int = 200,
                          max_pages: int = 1000) -> List[Dict]:
    """
    Fetch every account-activity event from *base_url* within a time window.

    Supports both the audit-log endpoint (``occurredAfter``/``occurredBefore``
    query params) and the security-activity endpoint
    (``fromTimestamp``/``toTimestamp``), paginating with the ``after``
    cursor until the API is exhausted or a safety limit is hit.

    :param base_url: Endpoint URL (audit-logs or security activity).
    :param token: HubSpot bearer token.
    :param occurred_after_ms: Window start, epoch milliseconds.
    :param occurred_before_ms: Window end, epoch milliseconds, or ``None``
                               for no upper bound.
    :param limit: Page size requested from the API (default 200).
    :param max_pages: Safety cap on pages fetched (default 1000).
    :return: A list of all fetched event dicts.
    """
    headers = {"Authorization": f"Bearer {token}",
               "Accept": "application/json"}

    # Choose query parameter names based on which endpoint we're paging.
    is_security = "activity/security" in base_url
    is_audit = "activity/audit-logs" in base_url

    params: Dict[str, str] = {"limit": str(limit)}
    if is_audit:
        params["occurredAfter"] = str(occurred_after_ms)
        if occurred_before_ms is not None:
            params["occurredBefore"] = str(occurred_before_ms)
    elif is_security:
        params["fromTimestamp"] = str(occurred_after_ms)
        if occurred_before_ms is not None:
            params["toTimestamp"] = str(occurred_before_ms)

    all_items: List[Dict] = []
    after: Optional[str] = None
    prev_after: Optional[str] = None
    pages = 0

    # seen on /security when exhausted
    TERMINAL_AFTER_VALUES = {"MC0w", "0-0", ""}

    while True:
        if after:
            params["after"] = after
        else:
            params.pop("after", None)

        data = request_with_retry("GET", base_url, headers, params)
        # Some endpoints return "results", others "events".
        results = data.get("results") or data.get("events") or []
        if not isinstance(results, list):
            logging.warning("Unexpected results shape at %s: %s",
                            base_url, type(results))
            results = []

        all_items.extend(results)

        paging = data.get("paging") or {}
        next_obj = paging.get("next") or {}
        next_after = next_obj.get("after")

        pages += 1
        logging.info("Fetched page %d from %s: %d items (total=%d). after=%s",
                     pages, base_url, len(results), len(all_items), str(next_after))

        if len(results) == 0:
            logging.info(
                "No results returned; stopping pagination for %s.", base_url)
            break

        if not next_after:
            logging.info(
                "No next cursor; stopping pagination for %s.", base_url)
            break

        if next_after in TERMINAL_AFTER_VALUES:
            logging.info(
                "Terminal 'after' (%s); stopping pagination for %s.", next_after, base_url)
            break

        # Defensive: a cursor that never advances would loop forever.
        if prev_after is not None and next_after == prev_after:
            logging.info(
                "Repeated 'after' cursor (%s); stopping pagination for %s.", next_after, base_url)
            break

        if pages >= max_pages:
            logging.warning(
                "Reached max_pages=%d for %s; stopping pagination.", max_pages, base_url)
            break

        prev_after, after = after, next_after

        # Light throttle between pages.
        time.sleep(0.1)

    return all_items
|
| 740 |
+
|
| 741 |
+
|
| 742 |
+
def safe_get_actor(ev: Dict) -> Dict:
    """
    Extract actor identity from an audit/security event, tolerating missing
    or malformed ``actingUser`` payloads (``None`` or not a dict).

    Top-level ``userId``/``userEmail`` take precedence over the nested
    actor's fields; the nested ``email`` key is a final fallback for the
    address.

    :param ev: Raw event payload.
    :return: ``{"userId": ..., "userEmail": ...}`` (values may be ``None``).
    """
    nested = ev.get("actingUser")
    if not isinstance(nested, dict):
        nested = {}
    user_id = ev.get("userId") or nested.get("userId")
    user_email = (ev.get("userEmail")
                  or nested.get("userEmail")
                  or nested.get("email"))
    return {"userId": user_id, "userEmail": user_email}
|
| 760 |
+
|
| 761 |
+
|
| 762 |
+
def build_login_index(login_events: List[Dict]) -> Dict[Tuple[Optional[str], Optional[str]], List[Dict]]:
    """
    Group login events by ``(userId, email)`` and sort each group by login
    time (UTC).

    NOTE: mutates each indexed event by attaching a parsed datetime under
    the private ``"_ts"`` key. Events whose ``loginAt`` cannot be parsed are
    skipped entirely.

    :param login_events: Raw login event dicts.
    :return: ``{(user_id_as_str_or_None, email): [events sorted by _ts]}``.
    """
    index = defaultdict(list)
    for event in login_events:
        when = parse_ts_dt(event.get("loginAt"))
        if when is None:
            continue
        event["_ts"] = when
        uid = event.get("userId")
        bucket_key = (str(uid) if uid is not None else None,
                      event.get("email"))
        index[bucket_key].append(event)
    for bucket in index.values():
        bucket.sort(key=lambda ev: ev["_ts"])
    return index
|
| 782 |
+
|
| 783 |
+
|
| 784 |
+
def build_security_index(security_events: List[Dict]) -> Dict[Tuple[Optional[str], Optional[str]], List[Dict]]:
    """
    Group security events by ``(userId, email)`` and sort each group by
    ``createdAt`` (UTC).

    NOTE: mutates each indexed event by attaching a parsed datetime under
    the private ``"_ts"`` key. The email fallback to the raw ``actingUser``
    value covers payloads where that field is a bare address rather than a
    dict (presumably — verify against real payloads). Events without a
    parseable ``createdAt`` are skipped.

    :param security_events: Raw security event dicts.
    :return: ``{(user_id_as_str_or_None, email): [events sorted by _ts]}``.
    """
    index = defaultdict(list)
    for event in security_events:
        actor = safe_get_actor(event)
        when = parse_ts_dt(event.get("createdAt"))
        if when is None:
            continue
        event["_ts"] = when
        uid = actor.get("userId")
        email = actor.get("userEmail") or event.get("actingUser")
        bucket_key = (str(uid) if uid is not None else None, email)
        index[bucket_key].append(event)
    for bucket in index.values():
        bucket.sort(key=lambda ev: ev["_ts"])
    return index
|
| 805 |
+
|
| 806 |
+
|
| 807 |
+
def find_best_time_match(ts: Optional[datetime.datetime],
                         key: Tuple[Optional[str], Optional[str]],
                         index: Dict[Tuple[Optional[str], Optional[str]], List[Dict]],
                         window_seconds: int) -> Optional[Dict]:
    """
    Pick the indexed event whose ``_ts`` lies closest to *ts*, provided the
    gap is within *window_seconds*.

    Ties go to the earliest candidate in the bucket's sort order.

    :param ts: Reference timestamp (``None`` short-circuits to ``None``).
    :param key: ``(userId, email)`` bucket to search.
    :param index: Index built by ``build_login_index``/``build_security_index``.
    :param window_seconds: Maximum allowed |ts - event._ts| in seconds.
    :return: The closest event within the window, or ``None``.
    """
    if ts is None:
        return None
    bucket = index.get(key, [])
    if not bucket:
        return None

    def gap(ev):
        return abs((ts - ev["_ts"]).total_seconds())

    closest = min(bucket, key=gap)
    return closest if gap(closest) <= window_seconds else None
|
| 832 |
+
|
| 833 |
+
|
| 834 |
+
def fill_network_fields(row: Dict, src_event: Dict) -> None:
    """
    Backfill ``ip_address``/``country_code``/``region_code`` on *row* from
    *src_event*, never overwriting values already present.

    A field counts as present when it is neither ``None`` nor ``""``.
    Network data is looked up at the event's top level first, then under its
    ``context`` dict. Non-dict *src_event* values are ignored.

    BUG FIX: the previous version tested values with ``in _MISSING`` (a
    set), which raises ``TypeError`` for unhashable values; the checks are
    now explicit comparisons, and the helper is self-contained.

    :param row: Target row dict (mutated in place).
    :param src_event: Event dict to draw network details from.
    :return: None
    """
    if not isinstance(src_event, dict):
        return

    context = src_event.get("context") or {}

    def first_present(*candidates):
        # First candidate that is neither None nor the empty string.
        for candidate in candidates:
            if candidate is not None and candidate != "":
                return candidate
        return None

    updates = {
        "ip_address": first_present(src_event.get("ipAddress"),
                                    src_event.get("sourceIp"),
                                    src_event.get("ip"),
                                    context.get("ipAddress")),
        "country_code": first_present(src_event.get("countryCode"),
                                      context.get("countryCode")),
        "region_code": first_present(src_event.get("regionCode"),
                                     context.get("regionCode")),
    }
    for field, value in updates.items():
        if value is None:
            continue
        current = row.get(field)
        if current is None or current == "":
            row[field] = value
|
| 868 |
+
|
| 869 |
+
|
| 870 |
+
def normalize_audit_event(ev: Dict) -> Dict:
    """
    Normalize a raw HubSpot audit event into the flat row format used downstream.

    The normalized row contains:

    - audit_id: The unique identifier for the audit event.
    - category / sub_category / action: Classification of the event.
    - target_object_id: The ID of the object targeted in the audit event.
    - user_id / user_email: Identity of the actor who triggered the event.
    - hubspot_occured_at: When the event occurred in HubSpot.
    - ip_address / country_code / region_code: Network context of the event.

    :param ev: The audit event to normalize.
    :return: A dictionary containing the normalized audit event fields.
    """
    actor = safe_get_actor(ev)
    normalized = {
        "audit_id": ev.get("id"),
        "category": ev.get("category"),
        "sub_category": ev.get("subCategory"),
        "action": ev.get("action"),
        # Older payloads use "objectId" instead of "targetObjectId".
        "target_object_id": ev.get("targetObjectId") or ev.get("objectId"),
        "user_id": actor.get("userId"),
        "user_email": actor.get("userEmail"),
        # NOTE: key is intentionally spelled "occured" to match the table schema
        # and the readers elsewhere in this module.
        "hubspot_occured_at": ev.get("occurredAt") or ev.get("timestamp"),
        "ip_address": ev.get("ipAddress"),
        "country_code": ev.get("countryCode"),
        "region_code": ev.get("regionCode"),
    }
    return normalized
| 905 |
+
|
| 906 |
+
def enrich_audit_row_by_category(row: Dict,
                                 login_idx: Dict[Tuple[Optional[str], Optional[str]], List[Dict]],
                                 security_idx: Dict[Tuple[Optional[str], Optional[str]], List[Dict]],
                                 match_window_seconds: int = 300) -> Dict:
    """
    Enrich an audit row by locating the user's closest login or critical-action
    event within a time window and copying its network/user details onto the row.

    :param row: The audit event row to enrich.
    :param login_idx: Mapping of (user_id, user_email) keys to login events.
    :param security_idx: Mapping of (user_id, user_email) keys to critical-action events.
    :param match_window_seconds: Time window (seconds) for a candidate match.
    :return: The (possibly enriched) audit event row.
    """
    category = (row.get("category") or "").upper()
    # Dispatch table: which index to search per category; anything else is a no-op.
    index_for = {"LOGIN": login_idx, "CRITICAL_ACTION": security_idx}
    idx = index_for.get(category)
    if idx is None:
        return row

    raw_uid = row.get("user_id")
    key = (str(raw_uid) if raw_uid is not None else None, row.get("user_email"))
    occurred = parse_ts_dt(row.get("hubspot_occured_at"))

    picked = find_best_time_match(occurred, key, idx, match_window_seconds)
    if picked:
        fill_network_fields(row, picked)
        # Optional backfill of user fields from the matched event.
        for row_field, event_field in (("user_email", "email"), ("user_id", "userId")):
            if row.get(row_field) in _MISSING and picked.get(event_field) not in _MISSING:
                row[row_field] = picked.get(event_field)

    return row
|
python/load_hubspot_data.py
ADDED
|
@@ -0,0 +1,163 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unified orchestrator for HubSpot → Supabase pipelines with a timestamp cursor.
|
| 3 |
+
|
| 4 |
+
CLI:
|
| 5 |
+
# epoch ms
|
| 6 |
+
python hubspot_orchestrator.py 1754025600000
|
| 7 |
+
# ISO-8601
|
| 8 |
+
python hubspot_orchestrator.py 2025-08-01T09:30:00Z
|
| 9 |
+
# Back-compat date (floors to 00:00Z)
|
| 10 |
+
python hubspot_orchestrator.py 2025-08-01
|
| 11 |
+
# No arg: defaults to today@00:00Z
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import sys
|
| 15 |
+
import re
|
| 16 |
+
import logging
|
| 17 |
+
import datetime
|
| 18 |
+
|
| 19 |
+
# Pipelines (each exposes main(since_ms: Optional[int]) and can fall back to env)
|
| 20 |
+
import hubspot_deals
|
| 21 |
+
import hubspot_emails
|
| 22 |
+
import hubspot_tickets
|
| 23 |
+
import hubspot_contacts
|
| 24 |
+
import hubspot_companies
|
| 25 |
+
import hubspot_billing
|
| 26 |
+
import hubspot_audit
|
| 27 |
+
|
| 28 |
+
# Append-mode, per-day log file so successive runs on the same date share one log.
# NOTE(review): assumes a "logs/" directory exists relative to the CWD — confirm
# it is created by deployment (see logs/.gitkeep in this commit).
logging.basicConfig(
    filename=f"logs/hubspot_unified_orchestrator_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
    filemode="a",
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
|
| 34 |
+
|
| 35 |
+
# ---------------------------
|
| 36 |
+
# Time parsing helpers
|
| 37 |
+
# ---------------------------
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
|
| 41 |
+
if dt.tzinfo is None:
|
| 42 |
+
dt = dt.replace(tzinfo=datetime.timezone.utc)
|
| 43 |
+
return dt.astimezone(datetime.timezone.utc)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Return *dt* converted to UTC with the time-of-day zeroed out (00:00:00Z)."""
    utc = datetime.timezone.utc
    # Naive datetimes are interpreted as UTC, matching _ensure_utc's contract.
    if dt.tzinfo is None:
        dt = dt.replace(tzinfo=utc)
    return dt.astimezone(utc).replace(hour=0, minute=0, second=0, microsecond=0)
+
|
| 50 |
+
|
| 51 |
+
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """Parse an ISO-8601 string (trailing 'Z' accepted) into an aware UTC datetime."""
    # fromisoformat() rejects the 'Z' suffix on older Pythons; rewrite it as +00:00.
    if isinstance(value, str) and value.endswith("Z"):
        value = f"{value[:-1]}+00:00"
    parsed = datetime.datetime.fromisoformat(value)
    utc = datetime.timezone.utc
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=utc)
    return parsed.astimezone(utc)
+
|
| 57 |
+
|
| 58 |
+
def to_epoch_ms(dt_or_str) -> int:
    """
    Convert an ISO-8601 string or a datetime into milliseconds since the Unix epoch.

    :param dt_or_str: Either an ISO-8601 string or a datetime (naive = UTC).
    :raises TypeError: For any other input type.
    :return: Epoch milliseconds as an int.
    """
    if isinstance(dt_or_str, datetime.datetime):
        moment = _ensure_utc(dt_or_str)
    elif isinstance(dt_or_str, str):
        moment = _parse_iso_like_to_dt(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    return int(moment.timestamp() * 1000)
+
|
| 67 |
+
|
| 68 |
+
def parse_since_arg_to_ms() -> int:
    """
    Parse the optional CLI cursor argument (sys.argv[1]) into epoch milliseconds.

    Accepts:
      - integer epoch ms, or epoch seconds (10-digit values are treated as seconds)
      - ISO-8601 (Z or offset)
      - YYYY-MM-DD (floors to 00:00Z)
    If no argument is provided, defaults to today@00:00Z.

    :return: The cursor as epoch milliseconds.
    """
    if len(sys.argv) > 1:
        arg = sys.argv[1].strip()

        # epoch seconds or ms
        if re.fullmatch(r"\d{10,13}", arg):
            v = int(arg)
            # BUG FIX: the old threshold (v < 10_000_000_000_000) also matched
            # 13-digit millisecond values such as 1754025600000 (the documented
            # example) and wrongly multiplied them by 1000. Only 10-digit values
            # are epoch *seconds* (valid through year 2286); longer values are
            # already milliseconds.
            if v < 10_000_000_000:  # seconds -> ms
                v *= 1000
            return v

        # YYYY-MM-DD -> floored to midnight UTC
        if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
            d = datetime.datetime.strptime(
                arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
            return to_epoch_ms(floor_to_utc_midnight(d))

        # ISO-8601
        try:
            return to_epoch_ms(arg)
        except Exception:
            print("Invalid --since argument. Use epoch ms, ISO-8601, or YYYY-MM-DD.")
            sys.exit(1)

    # default: today@00:00Z
    today0 = floor_to_utc_midnight(
        datetime.datetime.now(datetime.timezone.utc))
    return to_epoch_ms(today0)
+
|
| 104 |
+
# ---------------------------
|
| 105 |
+
# Main
|
| 106 |
+
# ---------------------------
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def main():
    """
    Run all HubSpot → Supabase pipelines in order with a shared timestamp cursor.

    Each pipeline may internally advance its own stored cursor. A failure in one
    pipeline is logged (with traceback) and does not stop the remaining ones.
    """
    since_ms = parse_since_arg_to_ms()
    print(f"=== Running HubSpot sync pipeline since_ms={since_ms} ===")

    # (label, module) pairs, executed in dependency order.
    # BUG FIX: step 7 previously printed "[7/7] Billing" and logged errors as
    # "billing pipeline" even though it runs hubspot_audit.
    pipelines = [
        ("Companies", hubspot_companies),
        ("Contacts", hubspot_contacts),
        ("Deals", hubspot_deals),
        ("Tickets", hubspot_tickets),
        ("Emails", hubspot_emails),
        ("Billing", hubspot_billing),
        ("Audit", hubspot_audit),
    ]
    total = len(pipelines)
    for step, (label, module) in enumerate(pipelines, start=1):
        try:
            print(f"\n[{step}/{total}] {label}")
            module.main(since_ms=since_ms)
        except Exception as e:
            logging.exception("Error running %s pipeline: %s", label.lower(), e)

    print("\n=== HubSpot sync complete ===")


if __name__ == "__main__":
    main()
|
python/supabase_utils.py
ADDED
|
@@ -0,0 +1,394 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
This script contains utility functions for working with Supabase.
|
| 3 |
+
"""
|
| 4 |
+
import time
|
| 5 |
+
import datetime
|
| 6 |
+
import json
|
| 7 |
+
import logging
|
| 8 |
+
import httpx
|
| 9 |
+
from storage3.exceptions import StorageApiError
|
| 10 |
+
from typing import List, Dict, Any, Optional
|
| 11 |
+
|
| 12 |
+
try:
|
| 13 |
+
from postgrest import APIError # supabase-py v2
|
| 14 |
+
except Exception: # pragma: no cover
|
| 15 |
+
APIError = Exception
|
| 16 |
+
try:
|
| 17 |
+
# supabase-py v2
|
| 18 |
+
from postgrest.types import ReturnMethod # Returning.MINIMAL
|
| 19 |
+
_RETURN_MINIMAL = ReturnMethod.MINIMAL
|
| 20 |
+
except Exception:
|
| 21 |
+
# supabase-py v1 fallback
|
| 22 |
+
_RETURN_MINIMAL = "minimal"
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def insert_into_supabase_table(
    supabase_client: Any,
    table_name: str,
    rows: List[Dict[str, Any]],
    on_conflict: Optional[List[str]] = None,
    *,
    max_retries: int = 5,
    backoff_base_seconds: float = 1.0,
    use_returning_minimal: bool = True,
) -> Optional[bool]:
    """
    Insert a list of rows into a Supabase table with optional conflict handling.

    :param supabase_client: Supabase client
    :param table_name: Name of the table to upsert into
    :param rows: List of dictionaries to insert; each dictionary becomes a row
    :param on_conflict: List of column names to use as conflict target
    :param max_retries: Maximum number of times to retry on failure
    :param backoff_base_seconds: Base amount of time to sleep between retries
    :param use_returning_minimal: Whether to use returning=MINIMAL (faster but no inserted IDs)
    :return: True on success, False after exhausting all retries, or None
        immediately (without attempting an insert) when ``rows`` is empty.
        (The previous ``-> None`` annotation was wrong: the retry loop has
        always returned True/False.)

    This function retries up to `max_retries` times on failure with an exponential
    backoff schedule. If `on_conflict` is provided, it is used as the conflict
    target. If `use_returning_minimal`, the `returning=MINIMAL` parameter is used
    to reduce response size (but no inserted IDs come back).

    NOTE: This function does not support transactions; it is intended for use in
    simple data pipelines where retrying on failure is sufficient. If you need
    stronger consistency guarantees, use transactions or another approach.
    """

    if not rows:
        logging.info("No rows to insert for table %s.", table_name)
        return None

    q = supabase_client.table(table_name)
    conflict_target = ",".join(on_conflict) if on_conflict else None

    # Build the upsert keyword arguments once instead of branching four ways.
    upsert_kwargs: Dict[str, Any] = {}
    if conflict_target:
        upsert_kwargs["on_conflict"] = conflict_target
    if use_returning_minimal:
        upsert_kwargs["returning"] = _RETURN_MINIMAL
    query = q.upsert(rows, **upsert_kwargs)

    attempt = 0
    while True:
        try:
            query.execute()
            logging.info("Inserted %s records into %s.", len(rows), table_name)
            return True

        except APIError as e:
            code = getattr(e, "code", None)
            attempt += 1

            if attempt > max_retries:
                logging.error(
                    "Permanent failure upserting into %s after %d attempts: %s",
                    table_name, attempt, e
                )
                return False

            sleep_for = backoff_base_seconds * (2 ** (attempt - 1))
            if code == "57014":  # Postgres statement_timeout
                logging.warning(
                    "Timeout (57014) upserting into %s (attempt %d/%d). Retrying in %.1fs.",
                    table_name, attempt, max_retries, sleep_for
                )
            else:
                logging.warning(
                    "APIError upserting into %s (attempt %d/%d): %s. Retrying in %.1fs.",
                    table_name, attempt, max_retries, e, sleep_for
                )
            time.sleep(sleep_for)

        except (httpx.RequestError, httpx.HTTPStatusError) as e:
            attempt += 1
            if attempt > max_retries:
                logging.error(
                    "Permanent HTTP failure upserting into %s after %d attempts: %s",
                    table_name, attempt, e
                )
                return False

            sleep_for = backoff_base_seconds * (2 ** (attempt - 1))
            logging.warning(
                "HTTP error upserting into %s (attempt %d/%d): %s. Retrying in %.1fs.",
                table_name, attempt, max_retries, e, sleep_for
            )
            time.sleep(sleep_for)

        except Exception as e:
            attempt += 1
            if attempt > max_retries:
                logging.error(
                    "Permanent unexpected failure upserting into %s after %d attempts: %s",
                    table_name, attempt, e
                )
                return False

            sleep_for = backoff_base_seconds * (2 ** (attempt - 1))
            logging.warning(
                "Unexpected error upserting into %s (attempt %d/%d): %s. Retrying in %.1fs.",
                table_name, attempt, max_retries, e, sleep_for
            )
            time.sleep(sleep_for)
|
| 137 |
+
|
| 138 |
+
def batched_insert(
    supabase_client,
    table_name: str,
    rows: List[Dict[str, Any]],
    batch_size: int = 500,
    on_conflict: Optional[List[str]] = None,
    max_retries: int = 4,
    backoff_base_seconds: float = 1.0,
    min_batch_size: int = 50,
) -> list:
    """
    Insert a list of rows into a Supabase table with optional conflict handling, batching, and retries.

    :param supabase_client: Supabase client
    :param table_name: Name of the table to upsert into
    :param rows: List of dictionaries to insert; each dictionary becomes a row
    :param batch_size: Maximum number of rows to send in a single upsert request
    :param on_conflict: List of column names to use as conflict target
    :param max_retries: Maximum number of times to retry on failure
    :param backoff_base_seconds: Base amount of time to sleep between retries
    :param min_batch_size: Minimum size of a batch; if a timeout occurs, the
        batch will be shrunk to this size before retrying
    :return: List of responses from Supabase (empty if no rows to insert)

    This function will retry up to `max_retries` times on failure with an exponential
    backoff schedule. If `on_conflict` is provided, it will use the given columns as the
    conflict target. If a timeout occurs, it will shrink the batch size to `min_batch_size`
    and retry the first half of the batch. If the batch is already at `min_batch_size` or
    smaller, it will retry with backoff. If all retries fail, it will raise the last
    exception. If the `rows` list is empty, it will return an empty list without sending
    any requests to Supabase.
    """

    if not rows:
        return []

    results = []
    n = len(rows)
    i = 0  # index of the first row of the current window into `rows`

    while i < n:
        current_batch_size = min(batch_size, n - i)
        batch = rows[i: i + current_batch_size]

        # Attempt to insert this batch with retries
        attempt = 0
        while True:
            try:
                q = supabase_client.table(table_name)
                if on_conflict:
                    # `on_conflict` in supabase-py v2 expects a comma-separated string
                    conflict_target = ",".join(on_conflict)
                    resp = q.upsert(batch, on_conflict=conflict_target,
                                    returning=_RETURN_MINIMAL).execute()
                else:
                    resp = q.upsert(batch, returning=_RETURN_MINIMAL).execute()

                results.append(resp)
                logging.info(
                    f"Inserted batch rows {i}–{i + len(batch)} into {table_name}")
                i += len(batch)  # advance window
                break  # batch succeeded, move to next

            except APIError as e:
                # "57014" is the Postgres statement_timeout error code.
                if getattr(e, "code", None) == "57014" and current_batch_size > min_batch_size:
                    # Halve the batch and retry the first half now;
                    # the second half will be handled subsequently
                    half = max(min_batch_size, current_batch_size // 2)
                    logging.warning(
                        f"Timeout on {table_name} rows {i}–{i + current_batch_size}. "
                        f"Shrinking batch to {half} and retrying."
                    )
                    # NOTE: `i` is deliberately NOT advanced here, so the outer
                    # loop retries the same window. `batch_size` stays reduced
                    # for all remaining batches, and `attempt` resets to 0 when
                    # the outer loop re-enters the retry loop.
                    batch_size = half
                    time.sleep(backoff_base_seconds)
                    break

                # Other errors or batch already at min size -> retry with backoff
                attempt += 1
                if attempt > max_retries:
                    logging.error(
                        f"Failed inserting batch rows {i}–{i + current_batch_size} into {table_name} "
                        f"after {max_retries} retries: {e}"
                    )
                    raise

                sleep_for = backoff_base_seconds * (2 ** (attempt - 1))
                logging.warning(
                    f"Error inserting batch rows {i}–{i + current_batch_size} into {table_name} "
                    f"(attempt {attempt}/{max_retries}): {e}. Retrying in {sleep_for:.1f}s."
                )
                time.sleep(sleep_for)
                continue

            except Exception as e:
                # Non-API errors: retry with backoff
                attempt += 1
                if attempt > max_retries:
                    logging.error(
                        f"Failed inserting batch rows {i}–{i + current_batch_size} into {table_name} "
                        f"after {max_retries} retries: {e}"
                    )
                    raise

                sleep_for = backoff_base_seconds * (2 ** (attempt - 1))
                logging.warning(
                    f"Unexpected error inserting batch rows {i}–{i + current_batch_size} into {table_name} "
                    f"(attempt {attempt}/{max_retries}): {e}. Retrying in {sleep_for:.1f}s."
                )
                time.sleep(sleep_for)
                continue

    return results
|
| 251 |
+
|
| 252 |
+
def upload_raw_json_to_supabase(supabase_client, json_data, object_type="contacts"):
    """
    Uploads raw JSON data to Supabase storage under a specified object type directory.

    :param supabase_client: The Supabase client object.
    :param json_data: The JSON data to be uploaded.
    :param object_type: The type of object, used to determine the directory path for storage.
    :return: The path where the JSON file is stored in Supabase.
    """

    timestamp = datetime.datetime.now(
        datetime.timezone.utc).strftime("%Y%m%d_%H%M%S")
    path = f"{object_type}/{timestamp}.json"
    payload = json.dumps(json_data, indent=2, default=str).encode("utf-8")

    bucket = supabase_client.storage.from_("hubspot-raw-data")
    # Clear any object already at this path so the upload cannot conflict.
    bucket.remove([path])
    try:
        bucket.upload(
            path, file=payload, file_options={
                "content-type": "application/json"}
        )
    except StorageApiError as e:
        # NOTE(review): assumes StorageApiError exposes an integer status_code — confirm.
        if e.status_code == 413:
            logging.warning(
                "Upload failed: payload too large for Supabase Storage.")
        else:
            logging.error("Storage API error during upload: %s", e)
    except httpx.RequestError as e:
        logging.error("Upload error: %s", e)
    return path
+
|
| 283 |
+
|
| 284 |
+
def get_last_sync_time(supabase_client, object_type):
    """
    Retrieves the last sync time for the given object type from the hubspot_sync_metadata table.

    :param supabase_client: The Supabase client object.
    :param object_type: The type of object to retrieve the last sync time for.
    :return: The last sync time for the given object type as a datetime object.
             If no previous sync was found, returns None.
    """

    res = supabase_client.table("hubspot_sync_metadata").select(
        "last_sync_time").eq("object_type", object_type).execute()
    data = res.data
    if data and data[0]["last_sync_time"]:
        raw = data[0]["last_sync_time"]
        # datetime.fromisoformat() rejects a trailing "Z" before Python 3.11,
        # and timestamps coming back from PostgREST may carry one. Normalize
        # it to an explicit UTC offset before parsing.
        if isinstance(raw, str) and raw.endswith("Z"):
            raw = raw[:-1] + "+00:00"
        return datetime.datetime.fromisoformat(raw)

    return None
+
|
| 303 |
+
|
| 304 |
+
def update_sync_metadata(supabase_client, object_type, sync_time):
    """
    Updates the last sync time for the given object type in the hubspot_sync_metadata table.

    :param supabase_client: The Supabase client object.
    :param object_type: The type of object to update the last sync time for.
    :param sync_time: The last sync time to update.
    :return: The result of the Supabase upsert operation.
    """

    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()
    record = {
        "object_type": object_type,
        "last_sync_time": sync_time,
        "updated_at": now_iso,
    }

    return insert_into_supabase_table(
        supabase_client, "hubspot_sync_metadata", [record])
+
|
| 322 |
+
|
| 323 |
+
def get_existing_email_content_ids(supabase_client):
    """
    Fetches existing email_ids from the hubspot_email_contents table.

    :param supabase_client: Supabase client
    :return: A set of email_id values already in the database
    """
    existing_ids = set()
    page_size = 1000
    offset = 0

    while True:
        res = supabase_client.table("hubspot_email_contents")\
            .select("email_id")\
            .range(offset, offset + page_size - 1)\
            .execute()
        if not res.data:
            break
        existing_ids.update(row["email_id"] for row in res.data)
        # A short page means we've reached the end — stop here instead of
        # issuing one more empty request (consistent with fetch_supabase_table).
        if len(res.data) < page_size:
            break
        offset += page_size

    return existing_ids
+
|
| 347 |
+
|
| 348 |
+
def fetch_supabase_table(
    supabase_client, table_name="hubspot_contacts",
    id_column="contact_id"
):
    """
    Fetches all rows from a Supabase table and returns them as a dict.

    :param supabase_client: Supabase client instance
    :param table_name: Name of the table to fetch from
    :param id_column: Unique ID column to use as the key for the returned dict
    :return: A dict of rows with the ID column as the key and the full row as the value
    """

    rows_by_id = {}
    page_size = 1000
    page = 0

    while True:
        start = page * page_size
        res = supabase_client.table(table_name).select("*").range(
            start, start + page_size - 1).execute()
        data = res.data
        if not data:
            break
        rows_by_id.update((str(record[id_column]), record) for record in data)
        # A partial page signals the last page; avoid one extra round-trip.
        if len(data) < page_size:
            break
        page += 1

    return rows_by_id
+
|
| 378 |
+
|
| 379 |
+
def enrich_supabase_row(base_row: dict) -> dict:
    """
    Enriches a Supabase row (in place) with the columns needed for duplicate detection.

    :param base_row: The base row to enrich.
    :return: The same row object, with duplicate-tracking fields added.
    """

    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()
    base_row["duplicate_id"] = None
    base_row["duplicate_status"] = None
    base_row["duplicate_action"] = None
    base_row["is_primary"] = True
    base_row["updated_at"] = now_iso
    return base_row
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
python-dotenv==1.1.0
|
| 2 |
+
hubspot-api-client==12.0.0
|
| 3 |
+
supabase==2.16.0
|
| 4 |
+
httpx==0.28.1
|
| 5 |
+
pandas==2.2.2
|
| 6 |
+
openpyxl==3.1.5
|
| 7 |
+
phonenumbers
|
| 8 |
+
tldextract
|
| 9 |
+
rapidfuzz
|