niwayandm commited on
Commit
a16378e
·
1 Parent(s): 4f74457

Initial files commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ fonts/HanyiSentyPagoda[[:space:]]Regular.ttf filter=lfs diff=lfs merge=lfs -text
37
+ fonts/HanyiSentyPagoda_Regular.ttf filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .env
Dockerfile ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM node:24-alpine

# -------------------------
# Base + build args
# -------------------------
USER root
ARG N8N_PATH=/usr/local/lib/node_modules/n8n
ARG BASE_PATH=/root/.n8n
ARG DATABASE_PATH=$BASE_PATH/database
ARG CONFIG_PATH=$BASE_PATH/config
ARG WORKFLOWS_PATH=$BASE_PATH/workflows
ARG LOGS_PATH=$BASE_PATH/logs

# Optional args passed at build time (no secrets here)
# NOTE(review): `ARG X=$X` self-references expand to an empty default (the
# variable is not defined at that point), so these behave identically to a
# bare `ARG X` declaration.
# NOTE(review): N8N_ENCRYPTION_KEY and DB_POSTGRESDB_PASSWORD are declared as
# build args despite the "no secrets" comment — build args are recorded in
# image history; confirm they are intended to be supplied at build time.
ARG N8N_ENFORCE_SETTINGS_FILE_PERMISSIONS=$N8N_ENFORCE_SETTINGS_FILE_PERMISSIONS
ARG N8N_HOST=$N8N_HOST
ARG N8N_PORT=$N8N_PORT
ARG N8N_PROTOCOL=https
ARG N8N_EDITOR_BASE_URL=$N8N_EDITOR_BASE_URL
ARG WEBHOOK_URL=$WEBHOOK_URL
ARG GENERIC_TIMEZONE=$GENERIC_TIMEZONE
ARG TZ=$TZ
ARG N8N_ENCRYPTION_KEY=$N8N_ENCRYPTION_KEY
ARG DB_TYPE=$DB_TYPE
ARG DB_POSTGRESDB_SCHEMA=$DB_POSTGRESDB_SCHEMA
ARG DB_POSTGRESDB_HOST=$DB_POSTGRESDB_HOST
ARG DB_POSTGRESDB_DATABASE=$DB_POSTGRESDB_DATABASE
ARG DB_POSTGRESDB_PORT=$DB_POSTGRESDB_PORT
ARG DB_POSTGRESDB_USER=$DB_POSTGRESDB_USER
ARG DB_POSTGRESDB_PASSWORD=$DB_POSTGRESDB_PASSWORD

# -------------------------
# System deps
# -------------------------
# Toolchain (make/g++/build-base, cairo/pango headers) is needed to compile
# native node modules; chromium backs Puppeteer; ffmpeg/yt-dlp support media
# workflows; postgresql-client for DB access from workflows.
RUN apk add --no-cache \
    bash \
    git \
    python3 \
    py3-pip \
    make \
    g++ \
    build-base \
    cairo-dev \
    pango-dev \
    chromium \
    postgresql-client \
    ffmpeg \
    yt-dlp


# Tell Puppeteer to skip installing Chrome.
ENV PUPPETEER_SKIP_DOWNLOAD=true
ENV PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser

RUN yarn add puppeteer@24.27.0

# Install n8n-nodes-puppeteer in a permanent location
RUN mkdir -p /opt/n8n-custom-nodes && \
    cd /opt/n8n-custom-nodes && \
    npm install n8n-nodes-puppeteer && \
    chown -R node:node /opt/n8n-custom-nodes

# Copy our custom entrypoint
# NOTE(review): this wrapper is copied and made executable but nothing below
# references it — CMD launches `n8n start` directly and no ENTRYPOINT is set.
# Confirm whether CMD/ENTRYPOINT should run /docker-custom-entrypoint.sh.
COPY docker-custom-entrypoint.sh /docker-custom-entrypoint.sh
RUN chmod +x /docker-custom-entrypoint.sh && \
    chown node:node /docker-custom-entrypoint.sh

# -------------------------
# n8n
# -------------------------
RUN npm install -g n8n@1.118.1

# -------------------------
# Python venv + requirements
# -------------------------
RUN python3 -m venv /opt/venv
ENV PATH="/opt/venv/bin:$PATH"

# Python dependencies
COPY requirements.txt /tmp/requirements.txt
RUN pip install --no-cache-dir -r /tmp/requirements.txt

# Python code
COPY python/ /app/python/

# -------------------------
# Runtime layout
# -------------------------
# Persist n8n data under /data (enable "Persistent storage" in Space settings)
ENV N8N_USER_FOLDER="/data" \
    N8N_LISTEN_ADDRESS="0.0.0.0" \
    N8N_PORT=7860 \
    N8N_PROTOCOL="http"

# Create standard dirs + logs
# NOTE(review): chmod -R 777 on /data and /app is world-writable — acceptable
# in a single-user Space sandbox, but worth tightening if reused elsewhere.
RUN mkdir -p $DATABASE_PATH $CONFIG_PATH $WORKFLOWS_PATH $LOGS_PATH \
    && mkdir -p /data/logs /app/logs \
    && chmod -R 777 $BASE_PATH /data /app

COPY logs/ /data/logs/

# -------------------------
# Non-root user for Chromium
# -------------------------
# Chromium refuses to run sandboxed as root; run the whole service as pptruser.
RUN addgroup -S pptruser && adduser -S -G pptruser pptruser \
    && mkdir -p /home/pptruser/Downloads \
    && chown -R pptruser:pptruser /home/pptruser /data /app

USER pptruser

# WORKDIR /data makes the Python scripts' relative "logs/..." paths resolve
# to the persistent /data/logs directory created above.
WORKDIR /data
EXPOSE 7860

# Start n8n
CMD ["n8n", "start"]
README.md CHANGED
@@ -1,10 +1,12 @@
1
  ---
2
- title: N8n
3
- emoji: 🔥
4
- colorFrom: pink
5
- colorTo: green
6
  sdk: docker
7
  pinned: false
 
 
8
  ---
9
 
10
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: N8N
3
+ emoji:
4
+ colorFrom: blue
5
+ colorTo: gray
6
  sdk: docker
7
  pinned: false
8
+ license: mit
9
+ short_description: Free n8n with Supabase
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
docker-custom-entrypoint.sh ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/sh
# Wrapper entrypoint: prints an environment/version banner, prepends the
# pre-installed custom-nodes directory to N8N_CUSTOM_EXTENSIONS, then hands
# control to the stock n8n entrypoint.
# NOTE(review): this assumes /docker-entrypoint.sh exists in the image; the
# base image is node:24-alpine, which does not ship one — confirm how this
# wrapper is meant to be invoked.

# Print Node/n8n/Chromium/Puppeteer versions for debugging startup issues.
print_banner() {
    echo "----------------------------------------"
    echo "n8n Puppeteer Node - Environment Details"
    echo "----------------------------------------"
    echo "Node.js version: $(node -v)"
    echo "n8n version: $(n8n --version)"

    # Get Chromium version specifically from the path we're using for Puppeteer
    CHROME_VERSION=$("$PUPPETEER_EXECUTABLE_PATH" --version 2>/dev/null || echo "Chromium not found")
    echo "Chromium version: $CHROME_VERSION"

    # Get Puppeteer version if installed
    PUPPETEER_PATH="/opt/n8n-custom-nodes/node_modules/n8n-nodes-puppeteer"
    if [ -f "$PUPPETEER_PATH/package.json" ]; then
        PUPPETEER_VERSION=$(node -p "require('$PUPPETEER_PATH/package.json').version")
        echo "n8n-nodes-puppeteer version: $PUPPETEER_VERSION"

        # Try to resolve puppeteer package from the n8n-nodes-puppeteer directory
        CORE_PUPPETEER_VERSION=$(cd "$PUPPETEER_PATH" && node -e "try { const version = require('puppeteer/package.json').version; console.log(version); } catch(e) { console.log('not found'); }")
        echo "Puppeteer core version: $CORE_PUPPETEER_VERSION"
    else
        echo "n8n-nodes-puppeteer: not installed"
    fi

    echo "Puppeteer executable path: $PUPPETEER_EXECUTABLE_PATH"
    echo "----------------------------------------"
}

# Add custom nodes to the NODE_PATH
# (prepend so /opt/n8n-custom-nodes wins over any user-supplied value)
if [ -n "$N8N_CUSTOM_EXTENSIONS" ]; then
    export N8N_CUSTOM_EXTENSIONS="/opt/n8n-custom-nodes:${N8N_CUSTOM_EXTENSIONS}"
else
    export N8N_CUSTOM_EXTENSIONS="/opt/n8n-custom-nodes"
fi

print_banner

# Execute the original n8n entrypoint script
exec /docker-entrypoint.sh "$@"
fonts/HanyiSentyPagoda_Regular.ttf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:754eb3539249898f9daffc23b0068561bb9a618a79d517708513b5119fd7e6e5
3
+ size 9982284
logs/.gitkeep ADDED
File without changes
python/hubspot_audit.py ADDED
@@ -0,0 +1,252 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HubSpot Audit → Supabase (incremental since a millisecond cursor)
3
+
4
+ Usage from orchestrator:
5
+ import load_hubspot_companies
6
+ load_hubspot_companies.main(since_ms=<int milliseconds since epoch UTC>)
7
+
8
+ Direct CLI:
9
+ # epoch ms
10
+ python load_hubspot_companies.py 1754025600000
11
+ # ISO-8601
12
+ python load_hubspot_companies.py 2025-08-01T09:30:00Z
13
+ # Back-compat date (floors to 00:00Z)
14
+ python load_hubspot_companies.py 2025-08-01
15
+ """
16
+ import os
17
+ import logging
18
+ import datetime
19
+ from typing import Dict, List, Optional, Union
20
+ import re
21
+ from dotenv import load_dotenv
22
+ from supabase import create_client
23
+ from supabase_utils import batched_insert, update_sync_metadata
24
+ from hubspot_utils import (
25
+ to_epoch_ms_from_utc_iso,
26
+ page_account_activity,
27
+ build_login_index,
28
+ build_security_index,
29
+ normalize_audit_event,
30
+ enrich_audit_row_by_category,
31
+ deduplicate_by_key,
32
+ )
33
+
34
+ # Logging
35
+ logging.basicConfig(
36
+ filename=f"logs/hubspot_audit_logs_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
37
+ filemode="a",
38
+ level=logging.INFO,
39
+ format="%(asctime)s [%(levelname)s] %(message)s",
40
+ )
41
+
42
+ # Environment
43
+ load_dotenv()
44
+ HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
45
+ SUPABASE_URL = os.getenv("SUPABASE_URL")
46
+ SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
47
+
48
+ if not HUBSPOT_TOKEN:
49
+ raise RuntimeError("HUBSPOT_TOKEN is required")
50
+ if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
51
+ raise RuntimeError(
52
+ "SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY are required")
53
+
54
+ supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
55
+
56
+ # Config
57
+ AUDITLOG_TABLE = os.getenv("HUBSPOT_AUDITLOG_TABLE", "hubspot_audits")
58
+ HUBSPOT_LIMIT = int(os.getenv("HUBSPOT_AUDITLOG_LIMIT", "100"))
59
+ HUBSPOT_MAX_PAGES = int(os.getenv("HUBSPOT_AUDITLOG_MAX_PAGES", "10000"))
60
+ MATCH_WINDOW_SECONDS = int(os.getenv("HUBSPOT_MATCH_WINDOW_SECONDS", "300"))
61
+
62
+ INITIAL_BACKOFF_SECONDS = float(os.getenv("HUBSPOT_BACKOFF_START", "1.0"))
63
+ MAX_BACKOFF_SECONDS = float(os.getenv("HUBSPOT_BACKOFF_MAX", "16.0"))
64
+
65
+ BOOTSTRAP_SINCE_MS_ENV = os.getenv(
66
+ "BOOTSTRAP_SINCE_MS_ENV") # may be epoch ms or ISO-8601
67
+
68
+
69
def _today_midnight_ms_utc() -> int:
    """Return today's 00:00:00 UTC as epoch milliseconds.

    NOTE(review): main() builds the same default via
    floor_to_utc_midnight()/to_epoch_ms(); this helper appears unused in
    this module — confirm before removing.
    """
    now = datetime.datetime.now(datetime.timezone.utc)
    dt = now.replace(hour=0, minute=0, second=0, microsecond=0)
    # hubspot_utils.to_epoch_ms_from_utc_iso expects ISO; format accordingly
    # ("+00:00" is rewritten to the "Z" suffix it expects).
    return to_epoch_ms_from_utc_iso(dt.isoformat().replace("+00:00", "Z"))
74
+
75
+
76
+ def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
77
+ if dt.tzinfo is None:
78
+ dt = dt.replace(tzinfo=datetime.timezone.utc)
79
+ return dt.astimezone(datetime.timezone.utc)
80
+
81
+
82
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Return the UTC midnight of *dt*'s UTC calendar day."""
    normalized = _ensure_utc(dt)
    return datetime.datetime.combine(
        normalized.date(),
        datetime.time(0, tzinfo=datetime.timezone.utc),
    )
85
+
86
+
87
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """Parse an ISO-8601 string (trailing 'Z' accepted) into an aware UTC datetime."""
    normalized = value[:-1] + "+00:00" if value.endswith("Z") else value
    return _ensure_utc(datetime.datetime.fromisoformat(normalized))
92
+
93
+
94
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
    """Convert an ISO-8601 string or datetime into milliseconds since the epoch (UTC)."""
    if isinstance(dt_or_str, datetime.datetime):
        moment = _ensure_utc(dt_or_str)
    elif isinstance(dt_or_str, str):
        moment = _parse_iso_like_to_dt(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    return int(moment.timestamp() * 1000)
102
+
103
+
104
# HubSpot account-info activity endpoints (all served from api.hubapi.com).
# The security URL was previously written as an api.hubspot.com string passed
# through .replace("hubspot.com", "hubapi.com"); the literal is clearer and
# produces the identical value.
BASE_AUDIT_URL = "https://api.hubapi.com/account-info/v3/activity/audit-logs"
BASE_SECURITY_URL = "https://api.hubapi.com/account-info/v3/activity/security"
BASE_LOGIN_URL = "https://api.hubapi.com/account-info/v3/activity/login"
108
+
109
+
110
def fetch_streams(occurred_after_ms: int) -> Dict[str, List[dict]]:
    """
    Fetch the three streams from HubSpot account-info API:
      - Audit logs
      - Login activity
      - Security activity

    We pass occurred_after_ms and omit occurred_before_ms (None).
    Returns {"audit": [...], "login": [...], "security": [...]}.
    """
    def _fetch(base_url: str) -> List[dict]:
        # All three streams share identical paging parameters; only the URL
        # differs, so the repeated call is factored out here.
        return page_account_activity(
            base_url=base_url,
            token=HUBSPOT_TOKEN,
            occurred_after_ms=occurred_after_ms,
            occurred_before_ms=None,
            limit=HUBSPOT_LIMIT,
            max_pages=HUBSPOT_MAX_PAGES,
        )

    # Preserve the original fetch order: audit, login, security.
    audit = _fetch(BASE_AUDIT_URL)
    login = _fetch(BASE_LOGIN_URL)
    security = _fetch(BASE_SECURITY_URL)
    logging.info("Fetched counts: audit=%d login=%d security=%d",
                 len(audit), len(login), len(security))
    return {"audit": audit, "login": login, "security": security}
146
+
147
+
148
def build_indices(login_events: List[dict], security_events: List[dict]):
    """Build the (login, security) lookup indices used to enrich audit rows."""
    return build_login_index(login_events), build_security_index(security_events)
152
+
153
+
154
def normalize_and_enrich(audit_events: List[dict], login_idx, security_idx) -> List[dict]:
    """Normalize raw audit events, enrich them via the indices, dedupe on audit_id."""
    enriched: List[dict] = []
    for event in audit_events:
        normalized = normalize_audit_event(event)
        # Rows without a stable audit_id cannot be keyed for upsert; drop them.
        if not normalized.get("audit_id"):
            continue
        enriched.append(
            enrich_audit_row_by_category(
                normalized,
                login_idx,
                security_idx,
                match_window_seconds=MATCH_WINDOW_SECONDS,
            )
        )
    return deduplicate_by_key(enriched, "audit_id")
165
+
166
+
167
def upsert(rows: List[dict]) -> None:
    """Upsert *rows* into the audit table, then record the sync timestamp.

    The sync-metadata row is updated even when there is nothing to upsert, so
    the orchestrator can observe that the sync ran. The duplicated
    sync_time/update_sync_metadata tail of the original has been unified into
    a single code path.
    """
    if rows:
        batched_insert(
            supabase_client,
            AUDITLOG_TABLE,
            rows,
            batch_size=500,
            on_conflict=["audit_id"],
        )
        logging.info("Upserted %d rows into %s", len(rows), AUDITLOG_TABLE)
    else:
        logging.info("No rows to upsert.")

    sync_time = datetime.datetime.now(datetime.timezone.utc)
    update_sync_metadata(supabase_client, AUDITLOG_TABLE, sync_time.isoformat())
187
+
188
+
189
def main(since_ms: Optional[int] = None):
    """Run one incremental audit-log sync.

    Cursor resolution order:
      1. explicit *since_ms* argument (epoch ms, UTC),
      2. the BOOTSTRAP_SINCE_MS_ENV environment value,
      3. today at 00:00:00Z (first-run default).
    """
    logging.info(
        "Starting HubSpot audit sync (occurredAfter_ms=%s, occurredBefore=SKIPPED).", since_ms)

    if since_ms is None and BOOTSTRAP_SINCE_MS_ENV:
        try:
            since_ms = int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            # BUGFIX: the previous message referenced HUBSPOT_BILLING_SINCE_MS,
            # which belongs to the billing script; this module reads
            # BOOTSTRAP_SINCE_MS_ENV.
            raise RuntimeError(
                "BOOTSTRAP_SINCE_MS_ENV must be an integer (ms) if set.")

    if since_ms is None:
        # Default: today@00:00:00Z for first run
        today0 = floor_to_utc_midnight(
            datetime.datetime.now(datetime.timezone.utc))
        since_ms = to_epoch_ms(today0)

    print(f"Fetching HubSpot audit logs occurredAfter > {since_ms} ...")
    streams = fetch_streams(occurred_after_ms=since_ms)
    login_idx, security_idx = build_indices(
        streams["login"], streams["security"])
    rows = normalize_and_enrich(streams["audit"], login_idx, security_idx)

    print("Upserting into Supabase...")
    upsert(rows)
    print(f"Synced {len(rows)} audit rows into '{AUDITLOG_TABLE}'.")
    print("Audit logs sync complete.")
216
+
217
+
218
+ def _parse_cli_arg_to_ms(arg: str) -> int:
219
+ """
220
+ Accept:
221
+ - integer epoch ms
222
+ - ISO-8601 (Z or offset)
223
+ - YYYY-MM-DD (floors to 00:00Z)
224
+ """
225
+ # epoch ms or seconds
226
+ if re.fullmatch(r"\d{10,13}", arg):
227
+ v = int(arg)
228
+ if v < 10_000_000_000_000: # seconds -> ms
229
+ v *= 1000
230
+ return v
231
+
232
+ # YYYY-MM-DD
233
+ if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
234
+ d = datetime.datetime.strptime(
235
+ arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
236
+ return to_epoch_ms(floor_to_utc_midnight(d))
237
+
238
+ # ISO-8601
239
+ return to_epoch_ms(arg)
240
+
241
+
242
if __name__ == "__main__":
    # CLI entry point: an optional single argument supplies the cursor
    # (epoch ms/seconds, ISO-8601, or YYYY-MM-DD); with no argument main()
    # resolves its own default cursor.
    import sys
    if len(sys.argv) > 1:
        try:
            main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
        except Exception as e:
            # NOTE(review): this try also wraps main() itself, so any runtime
            # failure is reported as an "Invalid timestamp" usage error.
            print(
                f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
            sys.exit(1)
    else:
        main()
python/hubspot_billing.py ADDED
@@ -0,0 +1,454 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HubSpot Billing → Supabase (incremental since a millisecond cursor)
3
+
4
+ Usage from orchestrator:
5
+ import load_hubspot_billing
6
+ load_hubspot_billing.main(since_ms=<int milliseconds since epoch UTC>)
7
+
8
+ Direct CLI:
9
+ # epoch ms
10
+ python load_hubspot_billing.py 1754025600000
11
+ # ISO-8601
12
+ python load_hubspot_billing.py 2025-08-01T09:30:00Z
13
+ # Back-compat date (floors to 00:00Z)
14
+ python load_hubspot_billing.py 2025-08-01
15
+ """
16
+ import os
17
+ import re
18
+ import time
19
+ import logging
20
+ import datetime
21
+ from typing import List, Dict, Optional, Tuple, Union
22
+
23
+ import httpx
24
+ import hubspot
25
+ from dotenv import load_dotenv
26
+ from supabase import create_client
27
+
28
+ from hubspot_utils import (
29
+ parse_ts, try_parse_int, deduplicate_by_key,
30
+ )
31
+ from supabase_utils import (
32
+ batched_insert, update_sync_metadata,
33
+ )
34
+
35
+ # -----------------------------------------------------------------------------
36
+ # Logging
37
+ # -----------------------------------------------------------------------------
38
+ logging.basicConfig(
39
+ filename=f"logs/hubspot_billing_pipeline_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
40
+ filemode="a",
41
+ level=logging.INFO,
42
+ format="%(asctime)s [%(levelname)s] %(message)s",
43
+ )
44
+
45
+ # -----------------------------------------------------------------------------
46
+ # Environment
47
+ # -----------------------------------------------------------------------------
48
+ load_dotenv()
49
+ HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
50
+ SUPABASE_URL = os.getenv("SUPABASE_URL")
51
+ SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
52
+ # Optional bootstrap cursor if orchestrator doesn't provide one
53
+ BOOTSTRAP_SINCE_MS_ENV = os.getenv("HUBSPOT_BILLING_SINCE_MS")
54
+
55
+ if not HUBSPOT_TOKEN:
56
+ raise RuntimeError("HUBSPOT_TOKEN is not set")
57
+ if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
58
+ raise RuntimeError("Supabase env vars are not set")
59
+
60
+ hubspot_client = hubspot.Client.create(access_token=HUBSPOT_TOKEN)
61
+ supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
62
+
63
+ # -----------------------------------------------------------------------------
64
+ # Config
65
+ # -----------------------------------------------------------------------------
66
+
67
+ # Custom object type ID for Billing Services
68
+ BILLING_OBJECT_TYPE = "2-38060359"
69
+
70
+ BILLING_PROPERTIES = [
71
+ "cli",
72
+ "account_code",
73
+ "service_name",
74
+ "status",
75
+ "type",
76
+ "tariff_name",
77
+ "contract_renewal_date",
78
+ "supplier_network",
79
+ "network_name",
80
+ "product_type_name",
81
+ "hs_created_by_user_id",
82
+ "hs_createdate",
83
+ # "hs_lastmodifieddate", # optionally include if needed later
84
+ ]
85
+
86
+ PROPERTY_RENAME = {
87
+ "hs_created_by_user_id": "hubspot_created_by",
88
+ "hs_createdate": "hubspot_created_at",
89
+ }
90
+
91
+ # -----------------------------------------------------------------------------
92
+ # Time helpers (mirroring hubspot_tickets)
93
+ # -----------------------------------------------------------------------------
94
+
95
+
96
+ def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
97
+ if dt.tzinfo is None:
98
+ dt = dt.replace(tzinfo=datetime.timezone.utc)
99
+ return dt.astimezone(datetime.timezone.utc)
100
+
101
+
102
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Zero out the time-of-day after converting *dt* to UTC."""
    in_utc = _ensure_utc(dt)
    return in_utc.replace(hour=0, minute=0, second=0, microsecond=0)
105
+
106
+
107
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """Parse ISO-8601 text into an aware UTC datetime; a trailing 'Z' means UTC."""
    text = value
    if text.endswith("Z"):
        text = text[:-1] + "+00:00"
    return _ensure_utc(datetime.datetime.fromisoformat(text))
112
+
113
+
114
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
    """Convert a datetime or ISO-8601 string into epoch milliseconds (UTC)."""
    if isinstance(dt_or_str, str):
        moment = _parse_iso_like_to_dt(dt_or_str)
    elif isinstance(dt_or_str, datetime.datetime):
        moment = _ensure_utc(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    # Truncate (not round) fractional milliseconds, matching int() semantics.
    return int(moment.timestamp() * 1000)
122
+
123
+
124
def parse_any_ts_ms(value: Optional[Union[str, int, float]]) -> Optional[int]:
    """
    Accepts:
      - ms-epoch as str/int/float
      - seconds-epoch as str/int/float (auto *1000)
      - ISO-8601 string
    Returns ms since epoch or None.
    """
    if value is None:
        return None

    # numeric-ish string or number
    try:
        v = int(float(value))  # handles numeric strings
        # BUGFIX: the old threshold (10_000_000_000_000) also multiplied
        # genuine millisecond values (~1.7e12 today) by 1000, inflating the
        # sync cursor far into the future. Values below 100_000_000_000 are
        # seconds; anything larger is already milliseconds.
        if v < 100_000_000_000:
            v *= 1000
        return v
    except Exception:
        # Not numeric (e.g. ISO text, inf) — fall through to ISO parsing.
        pass

    # ISO-8601
    try:
        return to_epoch_ms(str(value))
    except Exception:
        return None
150
+
151
+
152
+ # -----------------------------------------------------------------------------
153
+ # Search (by timestamp cursor)
154
+ # -----------------------------------------------------------------------------
155
+
156
+
157
def _search_billing_ids_from(since_ms: int, prop: str) -> List[str]:
    """
    Search billing custom object IDs where {prop} > since_ms.
    Sort ascending so we can advance cursor monotonically.

    Pages through the CRM v3 search endpoint 100 results at a time and
    collects bare object IDs only; full records are read separately.
    Raises httpx.HTTPStatusError on any 4xx/5xx response (after logging
    the response body for diagnosis).
    NOTE(review): the CRM search API caps paging at 10,000 total results —
    confirm backfills never exceed that window.
    """
    url = f"https://api.hubapi.com/crm/v3/objects/{BILLING_OBJECT_TYPE}/search"
    headers = {
        "Authorization": f"Bearer {HUBSPOT_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    payload = {
        "filterGroups": [{
            "filters": [
                # GT is strictly greater-than, so since_ms itself is excluded.
                {"propertyName": prop, "operator": "GT", "value": str(since_ms)},
            ]
        }],
        "limit": 100,
        "sorts": [{"propertyName": prop, "direction": "ASCENDING"}],
        "properties": ["hs_object_id"],
    }

    ids: List[str] = []
    after: Optional[str] = None
    with httpx.Client(timeout=30.0) as client:
        while True:
            # Shallow-copy the base payload; only the paging token varies per page.
            body = dict(payload)
            if after:
                body["after"] = after

            resp = client.post(url, headers=headers, json=body)
            if resp.status_code >= 400:
                # Log the server's error detail (JSON if parseable) before raising.
                try:
                    logging.error(
                        "Billing search error for prop '%s': %s", prop, resp.json())
                except Exception:
                    logging.error(
                        "Billing search error for prop '%s': %s", prop, resp.text)
                resp.raise_for_status()

            data = resp.json()
            ids.extend([obj["id"] for obj in data.get("results", []) or []])

            # paging.next.after is absent on the last page.
            after = (data.get("paging") or {}).get("next", {}).get("after")
            if not after:
                break
            # Light throttle between pages to stay under rate limits.
            time.sleep(0.1)

    return ids
206
+
207
+
208
def search_billing_ids_after_ms(since_ms: int) -> Tuple[List[str], str]:
    """
    Search billing IDs where the cursor property is strictly greater than
    *since_ms*, sorted ASC so the max timestamp at the end is monotonic.
    Returns (ids, prop_used).
    """
    candidate_props = ["hs_createdate"]
    failure = None

    for candidate in candidate_props:
        try:
            found = _search_billing_ids_from(since_ms, candidate)
        except httpx.HTTPStatusError as err:
            failure = err
            continue
        logging.info(
            "Billing search with '%s' returned %d IDs.", candidate, len(found))
        return found, candidate

    # Every candidate failed: re-raise the last HTTP error.
    if failure is not None:
        raise failure
    return [], "hs_createdate"
231
+
232
+
233
+ # -----------------------------------------------------------------------------
234
+ # Read-by-ID (with associations)
235
+ # -----------------------------------------------------------------------------
236
+
237
+
238
def read_billing_by_ids(
    billing_ids: List[str],
    cursor_prop: str,
) -> Tuple[List[Dict], List[Dict], List[Dict], Optional[int]]:
    """
    Read billing services by ID with properties and associations (companies/deals).

    Returns: services, company_links, deal_links, max_ts_ms_for_cursor_prop —
    where the last element is the largest *cursor_prop* timestamp seen across
    the fetched records (used by the caller to advance the sync cursor), or
    None when no record carried a parseable value.

    Failures reading a single record are logged and skipped; the function
    never raises for one bad ID.
    NOTE(review): "tariff_name" is requested in BILLING_PROPERTIES but is not
    copied into the service row below — confirm whether it should be persisted.
    """
    if not billing_ids:
        return [], [], [], None

    services: List[Dict] = []
    company_links: List[Dict] = []
    deal_links: List[Dict] = []

    max_ts_ms: Optional[int] = None

    assoc_types = ["companies", "deals"]

    for i, bid in enumerate(billing_ids, start=1):
        try:
            record = hubspot_client.crm.objects.basic_api.get_by_id(
                object_type=BILLING_OBJECT_TYPE,
                object_id=bid,
                properties=BILLING_PROPERTIES,
                associations=assoc_types,
                archived=False,
            )

            p = record.properties or {}

            # Track max timestamp based on cursor_prop
            cursor_val = p.get(cursor_prop)
            ts_ms = parse_any_ts_ms(cursor_val)
            if ts_ms is not None and (max_ts_ms is None or ts_ms > max_ts_ms):
                max_ts_ms = ts_ms

            # Build service row (hs_* properties renamed to hubspot_* columns)
            row = {
                "billing_id": str(record.id),
                "cli": p.get("cli"),
                "account_code": p.get("account_code"),
                "service_name": p.get("service_name"),
                "status": p.get("status"),
                "type": p.get("type"),
                "contract_renewal_date": parse_ts(p.get("contract_renewal_date")),
                "supplier_network": p.get("supplier_network"),
                "network_name": p.get("network_name"),
                "product_type_name": p.get("product_type_name"),
                "hubspot_created_by": try_parse_int(p.get("hs_created_by_user_id")),
                "hubspot_created_at": parse_ts(p.get("hs_createdate")),
            }
            services.append(row)

            # Associations: flatten into (billing_id, other_id) link rows.
            assoc = record.associations or {}

            if assoc.get("companies") and getattr(assoc["companies"], "results", None):
                for a in assoc["companies"].results:
                    if a.id:
                        company_links.append({
                            "billing_id": str(record.id),
                            "company_id": str(a.id),
                        })

            if assoc.get("deals") and getattr(assoc["deals"], "results", None):
                for a in assoc["deals"].results:
                    if a.id:
                        deal_links.append({
                            "billing_id": str(record.id),
                            "deal_id": str(a.id),
                        })

            # Periodic progress log for long backfills.
            if i % 200 == 0:
                logging.info("Read %d billing services...", i)

            # Light per-record throttle to stay under HubSpot rate limits.
            time.sleep(0.05)

        except httpx.HTTPStatusError as e:
            logging.error("HTTP error reading billing %s: %s", bid, e)
        except Exception as e:
            logging.error("Error reading billing %s: %s", bid, e)

    return services, company_links, deal_links, max_ts_ms
322
+
323
+
324
+ # -----------------------------------------------------------------------------
325
+ # Upsert
326
+ # -----------------------------------------------------------------------------
327
+
328
+
329
def upsert_billing(
    services: List[Dict],
    company_links: List[Dict],
    deal_links: List[Dict],
) -> None:
    """Dedupe and upsert billing services plus their association link rows.

    Empty inputs are skipped entirely (no table touched, nothing printed),
    matching the original per-collection guards. The three copy-pasted
    dedupe/insert/print stanzas are factored into one helper.
    """
    def _dedupe_and_upsert(rows, key, table, label):
        # key is a single column name (services) or a tuple of columns
        # (link tables); on_conflict always receives the list form.
        if not rows:
            return
        rows = deduplicate_by_key(rows, key=key)
        conflict_cols = [key] if isinstance(key, str) else list(key)
        batched_insert(
            supabase_client, table, rows,
            batch_size=1000, on_conflict=conflict_cols,
        )
        print(f"Upserted {len(rows)} {label}.")

    _dedupe_and_upsert(services, "billing_id",
                       "hubspot_billing_services", "billing services")
    _dedupe_and_upsert(company_links, ("billing_id", "company_id"),
                       "hubspot_billing_companies", "billing-company associations")
    _dedupe_and_upsert(deal_links, ("billing_id", "deal_id"),
                       "hubspot_billing_deals", "billing-deal associations")
359
+
360
+
361
+ # -----------------------------------------------------------------------------
362
+ # Main (timestamp cursor)
363
+ # -----------------------------------------------------------------------------
364
+
365
+
366
def main(since_ms: Optional[int] = None):
    """
    Orchestrates:
      1) Search billing IDs with <cursor_prop> > since_ms
      2) Read full records with associations (track max timestamp)
      3) Upsert into Supabase
      4) Update sync metadata with { last_sync_metadata, last_sync_time, cursor_prop }

    Cursor resolution order: explicit *since_ms* argument, then the
    HUBSPOT_BILLING_SINCE_MS env value, then today at 00:00:00Z.
    """
    # Resolve since_ms
    if since_ms is None and BOOTSTRAP_SINCE_MS_ENV:
        try:
            since_ms = int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            raise RuntimeError(
                "HUBSPOT_BILLING_SINCE_MS must be an integer (ms) if set.")

    if since_ms is None:
        # Default: today@00:00:00Z for first run
        today0 = floor_to_utc_midnight(
            datetime.datetime.now(datetime.timezone.utc))
        since_ms = to_epoch_ms(today0)

    print(f"Searching billing services with timestamp > {since_ms} ...")
    ids, cursor_prop = search_billing_ids_after_ms(since_ms)
    print(f"Search property: {cursor_prop}. Found {len(ids)} billing IDs.")

    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()

    if not ids:
        print("No billing services beyond the cursor. Updating sync metadata and exiting.")
        update_sync_metadata(supabase_client, "hubspot_billing_services", now_iso)
        return

    print("Reading billing services (with associations)...")
    services, company_links, deal_links, max_ts_ms = read_billing_by_ids(
        ids, cursor_prop)

    print("Upserting into Supabase...")
    upsert_billing(services, company_links, deal_links)

    # Advance cursor to max timestamp we actually ingested for the chosen property
    new_cursor_ms = max_ts_ms if max_ts_ms is not None else since_ms

    # NOTE(review): new_cursor_ms is only reported below — update_sync_metadata
    # receives now_iso, so the millisecond cursor itself is never persisted
    # (contrary to step 4 of the docstring). Confirm this is intentional.
    update_sync_metadata(supabase_client, "hubspot_billing_services", now_iso)

    print(
        f"Billing sync complete. Advanced cursor to {new_cursor_ms} using prop '{cursor_prop}'.")
413
+
414
+
415
+ # -----------------------------------------------------------------------------
416
+ # CLI
417
+ # -----------------------------------------------------------------------------
418
+
419
+
420
+ def _parse_cli_arg_to_ms(arg: str) -> int:
421
+ """
422
+ Accept:
423
+ - integer epoch ms
424
+ - ISO-8601 (Z or offset)
425
+ - YYYY-MM-DD (floors to 00:00Z)
426
+ """
427
+ # epoch ms or seconds
428
+ if re.fullmatch(r"\d{10,13}", arg):
429
+ v = int(arg)
430
+ if v < 10_000_000_000_000: # seconds -> ms
431
+ v *= 1000
432
+ return v
433
+
434
+ # YYYY-MM-DD
435
+ if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
436
+ d = datetime.datetime.strptime(
437
+ arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
438
+ return to_epoch_ms(floor_to_utc_midnight(d))
439
+
440
+ # ISO-8601
441
+ return to_epoch_ms(arg)
442
+
443
+
444
if __name__ == "__main__":
    # CLI entry point: one optional argument supplies the starting cursor
    # (epoch ms/seconds, ISO-8601, or YYYY-MM-DD); with no argument main()
    # resolves its own default cursor.
    import sys
    if len(sys.argv) > 1:
        try:
            main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
        except Exception as e:
            # NOTE(review): this try also wraps main() itself, so any runtime
            # failure is reported as an "Invalid timestamp" usage error.
            print(
                f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
            raise SystemExit(1)
    else:
        main()
python/hubspot_companies.py ADDED
@@ -0,0 +1,475 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HubSpot Companies → Supabase (incremental since a millisecond cursor)
3
+
4
+ Usage from orchestrator:
5
+ import load_hubspot_companies
6
+ load_hubspot_companies.main(since_ms=<int milliseconds since epoch UTC>)
7
+
8
+ Direct CLI:
9
+ # epoch ms
10
+ python load_hubspot_companies.py 1754025600000
11
+ # ISO-8601
12
+ python load_hubspot_companies.py 2025-08-01T09:30:00Z
13
+ # Back-compat date (floors to 00:00Z)
14
+ python load_hubspot_companies.py 2025-08-01
15
+ """
16
+
17
+ import os
18
+ import re
19
+ import time
20
+ import logging
21
+ import datetime
22
+ from typing import List, Dict, Optional, Tuple, Union
23
+
24
+ import httpx
25
+ import hubspot
26
+ from dotenv import load_dotenv
27
+ from supabase import create_client
28
+ from hubspot.crm.companies import ApiException as CompaniesApiException
29
+
30
+ from hubspot_utils import (
31
+ try_parse_int, parse_ts, get_property_label_mapping,
32
+ )
33
+ from supabase_utils import (
34
+ update_sync_metadata, enrich_supabase_row, upload_raw_json_to_supabase,
35
+ batched_insert, fetch_supabase_table,
36
+ )
37
+
38
+ # -----------------------------------------------------------------------------
39
+ # Logging
40
+ # -----------------------------------------------------------------------------
41
# Ensure the log directory exists before configuring the file handler;
# without this, logging.basicConfig raises FileNotFoundError on a fresh
# checkout where logs/ has not been created yet.
os.makedirs("logs", exist_ok=True)
logging.basicConfig(
    filename=f"logs/hubspot_company_pipeline_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
    filemode="a",  # append so repeated runs on the same day share one file
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
47
+
48
+ # -----------------------------------------------------------------------------
49
+ # Environment
50
+ # -----------------------------------------------------------------------------
51
# Load .env so local runs pick up the same variables as the deployed service.
load_dotenv()
HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
# Optional bootstrap cursor if orchestrator doesn't provide one
BOOTSTRAP_SINCE_MS_ENV = os.getenv("HUBSPOT_COMPANIES_SINCE_MS")

# Fail fast at import time: every entry point below needs both services.
if not HUBSPOT_TOKEN:
    raise RuntimeError("HUBSPOT_TOKEN is not set")
if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
    raise RuntimeError("Supabase env vars are not set")

# Module-level singletons shared by all functions in this file.
hubspot_client = hubspot.Client.create(access_token=HUBSPOT_TOKEN)
supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
65
+
66
+ # -----------------------------------------------------------------------------
67
+ # Config
68
+ # -----------------------------------------------------------------------------
69
# Company properties requested from HubSpot on every read.
# Both "hs_lastmodifieddate" and "lastmodifieddate" are fetched because the
# row mapper falls back from one to the other when building
# hubspot_modified_date.
COMPANY_PROPERTIES = [
    "name",
    "city",
    "company_email",
    "address",
    "address2",
    "domain",
    "number_of_active__cli_s",
    "number_of__mobile___cloned_",
    "createdate",
    "hs_lastmodifieddate",
    "lastmodifieddate",
    "review_date_updated_by___clone",
    "industry",
    "macro_industry_grouping",
    "closest_review_date___clone",
]
86
+
87
+ # -----------------------------------------------------------------------------
88
+ # Time helpers
89
+ # -----------------------------------------------------------------------------
90
+
91
+
92
+ def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
93
+ if dt.tzinfo is None:
94
+ dt = dt.replace(tzinfo=datetime.timezone.utc)
95
+ return dt.astimezone(datetime.timezone.utc)
96
+
97
+
98
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Truncate *dt* to 00:00:00 UTC on the same calendar day."""
    normalized = _ensure_utc(dt)
    midnight = datetime.time(0, tzinfo=datetime.timezone.utc)
    return datetime.datetime.combine(normalized.date(), midnight)
101
+
102
+
103
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """Parse an ISO-8601 string (accepting a trailing 'Z') into an aware UTC datetime."""
    if isinstance(value, str) and value.endswith("Z"):
        # datetime.fromisoformat does not accept the Zulu suffix pre-3.11.
        value = f"{value[:-1]}+00:00"
    return _ensure_utc(datetime.datetime.fromisoformat(value))
108
+
109
+
110
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
    """Convert an ISO-8601 string or datetime to milliseconds since the Unix epoch (UTC)."""
    if isinstance(dt_or_str, datetime.datetime):
        moment = _ensure_utc(dt_or_str)
    elif isinstance(dt_or_str, str):
        moment = _parse_iso_like_to_dt(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    return int(moment.timestamp() * 1000)
118
+
119
+
120
def parse_any_ts_ms(value: Optional[Union[str, int, float]]) -> Optional[int]:
    """Best-effort conversion of ms-epoch / sec-epoch / ISO-8601 input to epoch ms, or None."""
    if value is None:
        return None
    text = str(value)
    try:
        numeric = int(text)
    except ValueError:
        pass
    else:
        # Values below 13 digits are treated as epoch seconds.
        return numeric * 1000 if numeric < 10_000_000_000_000 else numeric
    try:
        return to_epoch_ms(text)
    except Exception:
        logging.warning("Could not parse timestamp value=%r", value)
        return None
135
+
136
+ # -----------------------------------------------------------------------------
137
+ # Search IDs (ts > since_ms) with property fallback
138
+ # -----------------------------------------------------------------------------
139
+
140
+
141
def _search_company_ids_from(since_ms: int, prop: str) -> List[str]:
    """
    Search companies where {prop} > since_ms (epoch-ms).
    Sort ascending so we can advance the cursor monotonically.

    Pages through the CRM v3 search endpoint via the `paging.next.after`
    cursor and returns the collected object IDs.

    Raises:
        httpx.HTTPStatusError: on any 4xx/5xx response (logged first).

    NOTE(review): the search API presumably caps total results per query;
    confirm very large backfills are chunked by the caller.
    """
    url = "https://api.hubapi.com/crm/v3/objects/companies/search"
    headers = {
        "Authorization": f"Bearer {HUBSPOT_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    payload = {
        "filterGroups": [{
            "filters": [
                {"propertyName": prop, "operator": "GT",
                 "value": str(since_ms)},
            ]
        }],
        "limit": 100,
        "sorts": [{"propertyName": prop, "direction": "ASCENDING"}],
    }

    ids: List[str] = []
    after: Optional[str] = None
    with httpx.Client(timeout=30.0) as client:
        while True:
            # Copy the base payload so the paging cursor never leaks between pages.
            body = dict(payload)
            if after:
                body["after"] = after

            resp = client.post(url, headers=headers, json=body)
            if resp.status_code >= 400:
                # Log the response body (JSON if parseable) before raising.
                try:
                    logging.error(
                        "Company search error for prop '%s': %s", prop, resp.json())
                except Exception:
                    logging.error(
                        "Company search error for prop '%s': %s", prop, resp.text)
                resp.raise_for_status()

            data = resp.json()
            ids.extend([obj["id"] for obj in data.get("results", []) or []])

            after = (data.get("paging") or {}).get("next", {}).get("after")
            if not after:
                break
            # Small delay between pages to stay under rate limits.
            time.sleep(0.1)

    return ids
190
+
191
+
192
def search_company_ids_after_ms(since_ms: int) -> Tuple[List[str], str]:
    """
    Find company IDs whose cursor property is strictly greater than *since_ms*.

    Tries cursor properties in order, falling back to the next only when the
    search request fails with an HTTP error:
      1) hs_lastmodifieddate
      2) lastmodifieddate
      3) createdate

    Returns (ids, prop_used). If every property errored, re-raises the last
    HTTP error; the final return is a defensive fallback.
    """
    # Fix: only "createdate" was being tried, contradicting the docstring and
    # leaving companies that were *modified* (but created earlier) out of the
    # incremental sync. Restore the documented modified-date-first fallback.
    props_to_try = ["hs_lastmodifieddate", "lastmodifieddate", "createdate"]
    last_err: Optional[httpx.HTTPStatusError] = None

    for prop in props_to_try:
        try:
            ids = _search_company_ids_from(since_ms, prop)
        except httpx.HTTPStatusError as e:
            last_err = e
            continue
        logging.info(
            "Company search with '%s' returned %d IDs.", prop, len(ids))
        return ids, prop

    if last_err:
        raise last_err
    return [], "hs_lastmodifieddate"
215
+
216
+ # -----------------------------------------------------------------------------
217
+ # Read-by-ID (with associations) → enrich & track max cursor ts
218
+ # -----------------------------------------------------------------------------
219
+
220
+
221
def _enrich_company_data_from_record(
    record,
    industry_map: Dict[str, str],
    macro_industry_map: Dict[str, str],
) -> Dict:
    """
    Flatten a HubSpot company record into a plain dict: requested properties,
    distinct association counts, and label-mapped industry fields.
    """
    props = record.properties or {}
    company_data: Dict[str, Optional[str]] = {"id": record.id}
    company_data.update({name: props.get(name) for name in COMPANY_PROPERTIES})

    def _distinct_assoc_count(bucket_name: str) -> int:
        # Count unique associated object ids in the named association bucket.
        assoc = getattr(record, "associations", None)
        if not assoc:
            return 0
        bucket = assoc.get(bucket_name)
        if not bucket or not getattr(bucket, "results", None):
            return 0
        return len({item.id for item in bucket.results if getattr(item, "id", None)})

    company_data["number_of_associated_contacts"] = _distinct_assoc_count("contacts")
    company_data["number_of_associated_deals"] = _distinct_assoc_count("deals")

    # Replace raw option codes with human-readable labels; unmapped codes → None.
    raw_industry = company_data.get("industry")
    company_data["industry"] = (
        industry_map.get(raw_industry) if raw_industry in industry_map else None
    )

    raw_macro = company_data.get("macro_industry_grouping")
    company_data["macro_industry_grouping"] = (
        macro_industry_map.get(raw_macro) if raw_macro in macro_industry_map else None
    )

    return company_data
254
+
255
+
256
def read_companies_by_ids(company_ids: List[str], cursor_prop: str) -> Tuple[List[Dict], Optional[int]]:
    """
    Read each company by ID (with contact/deal associations) and enrich it.

    Returns:
        (companies, max_ts_ms) where max_ts_ms is the largest observed value
        of *cursor_prop* across the fetched records (epoch ms), or None.

    Per-record errors are logged and skipped so one bad ID does not abort
    the whole batch.
    """
    if not company_ids:
        return [], None

    companies: List[Dict] = []
    assoc_types = ["contacts", "deals"]

    # Fetch label maps once (best-effort: an empty map just yields None labels)
    try:
        industry_map = get_property_label_mapping(
            hubspot_client, "companies", "industry")
    except Exception as e:
        logging.warning("Failed to fetch industry map: %s", e)
        industry_map = {}

    try:
        macro_industry_map = get_property_label_mapping(
            hubspot_client, "companies", "macro_industry_grouping")
    except Exception as e:
        logging.warning("Failed to fetch macro_industry_grouping map: %s", e)
        macro_industry_map = {}

    max_ts_ms: Optional[int] = None

    for i, cid in enumerate(company_ids, start=1):
        try:
            record = hubspot_client.crm.companies.basic_api.get_by_id(
                company_id=cid,
                properties=COMPANY_PROPERTIES,
                associations=assoc_types,
                archived=False,
            )

            # Track max timestamp for the chosen cursor property
            cursor_val = (record.properties or {}).get(cursor_prop)
            ts_ms = parse_any_ts_ms(cursor_val)
            if ts_ms is not None and (max_ts_ms is None or ts_ms > max_ts_ms):
                max_ts_ms = ts_ms

            companies.append(_enrich_company_data_from_record(
                record, industry_map, macro_industry_map))

            if i % 200 == 0:
                logging.info("Read %d companies...", i)

            # Gentle pacing between per-ID reads to respect rate limits.
            time.sleep(0.05)

        except httpx.HTTPStatusError as e:
            logging.error("HTTP error reading company %s: %s", cid, e)
        except (CompaniesApiException, httpx.HTTPError) as e:
            logging.error("Error reading company %s: %s", cid, e)

    return companies, max_ts_ms
309
+
310
+ # -----------------------------------------------------------------------------
311
+ # Map → Supabase rows and diff
312
+ # -----------------------------------------------------------------------------
313
+
314
+
315
def map_company_data_for_db(companies: List[Dict]) -> List[Dict]:
    """Translate enriched HubSpot company dicts into hubspot_companies table rows."""

    def _to_row(c: Dict) -> Dict:
        # Rename HubSpot property names to DB column names and coerce types.
        row = {
            "company_id": try_parse_int(c["id"]),
            "company_name": c.get("name"),
            "company_email": c.get("company_email"),
            "city": c.get("city"),
            "domain": c.get("domain"),
            "street_address": c.get("address"),
            "street_address2": c.get("address2"),
            "hubspot_create_date": parse_ts(c.get("createdate")),
            "hubspot_modified_date": parse_ts(c.get("hs_lastmodifieddate") or c.get("lastmodifieddate")),
            "review_date_updated_by": c.get("review_date_updated_by___clone") or None,
            "number_of_active_clis": try_parse_int(c.get("number_of_active__cli_s")),
            "number_of_clis": try_parse_int(c.get("number_of__mobile___cloned_")),
            "number_of_associated_contacts": c.get("number_of_associated_contacts", 0),
            "number_of_associated_deals": c.get("number_of_associated_deals", 0),
            "industry": c.get("industry"),
            "macro_industry_grouping": c.get("macro_industry_grouping"),
            "closest_review_date": parse_ts(c.get("closest_review_date___clone")),
        }
        return enrich_supabase_row(row)

    return [_to_row(c) for c in companies]
339
+
340
+
341
def companies_are_different(new_row: Dict, old_row: Dict) -> bool:
    """Return True when any tracked company field differs (values compared as strings)."""
    tracked_fields = (
        "company_name", "company_email", "city", "domain", "street_address",
        "street_address2", "hubspot_modified_date", "review_date_updated_by",
        "number_of_active_clis", "number_of_clis",
        "number_of_associated_contacts", "number_of_associated_deals",
        "industry", "macro_industry_grouping", "closest_review_date",
    )
    return any(
        str(new_row.get(field)) != str(old_row.get(field))
        for field in tracked_fields
    )
353
+
354
+ # -----------------------------------------------------------------------------
355
+ # Upsert
356
+ # -----------------------------------------------------------------------------
357
+
358
+
359
def upsert_companies(companies: List[Dict]) -> None:
    """
    Map enriched companies to DB rows and write only new/changed rows.

    Diffs each mapped row against the current hubspot_companies table
    (keyed by company_id) and batch-inserts the differences.
    """
    if not companies:
        print("No companies to upsert.")
        return

    existing = fetch_supabase_table(
        supabase_client, "hubspot_companies", "company_id")
    mapped = map_company_data_for_db(companies)

    rows_to_upsert: List[Dict] = []
    for row in mapped:
        cid = row.get("company_id")
        # Skip rows whose ID failed to parse — nothing to key the upsert on.
        if not cid:
            continue
        old_row = existing.get(str(cid))
        if not old_row or companies_are_different(row, old_row):
            rows_to_upsert.append(row)

    print(f"{len(rows_to_upsert)} companies to insert/update (out of {len(companies)} read).")

    if rows_to_upsert:
        # upload_raw_json_to_supabase(supabase_client, rows_to_upsert, object_type="companies")
        batched_insert(supabase_client, "hubspot_companies",
                       rows_to_upsert, batch_size=1000)
383
+
384
+ # -----------------------------------------------------------------------------
385
+ # Main (timestamp cursor)
386
+ # -----------------------------------------------------------------------------
387
+
388
+
389
def main(since_ms: Optional[int] = None):
    """
    Orchestrates:
      1) Search company IDs with <cursor_prop> > since_ms (property fallback)
      2) Read full companies (track max timestamp for <cursor_prop>)
      3) Upsert into Supabase
      4) Update sync metadata with { last_sync_metadata, last_sync_time, cursor_prop }

    NOTE(review): new_cursor_ms is computed and printed but never passed to
    update_sync_metadata — confirm the orchestrator derives the next since_ms
    from the metadata timestamp, otherwise the advanced cursor is lost.
    """
    # Resolve since_ms: explicit arg > env bootstrap > today@00:00Z
    if since_ms is None and BOOTSTRAP_SINCE_MS_ENV:
        try:
            since_ms = int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            raise RuntimeError(
                "HUBSPOT_COMPANIES_SINCE_MS must be an integer (ms) if set.")

    if since_ms is None:
        # Default: today@00:00:00Z for first run
        today0 = floor_to_utc_midnight(
            datetime.datetime.now(datetime.timezone.utc))
        since_ms = to_epoch_ms(today0)

    print(f"Searching companies with timestamp > {since_ms} ...")
    ids, cursor_prop = search_company_ids_after_ms(since_ms)
    print(f"Search property: {cursor_prop}. Found {len(ids)} company IDs.")

    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()

    if not ids:
        print("No companies beyond the cursor. Updating sync metadata and exiting.")
        update_sync_metadata(supabase_client, "companies", now_iso)
        return

    print("Reading companies (with associations)...")
    companies, max_ts_ms = read_companies_by_ids(ids, cursor_prop)

    print("Upserting into Supabase...")
    upsert_companies(companies)

    # Advance cursor to max timestamp we actually ingested for the chosen property
    new_cursor_ms = max_ts_ms if max_ts_ms is not None else since_ms

    update_sync_metadata(supabase_client, "companies", now_iso)

    print(
        f"Companies sync complete. Advanced cursor to {new_cursor_ms} using prop '{cursor_prop}'.")
435
+
436
+ # -----------------------------------------------------------------------------
437
+ # CLI
438
+ # -----------------------------------------------------------------------------
439
+
440
+
441
+ def _parse_cli_arg_to_ms(arg: str) -> int:
442
+ """
443
+ Accept:
444
+ - integer epoch ms
445
+ - ISO-8601 (Z or offset)
446
+ - YYYY-MM-DD (floors to 00:00Z)
447
+ """
448
+ # epoch ms or seconds
449
+ if re.fullmatch(r"\d{10,13}", arg):
450
+ v = int(arg)
451
+ if v < 10_000_000_000_000: # seconds -> ms
452
+ v *= 1000
453
+ return v
454
+
455
+ # YYYY-MM-DD
456
+ if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
457
+ d = datetime.datetime.strptime(
458
+ arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
459
+ return to_epoch_ms(floor_to_utc_midnight(d))
460
+
461
+ # ISO-8601
462
+ return to_epoch_ms(arg)
463
+
464
+
465
if __name__ == "__main__":
    import sys

    # No argument: sync from the default/bootstrapped cursor.
    if len(sys.argv) <= 1:
        main()
    else:
        try:
            main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
        except Exception as e:
            print(
                f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
            sys.exit(1)
python/hubspot_contacts.py ADDED
@@ -0,0 +1,415 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HubSpot Contacts → Supabase (incremental since a millisecond cursor)
3
+
4
+ Usage from orchestrator:
5
+ import load_hubspot_contacts
6
+ load_hubspot_contacts.main(since_ms=<int milliseconds since epoch UTC>)
7
+ """
8
+
9
+ import os
10
+ import re
11
+ import time
12
+ import logging
13
+ import datetime
14
+ from typing import List, Dict, Tuple, Optional, Union
15
+
16
+ import httpx
17
+ import hubspot
18
+ from dotenv import load_dotenv
19
+ from supabase import create_client
20
+ from hubspot.crm.contacts import ApiException as ContactsApiException
21
+
22
+ from hubspot_utils import (
23
+ try_parse_int, parse_ts, get_property_label_mapping,
24
+ )
25
+ from supabase_utils import (
26
+ fetch_supabase_table, update_sync_metadata, enrich_supabase_row,
27
+ upload_raw_json_to_supabase, batched_insert,
28
+ )
29
+
30
+ # -----------------------------------------------------------------------------
31
+ # Logging
32
+ # -----------------------------------------------------------------------------
33
# Ensure the log directory exists before configuring the file handler;
# without this, logging.basicConfig raises FileNotFoundError on a fresh
# checkout where logs/ has not been created yet.
os.makedirs("logs", exist_ok=True)
logging.basicConfig(
    filename=f"logs/hubspot_contact_pipeline_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
    filemode="a",  # append so repeated runs on the same day share one file
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
39
+
40
+ # -----------------------------------------------------------------------------
41
+ # Environment
42
+ # -----------------------------------------------------------------------------
43
# Load .env so local runs pick up the same variables as the deployed service.
load_dotenv()
HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
# Optional bootstrap cursor if orchestrator doesn't provide one
BOOTSTRAP_SINCE_MS_ENV = os.getenv("HUBSPOT_CONTACTS_SINCE_MS")

# Fail fast at import time: every entry point below needs both services.
if not HUBSPOT_TOKEN:
    raise RuntimeError("HUBSPOT_TOKEN is not set")
if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
    raise RuntimeError("Supabase env vars are not set")

# Module-level singletons shared by all functions in this file.
hubspot_client = hubspot.Client.create(access_token=HUBSPOT_TOKEN)
supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
57
+
58
+ # -----------------------------------------------------------------------------
59
+ # Config
60
+ # -----------------------------------------------------------------------------
61
# Contact properties requested from HubSpot on every read.
# Both "lastmodifieddate" and "hs_lastmodifieddate" are fetched because the
# row mapper falls back from one to the other when building
# hubspot_modified_date.
CONTACT_PROPERTIES = [
    "full_name",
    "firstname",
    "lastname",
    "email",
    "phone",
    "job_title_level",
    "createdate",
    "lastmodifieddate",
    "hs_lastmodifieddate",
    "notes_last_updated",
    "associatedcompanyid",
]
74
+
75
+ # -----------------------------------------------------------------------------
76
+ # Time helpers
77
+ # -----------------------------------------------------------------------------
78
+ def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
79
+ if dt.tzinfo is None:
80
+ dt = dt.replace(tzinfo=datetime.timezone.utc)
81
+ return dt.astimezone(datetime.timezone.utc)
82
+
83
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Truncate *dt* to 00:00:00 UTC on the same calendar day."""
    return _ensure_utc(dt).replace(hour=0, minute=0, second=0, microsecond=0)
86
+
87
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """Parse an ISO-8601 string (trailing 'Z' allowed) into an aware UTC datetime."""
    if isinstance(value, str) and value.endswith("Z"):
        # Normalize the Zulu suffix for datetime.fromisoformat.
        value = value[:-1] + "+00:00"
    return _ensure_utc(datetime.datetime.fromisoformat(value))
92
+
93
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
    """Convert an ISO-8601 string or datetime into milliseconds since the Unix epoch."""
    if isinstance(dt_or_str, str):
        moment = _parse_iso_like_to_dt(dt_or_str)
    elif isinstance(dt_or_str, datetime.datetime):
        moment = _ensure_utc(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    return int(moment.timestamp() * 1000)
101
+
102
def parse_any_ts_ms(value: Optional[Union[str, int, float]]) -> Optional[int]:
    """
    Accepts ms-epoch / sec-epoch / ISO-8601; returns ms since epoch or None.
    Unparseable values are logged at WARNING level and mapped to None.
    """
    if value is None:
        return None
    text = str(value)
    try:
        numeric = int(text)
    except ValueError:
        pass
    else:
        # Anything below 13 digits is treated as epoch seconds.
        return numeric * 1000 if numeric < 10_000_000_000_000 else numeric
    try:
        return to_epoch_ms(text)
    except Exception:
        logging.warning("Could not parse timestamp value=%r", value)
        return None
120
+
121
+ # -----------------------------------------------------------------------------
122
+ # Mapping & helpers
123
+ # -----------------------------------------------------------------------------
124
def map_contact_data_for_db(contacts: List[Dict]) -> List[Dict]:
    """
    Translate enriched HubSpot contact dicts into hubspot_contacts table rows.

    Renames property keys to DB column names, coerces ids/timestamps, and
    runs each row through enrich_supabase_row before returning.
    """
    mapped = []
    for c in contacts:
        base_row = {
            "contact_id": try_parse_int(c["id"]),
            "full_name": c.get("full_name"),
            "first_name": c.get("firstname"),
            "last_name": c.get("lastname"),
            "email": c.get("email"),
            "phone_number": c.get("phone"),
            "job_title_level": c.get("job_title_level"),
            "hubspot_create_date": parse_ts(c.get("createdate")) or None,
            # Portals expose one of the two modified-date properties; prefer
            # "lastmodifieddate" and fall back to "hs_lastmodifieddate".
            "hubspot_modified_date": parse_ts(c.get("lastmodifieddate") or c.get("hs_lastmodifieddate")) or None,
            "hubspot_last_activity_date": parse_ts(c.get("notes_last_updated")) or None,
            "number_of_associated_deals": c.get("number_of_associated_deals", 0),
            "associated_company_id": try_parse_int(c.get("associatedcompanyid")),
        }
        mapped.append(enrich_supabase_row(base_row))
    return mapped
143
+
144
def contacts_are_different(new_row: Dict, old_row: Dict) -> bool:
    """Return True when any synced contact field differs (values compared as strings)."""
    tracked = (
        "full_name", "first_name", "last_name", "email", "phone_number",
        "hubspot_create_date", "hubspot_modified_date",
        "hubspot_last_activity_date", "number_of_associated_deals",
        "associated_company_id",
    )
    return any(str(new_row.get(k)) != str(old_row.get(k)) for k in tracked)
155
+
156
+ # -----------------------------------------------------------------------------
157
+ # Search IDs (ts > since_ms) with property fallback
158
+ # -----------------------------------------------------------------------------
159
def _search_contact_ids_from(since_ms: int, prop: str) -> List[str]:
    """
    Page through the CRM v3 search endpoint collecting contact IDs where
    {prop} > since_ms (epoch-ms), sorted ascending so the caller can advance
    its cursor monotonically. Raises httpx.HTTPStatusError on 4xx/5xx.
    """
    url = "https://api.hubapi.com/crm/v3/objects/contacts/search"
    headers = {
        "Authorization": f"Bearer {HUBSPOT_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    base_payload = {
        "filterGroups": [{
            "filters": [
                {"propertyName": prop, "operator": "GT", "value": str(since_ms)}
            ]
        }],
        "limit": 100,
        "sorts": [{"propertyName": prop, "direction": "ASCENDING"}],
    }

    collected: List[str] = []
    cursor: Optional[str] = None
    with httpx.Client(timeout=30.0) as client:
        while True:
            # Fresh copy per page so the paging cursor never leaks between requests.
            request_body = dict(base_payload)
            if cursor:
                request_body["after"] = cursor

            resp = client.post(url, headers=headers, json=request_body)
            if resp.status_code >= 400:
                # Log the response payload (JSON if parseable) before raising.
                try:
                    logging.error("Contacts search error for prop '%s': %s", prop, resp.json())
                except Exception:
                    logging.error("Contacts search error for prop '%s': %s", prop, resp.text)
                resp.raise_for_status()

            data = resp.json()
            collected.extend(obj["id"] for obj in data.get("results", []) or [])

            cursor = (data.get("paging") or {}).get("next", {}).get("after")
            if not cursor:
                break
            # Small pause between pages to stay under rate limits.
            time.sleep(0.1)

    return collected
205
+
206
def search_contact_ids_after_ms(since_ms: int) -> Tuple[List[str], str]:
    """
    Find contact IDs whose cursor property is strictly greater than *since_ms*.

    Tries cursor properties in order, falling back to the next only when the
    search request fails with an HTTP error:
      1) hs_lastmodifieddate
      2) lastmodifieddate
      3) createdate

    Returns (ids, prop_used). If every property errored, re-raises the last
    HTTP error; the final return is a defensive fallback.
    """
    # Fix: only "createdate" was being tried, contradicting the docstring and
    # leaving contacts that were *modified* (but created earlier) out of the
    # incremental sync. Restore the documented modified-date-first fallback.
    props_to_try = ["hs_lastmodifieddate", "lastmodifieddate", "createdate"]
    last_err: Optional[httpx.HTTPStatusError] = None

    for prop in props_to_try:
        try:
            ids = _search_contact_ids_from(since_ms, prop)
        except httpx.HTTPStatusError as e:
            last_err = e
            continue
        logging.info("Contacts search with '%s' returned %d IDs.", prop, len(ids))
        return ids, prop

    if last_err:
        raise last_err
    return [], "hs_lastmodifieddate"
228
+
229
+ # -----------------------------------------------------------------------------
230
+ # Read-by-ID (with associations) → enrich & track max cursor ts
231
+ # -----------------------------------------------------------------------------
232
def _enrich_contact_data_from_record(record, job_level_map: Optional[Dict[str, str]]) -> Dict:
    """
    Flatten a HubSpot contact record into a plain dict: requested properties,
    a full-name fallback, the distinct deal-association count, and the
    label-mapped job_title_level (when a map is available).
    """
    props = record.properties or {}
    contact_data: Dict[str, Optional[str]] = {"id": record.id}
    contact_data.update({name: props.get(name) for name in CONTACT_PROPERTIES})

    # Full name fallback: synthesize from first/last when missing or blank.
    current_name = contact_data.get("full_name")
    if not current_name or not str(current_name).strip():
        first = (contact_data.get("firstname") or "").strip()
        last = (contact_data.get("lastname") or "").strip()
        contact_data["full_name"] = f"{first} {last}".strip()

    # Count distinct associated deal ids.
    deal_count = 0
    associations = getattr(record, "associations", None)
    if associations and associations.get("deals"):
        bucket = associations["deals"]
        results = getattr(bucket, "results", None)
        if results:
            deal_count = len({entry.id for entry in results if getattr(entry, "id", None)})
    contact_data["number_of_associated_deals"] = deal_count

    # Replace the raw job_title_level code with its label; unmapped codes → None.
    if job_level_map:
        code = contact_data.get("job_title_level")
        contact_data["job_title_level"] = (
            job_level_map.get(code) if code in job_level_map else None
        )

    return contact_data
260
+
261
def read_contacts_by_ids(contact_ids: List[str], cursor_prop: str) -> Tuple[List[Dict], Optional[int]]:
    """
    Read each contact by ID (with deal associations) and enrich it.

    Returns:
        (contacts, max_ts_ms) where max_ts_ms is the largest observed value
        of *cursor_prop* across the fetched records (epoch ms), or None.

    Per-record errors are logged and skipped so one bad ID does not abort
    the whole batch.
    """
    if not contact_ids:
        return [], None

    contacts: List[Dict] = []
    assoc_types = ["deals"]

    # Fetch property label map once (best-effort: None just skips label mapping)
    try:
        job_level_map = get_property_label_mapping(hubspot_client, "contacts", "job_title_level")
    except Exception as e:
        logging.warning("Failed to fetch job_title_level map: %s", e)
        job_level_map = None

    max_ts_ms: Optional[int] = None

    for i, cid in enumerate(contact_ids, start=1):
        try:
            record = hubspot_client.crm.contacts.basic_api.get_by_id(
                contact_id=cid,
                properties=CONTACT_PROPERTIES,
                associations=assoc_types,
                archived=False,
            )

            # Track max timestamp for the chosen cursor property
            cursor_val = (record.properties or {}).get(cursor_prop)
            ts_ms = parse_any_ts_ms(cursor_val)
            if ts_ms is not None and (max_ts_ms is None or ts_ms > max_ts_ms):
                max_ts_ms = ts_ms

            contacts.append(_enrich_contact_data_from_record(record, job_level_map))

            if i % 200 == 0:
                logging.info("Read %d contacts...", i)

            time.sleep(0.05)  # gentle pacing

        except httpx.HTTPStatusError as e:
            logging.error("HTTP error reading contact %s: %s", cid, e)
        except (ContactsApiException, httpx.HTTPError) as e:
            logging.error("Error reading contact %s: %s", cid, e)

    return contacts, max_ts_ms
305
+
306
+ # -----------------------------------------------------------------------------
307
+ # Upsert flow
308
+ # -----------------------------------------------------------------------------
309
def upsert_contacts(contacts: List[Dict]) -> None:
    """
    Map enriched contacts to DB rows and write only new/changed rows.

    Diffs each mapped row against the current hubspot_contacts table
    (keyed by contact_id) and batch-inserts the differences.
    """
    if not contacts:
        print("No contacts to upsert.")
        return

    existing = fetch_supabase_table(supabase_client, "hubspot_contacts", "contact_id")
    rows_to_upsert: List[Dict] = []

    for c in contacts:
        contact_id = try_parse_int(c.get("id"))
        # Skip records whose ID failed to parse — nothing to key the upsert on.
        if not contact_id:
            continue

        # Map one contact at a time so a single bad record can be skipped above.
        mapped_row = map_contact_data_for_db([c])[0]
        existing_row = existing.get(str(contact_id))

        if not existing_row or contacts_are_different(mapped_row, existing_row):
            rows_to_upsert.append(mapped_row)

    print(f"{len(rows_to_upsert)} contacts to insert/update (out of {len(contacts)} read).")

    if rows_to_upsert:
        # upload_raw_json_to_supabase(supabase_client, rows_to_upsert, object_type="contacts")
        batched_insert(supabase_client, "hubspot_contacts", rows_to_upsert, batch_size=1000)
333
+
334
+ # -----------------------------------------------------------------------------
335
+ # Main (timestamp cursor)
336
+ # -----------------------------------------------------------------------------
337
def main(since_ms: Optional[int] = None):
    """
    Orchestrates:
      1) Search contact IDs with <cursor_prop> > since_ms (property fallback)
      2) Read full contacts (track max timestamp for <cursor_prop>)
      3) Upsert into Supabase
      4) Update sync metadata with { last_sync_metadata, last_sync_time, cursor_prop }

    NOTE(review): new_cursor_ms is computed and printed but never passed to
    update_sync_metadata — confirm the orchestrator derives the next since_ms
    from the metadata timestamp, otherwise the advanced cursor is lost.
    """
    # Resolve since_ms: explicit arg > env bootstrap > today@00:00Z
    if since_ms is None and BOOTSTRAP_SINCE_MS_ENV:
        try:
            since_ms = int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            raise RuntimeError("HUBSPOT_CONTACTS_SINCE_MS must be an integer (ms) if set.")

    if since_ms is None:
        # Default: today@00:00:00Z for first run
        today0 = floor_to_utc_midnight(datetime.datetime.now(datetime.timezone.utc))
        since_ms = to_epoch_ms(today0)

    print(f"Searching contacts with timestamp > {since_ms} ...")
    ids, cursor_prop = search_contact_ids_after_ms(since_ms)
    print(f"Search property: {cursor_prop}. Found {len(ids)} contact IDs.")

    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()

    if not ids:
        print("No contacts beyond the cursor. Updating sync metadata and exiting.")
        update_sync_metadata(supabase_client, "contacts", now_iso)
        return

    print("Reading contacts (with associations)...")
    contacts, max_ts_ms = read_contacts_by_ids(ids, cursor_prop)

    print("Upserting into Supabase...")
    upsert_contacts(contacts)

    # Advance cursor to max timestamp we actually ingested for the chosen property
    new_cursor_ms = max_ts_ms if max_ts_ms is not None else since_ms

    update_sync_metadata(supabase_client, "contacts", now_iso)

    print(f"Contacts sync complete. Advanced cursor to {new_cursor_ms} using prop '{cursor_prop}'.")
380
+
381
+ # -----------------------------------------------------------------------------
382
+ # CLI
383
+ # -----------------------------------------------------------------------------
384
+ def _parse_cli_arg_to_ms(arg: str) -> int:
385
+ """
386
+ Accept:
387
+ - integer epoch ms
388
+ - ISO-8601 (Z or offset)
389
+ - YYYY-MM-DD (floors to 00:00Z)
390
+ """
391
+ # epoch ms or seconds
392
+ if re.fullmatch(r"\d{10,13}", arg):
393
+ v = int(arg)
394
+ if v < 10_000_000_000_000: # seconds -> ms
395
+ v *= 1000
396
+ return v
397
+
398
+ # YYYY-MM-DD
399
+ if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
400
+ d = datetime.datetime.strptime(arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
401
+ return to_epoch_ms(floor_to_utc_midnight(d))
402
+
403
+ # ISO-8601
404
+ return to_epoch_ms(arg)
405
+
406
if __name__ == "__main__":
    import sys

    # No argument: sync from the default/bootstrapped cursor.
    if len(sys.argv) <= 1:
        main()
    else:
        try:
            main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
        except Exception as e:
            print(f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
            sys.exit(1)
python/hubspot_deals.py ADDED
@@ -0,0 +1,445 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HubSpot Deals → Supabase (incremental since a millisecond cursor)
3
+
4
+ Usage from orchestrator:
5
+ import load_hubspot_deals
6
+ load_hubspot_deals.main(since_ms=<int milliseconds since epoch UTC>)
7
+
8
+ Direct CLI:
9
+ # epoch ms
10
+ python load_hubspot_deals.py 1754025600000
11
+ # ISO-8601
12
+ python load_hubspot_deals.py 2025-08-01T09:30:00Z
13
+ # Back-compat date (floors to 00:00Z)
14
+ python load_hubspot_deals.py 2025-08-01
15
+ """
16
+ import os
17
+ import re
18
+ import time
19
+ import logging
20
+ import datetime
21
+ from typing import List, Dict, Optional, Tuple, Union
22
+
23
+ import httpx
24
+ import hubspot
25
+ from dotenv import load_dotenv
26
+ from supabase import create_client
27
+ from hubspot.crm.deals import ApiException as DealsApiException
28
+
29
+ from hubspot_utils import (
30
+ parse_ts, try_parse_int, try_parse_float, deduplicate_by_key
31
+ )
32
+ from supabase_utils import (
33
+ insert_into_supabase_table, update_sync_metadata
34
+ )
35
+
36
+ # -----------------------------------------------------------------------------
37
+ # Constants
38
+ # -----------------------------------------------------------------------------
39
# Association type label used when falling back to raw association records.
DEAL_TO_COMPANY_ASSOC_TYPE = "deal_to_company"

# Static mapping of HubSpot team IDs to human-readable team names.
HUBSPOT_TEAM_MAP = {
    "1322863": "Fulfilment Team",
    "1322864": "Customer Services Team",
    "1322865": "Sales Team",
    "1448134": "Accounts & Billing Team",
    "3557793": "Marketing Team",
    "57348939": "MDR Team",
}

# Deal properties requested from the HubSpot API on every read.
DEAL_PROPERTIES = [
    "dealname",
    "hubspot_owner_id",
    "pipeline",
    "dealstage",
    "createdate",  # use createdate; hs_createdate is also seen in some portals
    "hs_createdate",
    "closedate",
    "contract_signed_date",
    "contract_end_date",
    "notes_last_updated",
    "num_contacted_notes",
    "hubspot_team_id",
    "hs_analytics_source",
    "number_of_cli",
    "amount",
    "hs_acv",
    "hs_tcv",
    "margin",
    "source_of_deal_2___migration",
    "hs_primary_associated_company",
    "hs_lastmodifieddate",  # for mapping/debugging
    "lastmodifieddate",
]
74
+
75
+ # -----------------------------------------------------------------------------
76
+ # Logging
77
+ # -----------------------------------------------------------------------------
78
# Ensure the log directory exists before basicConfig opens the file handler;
# logging.basicConfig raises FileNotFoundError when "logs/" is absent.
os.makedirs("logs", exist_ok=True)
logging.basicConfig(
    filename=f"logs/hubspot_deals_pipeline_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
    filemode="a",
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
84
+
85
+ # -----------------------------------------------------------------------------
86
+ # Environment
87
+ # -----------------------------------------------------------------------------
88
# Load configuration from .env and fail fast when required secrets are missing.
load_dotenv()

HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
# Optional bootstrap cursor if orchestrator doesn't provide one
BOOTSTRAP_SINCE_MS_ENV = os.getenv("HUBSPOT_DEALS_SINCE_MS")

if not HUBSPOT_TOKEN:
    raise RuntimeError("HUBSPOT_TOKEN is not set")
if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
    raise RuntimeError("Supabase env vars are not set")

# Shared API clients used by every function below.
hubspot_client = hubspot.Client.create(access_token=HUBSPOT_TOKEN)
supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
102
+
103
+ # -----------------------------------------------------------------------------
104
+ # Time helpers
105
+ # -----------------------------------------------------------------------------
106
+ def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
107
+ if dt.tzinfo is None:
108
+ dt = dt.replace(tzinfo=datetime.timezone.utc)
109
+ return dt.astimezone(datetime.timezone.utc)
110
+
111
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Truncate *dt* to 00:00:00 UTC on the same UTC day."""
    return _ensure_utc(dt).replace(hour=0, minute=0, second=0, microsecond=0)
114
+
115
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """Parse an ISO-8601 string (accepting a trailing 'Z') into a UTC datetime."""
    if isinstance(value, str) and value.endswith("Z"):
        value = f"{value[:-1]}+00:00"
    return _ensure_utc(datetime.datetime.fromisoformat(value))
120
+
121
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
    """Convert an ISO-8601 string or a datetime into epoch milliseconds (UTC).

    Raises TypeError for any other input type.
    """
    if isinstance(dt_or_str, datetime.datetime):
        dt = _ensure_utc(dt_or_str)
    elif isinstance(dt_or_str, str):
        dt = _parse_iso_like_to_dt(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    return int(dt.timestamp() * 1000)
129
+
130
def parse_any_ts_ms(value: Optional[Union[str, int, float]]) -> Optional[int]:
    """
    Parse a timestamp expressed as ms-epoch, sec-epoch, or ISO-8601.

    Returns milliseconds since epoch, or None when the value is missing or
    unparseable (a warning is logged in the latter case).
    """
    if value is None:
        return None
    try:
        v = int(str(value))
        # BUG FIX: the previous threshold (10_000_000_000_000) also matched
        # millisecond epochs (~1.7e12 today), multiplying them by 1000 again.
        # 10_000_000_000 cleanly separates second-epochs (< 1e10 until ~2286)
        # from millisecond-epochs, matching parse_hs_timestamp_ms in the
        # emails loader.
        if v < 10_000_000_000:  # seconds → ms
            v *= 1000
        return v
    except ValueError:
        pass
    try:
        return to_epoch_ms(str(value))
    except Exception:
        logging.warning("Could not parse timestamp value=%r", value)
        return None
148
+
149
+ # -----------------------------------------------------------------------------
150
+ # Pipeline / Stage labels
151
+ # -----------------------------------------------------------------------------
152
def get_pipeline_and_stage_mappings() -> Tuple[Dict[str, str], Dict[str, str]]:
    """Retrieve pipeline and stage label mappings for deals.

    Returns ({pipeline_id: label}, {stage_id: label}); both dicts are empty
    when the HubSpot API call fails (the error is logged, not raised).
    """
    pipelines: Dict[str, str] = {}
    stages: Dict[str, str] = {}
    try:
        resp = hubspot_client.crm.pipelines.pipelines_api.get_all(object_type="deals")
        for pipeline in resp.results:
            pipelines[pipeline.id] = pipeline.label
            for stage in pipeline.stages:
                stages[stage.id] = stage.label
        return pipelines, stages
    except Exception as e:
        logging.error("Failed to fetch pipeline/stage mappings: %s", e)
        return {}, {}
166
+
167
+ # -----------------------------------------------------------------------------
168
+ # Search IDs (ts > since_ms) with property fallback
169
+ # -----------------------------------------------------------------------------
170
def _search_deal_ids_from(since_ms: int, prop: str) -> List[str]:
    """
    Page through the CRM search API collecting IDs of deals whose *prop*
    is strictly greater than *since_ms* (epoch-ms), sorted ascending so
    the caller can advance its cursor monotonically.
    """
    url = "https://api.hubapi.com/crm/v3/objects/deals/search"
    headers = {
        "Authorization": f"Bearer {HUBSPOT_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    base_body = {
        "filterGroups": [{
            "filters": [
                {"propertyName": prop, "operator": "GT", "value": str(since_ms)},
            ]
        }],
        "limit": 100,
        "sorts": [{"propertyName": prop, "direction": "ASCENDING"}],
    }

    found: List[str] = []
    page_cursor: Optional[str] = None
    with httpx.Client(timeout=30.0) as client:
        while True:
            request_body = dict(base_body)
            if page_cursor:
                request_body["after"] = page_cursor

            resp = client.post(url, headers=headers, json=request_body)
            if resp.status_code >= 400:
                # Log the structured error body when available before raising.
                try:
                    logging.error("Deal search error for prop '%s': %s", prop, resp.json())
                except Exception:
                    logging.error("Deal search error for prop '%s': %s", prop, resp.text)
                resp.raise_for_status()

            data = resp.json()
            found.extend(obj["id"] for obj in data.get("results", []) or [])

            page_cursor = (data.get("paging") or {}).get("next", {}).get("after")
            if not page_cursor:
                return found
            time.sleep(0.1)  # stay friendly to the rate limiter
216
+
217
def search_deal_ids_after_ms(since_ms: int) -> Tuple[List[str], str]:
    """
    Search for deal IDs newer than *since_ms*, trying cursor properties in
    order and returning (ids, prop_used) for the first successful search:
      1) createdate
      2) hs_createdate

    CONSISTENCY FIX: the docstring previously advertised hs_lastmodifieddate /
    lastmodifieddate as well, but only the creation-date properties were ever
    queried, and the defensive fallback returned the never-tried
    "hs_lastmodifieddate" as the cursor property. Both now match the code.
    """
    props_to_try = ["createdate", "hs_createdate"]
    last_err: Optional[httpx.HTTPStatusError] = None

    for prop in props_to_try:
        try:
            ids = _search_deal_ids_from(since_ms, prop)
            logging.info("Deal search with '%s' returned %d IDs.", prop, len(ids))
            return ids, prop
        except httpx.HTTPStatusError as e:
            last_err = e
            continue

    if last_err:
        raise last_err
    # Unreachable while props_to_try is non-empty; kept as a defensive default.
    return [], props_to_try[0]
240
+
241
+ # -----------------------------------------------------------------------------
242
+ # Read-by-ID (with associations) → map rows and track max cursor ts
243
+ # -----------------------------------------------------------------------------
244
def _extract_primary_company_id(record, props: Dict) -> Optional[int]:
    """
    Resolve the primary associated company ID for a deal.

    Prefers the hs_primary_associated_company property; otherwise falls back
    to the first company association of type 'deal_to_company' (or any
    association object that lacks a type attribute). Returns None when
    neither source yields an ID.
    """
    primary = props.get("hs_primary_associated_company")
    if primary:
        return try_parse_int(primary)

    assoc = getattr(record, "associations", None)
    if not assoc:
        return None
    companies = assoc.get("companies")
    if not (companies and getattr(companies, "results", None)):
        return None
    for candidate in companies.results:
        if not hasattr(candidate, "type") or getattr(candidate, "type", None) == DEAL_TO_COMPANY_ASSOC_TYPE:
            return try_parse_int(candidate.id)
    return None
259
+
260
def read_deals_by_ids(
    deal_ids: List[str],
    cursor_prop: str,
) -> Tuple[List[Dict], List[Dict], Optional[int]]:
    """
    Read deals by ID (including contacts/companies associations) and map
    them to Supabase row dicts.

    Returns (all_deals, deal_contact_links, max_ts_ms) where max_ts_ms is
    the largest *cursor_prop* timestamp observed across ingested deals
    (None when no deal had a parseable value).
    """
    if not deal_ids:
        return [], [], None

    rows: List[Dict] = []
    contact_links: List[Dict] = []

    assoc_types = ["contacts", "companies"]
    pipeline_map, stage_map = get_pipeline_and_stage_mappings()

    max_ts_ms: Optional[int] = None

    for i, deal_id in enumerate(deal_ids, start=1):
        try:
            record = hubspot_client.crm.deals.basic_api.get_by_id(
                deal_id=deal_id, properties=DEAL_PROPERTIES, associations=assoc_types, archived=False
            )
            props = record.properties or {}

            # Track the max timestamp for the property the search used, so
            # the caller can advance its cursor monotonically.
            ts_ms = parse_any_ts_ms(props.get(cursor_prop))
            if ts_ms is not None and (max_ts_ms is None or ts_ms > max_ts_ms):
                max_ts_ms = ts_ms

            # Contact associations → link rows (numeric IDs only)
            if getattr(record, "associations", None) and record.associations.get("contacts"):
                bucket = record.associations["contacts"]
                if getattr(bucket, "results", None):
                    for assoc in bucket.results:
                        if assoc.id and assoc.id.isdigit():
                            contact_links.append({
                                "deal_id": try_parse_int(record.id),
                                "contact_id": try_parse_int(assoc.id),
                            })

            # Primary company (property first, association fallback)
            company_id = _extract_primary_company_id(record, props)

            # Created date: portals report either createdate or hs_createdate.
            created_iso = props.get("createdate") or props.get("hs_createdate")

            rows.append({
                "deal_id": try_parse_int(record.id),
                "dealname": props.get("dealname"),
                "hubspot_owner_id": try_parse_int(props.get("hubspot_owner_id")),
                "pipeline_id": try_parse_int(props.get("pipeline")),
                "pipeline_label": pipeline_map.get(props.get("pipeline"), ""),
                "dealstage": props.get("dealstage"),
                "dealstage_label": stage_map.get(props.get("dealstage"), ""),
                "hubspot_createdate": parse_ts(created_iso),
                "closedate": parse_ts(props.get("closedate")) or None,
                "contract_signed_date": props.get("contract_signed_date") or None,
                "contract_end_date": props.get("contract_end_date") or None,
                "hubspot_last_activity_date": parse_ts(props.get("notes_last_updated")),
                "num_contacted_notes": props.get("num_contacted_notes"),
                "hubspot_team_id": try_parse_int(props.get("hubspot_team_id")),
                "hubspot_team_label": HUBSPOT_TEAM_MAP.get(props.get("hubspot_team_id"), ""),
                "hs_analytics_source": props.get("hs_analytics_source"),
                "number_of_cli": try_parse_int(props.get("number_of_cli")),
                "amount": try_parse_int(props.get("amount")),
                "annual_contract_value": try_parse_float(props.get("hs_acv")),
                "total_contract_value": try_parse_float(props.get("hs_tcv")),
                "margin": try_parse_float(props.get("margin")),
                "source_of_deal": props.get("source_of_deal_2___migration"),
                "company_id": company_id,
            })

            # Periodic progress log + brief pause to respect rate limits.
            if i % 200 == 0:
                logging.info("Read %d deals...", i)
                time.sleep(0.05)

        except httpx.HTTPStatusError as e:
            logging.error("HTTP error reading deal %s: %s", deal_id, e)
        except (DealsApiException, httpx.HTTPError) as e:
            logging.error("Error reading deal %s: %s", deal_id, e)

    return rows, contact_links, max_ts_ms
345
+
346
+ # -----------------------------------------------------------------------------
347
+ # Upsert
348
+ # -----------------------------------------------------------------------------
349
def upsert_deals(deals: List[Dict], deal_contact_links: List[Dict]) -> None:
    """Upsert deal rows and deduplicated deal→contact link rows into Supabase."""
    if deals:
        insert_into_supabase_table(supabase_client, "hubspot_deals", deals)
        print(f"Upserted {len(deals)} deals.")

    if not deal_contact_links:
        return
    # Deduplicate on the composite key before upserting the link table.
    unique_links = deduplicate_by_key(deal_contact_links, key=("deal_id", "contact_id"))
    insert_into_supabase_table(
        supabase_client,
        "hubspot_deal_contacts",
        unique_links,
        on_conflict=["deal_id", "contact_id"],
    )
    print(f"Upserted {len(unique_links)} deal-contact associations.")
363
+
364
+ # -----------------------------------------------------------------------------
365
+ # Main (timestamp cursor)
366
+ # -----------------------------------------------------------------------------
367
def main(since_ms: Optional[int] = None):
    """
    Run one incremental deals sync:
      1) Search deal IDs with <cursor_prop> > since_ms (property fallback)
      2) Read full deals with associations (tracking the max cursor timestamp)
      3) Upsert into Supabase
      4) Record sync metadata

    since_ms resolution order: explicit argument → HUBSPOT_DEALS_SINCE_MS
    env var → today at 00:00:00Z.
    """
    if since_ms is None and BOOTSTRAP_SINCE_MS_ENV:
        try:
            since_ms = int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            raise RuntimeError("HUBSPOT_DEALS_SINCE_MS must be an integer (ms) if set.")

    if since_ms is None:
        # First run: default to today@00:00:00Z
        since_ms = to_epoch_ms(floor_to_utc_midnight(datetime.datetime.now(datetime.timezone.utc)))

    print(f"Searching deals with timestamp > {since_ms} ...")
    ids, cursor_prop = search_deal_ids_after_ms(since_ms)
    print(f"Search property: {cursor_prop}. Found {len(ids)} deal IDs.")

    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()

    if not ids:
        print("No deals beyond the cursor. Updating sync metadata and exiting.")
        update_sync_metadata(supabase_client, "hubspot_deals", now_iso)
        return

    print("Reading deals (with associations)...")
    deals, deal_contact_links, max_ts_ms = read_deals_by_ids(ids, cursor_prop)

    print("Upserting into Supabase...")
    upsert_deals(deals, deal_contact_links)

    # Advance cursor to the max timestamp actually ingested for cursor_prop.
    # NOTE(review): new_cursor_ms is only reported, never persisted —
    # update_sync_metadata stores now_iso; confirm the orchestrator derives
    # its next since_ms from somewhere else.
    new_cursor_ms = max_ts_ms if max_ts_ms is not None else since_ms

    update_sync_metadata(supabase_client, "hubspot_deals", now_iso)

    print(f"Deals sync complete. Advanced cursor to {new_cursor_ms} using prop '{cursor_prop}'.")
410
+
411
+ # -----------------------------------------------------------------------------
412
+ # CLI
413
+ # -----------------------------------------------------------------------------
414
+ def _parse_cli_arg_to_ms(arg: str) -> int:
415
+ """
416
+ Accept:
417
+ - integer epoch ms
418
+ - ISO-8601 (Z or offset)
419
+ - YYYY-MM-DD (floors to 00:00Z)
420
+ """
421
+ # epoch ms or seconds
422
+ if re.fullmatch(r"\d{10,13}", arg):
423
+ v = int(arg)
424
+ if v < 10_000_000_000_000: # seconds -> ms
425
+ v *= 1000
426
+ return v
427
+
428
+ # YYYY-MM-DD
429
+ if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
430
+ d = datetime.datetime.strptime(arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
431
+ return to_epoch_ms(floor_to_utc_midnight(d))
432
+
433
+ # ISO-8601
434
+ return to_epoch_ms(arg)
435
+
436
if __name__ == "__main__":
    import sys

    # Optional single CLI argument: the starting cursor (epoch ms / ISO / date).
    if len(sys.argv) < 2:
        main()
    else:
        try:
            main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
        except Exception as e:
            print(f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
            sys.exit(1)
python/hubspot_emails.py ADDED
@@ -0,0 +1,472 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HubSpot Emails → Supabase (incremental since a millisecond cursor)
3
+
4
+ Usage from orchestrator:
5
+ import load_hubspot_emails
6
+ load_hubspot_emails.main(since_ms=<int milliseconds since epoch UTC>)
7
+
8
+ Direct CLI:
9
+ # epoch ms
10
+ python load_hubspot_emails.py 1754025600000
11
+
12
+ # ISO-8601 (Z or offset)
13
+ python load_hubspot_emails.py 2025-08-01T09:30:00Z
14
+
15
+ # Back-compat date (floors to 00:00:00Z)
16
+ python load_hubspot_emails.py 2025-08-01
17
+ """
18
+
19
+ import os
20
+ import time
21
+ import logging
22
+ import datetime
23
+ import re
24
+ from typing import List, Dict, Optional, Tuple, Union
25
+
26
+ import httpx
27
+ import hubspot
28
+ from dotenv import load_dotenv
29
+ from supabase import create_client
30
+ from hubspot.crm.objects.emails import ApiException as EmailApiException
31
+
32
+ from hubspot_utils import (
33
+ clean_text,
34
+ )
35
+ from supabase_utils import (
36
+ batched_insert, update_sync_metadata,
37
+ )
38
+
39
+ # -----------------------------------------------------------------------------
40
+ # Logging
41
+ # -----------------------------------------------------------------------------
42
# Ensure the log directory exists before the file handler opens;
# logging.basicConfig raises FileNotFoundError when "logs/" is absent.
os.makedirs("logs", exist_ok=True)
logging.basicConfig(
    filename=f"logs/hubspot_email_pipeline_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
    filemode="a",
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
48
+
49
+ # -----------------------------------------------------------------------------
50
+ # Environment
51
+ # -----------------------------------------------------------------------------
52
# Load configuration from .env and fail fast when required secrets are missing.
load_dotenv()

HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
# Optional bootstrap cursor if orchestrator doesn't provide one
BOOTSTRAP_SINCE_MS_ENV = os.getenv("HUBSPOT_EMAILS_SINCE_MS")

if not HUBSPOT_TOKEN:
    raise RuntimeError("HUBSPOT_TOKEN is not set")
if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
    raise RuntimeError("Supabase env vars are not set")

# Shared API clients used by every function below.
hubspot_client = hubspot.Client.create(access_token=HUBSPOT_TOKEN)
supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
66
+
67
+ # -----------------------------------------------------------------------------
68
+ # Config
69
+ # -----------------------------------------------------------------------------
70
# Email properties requested from the HubSpot API on every read.
EMAIL_PROPERTIES = [
    "hs_email_from_email",
    "hs_email_to_email",
    "hs_email_subject",
    "hs_email_direction",
    "hs_email_text",
    "hs_timestamp",
]
78
+
79
+ # -----------------------------------------------------------------------------
80
+ # Email parsing
81
+ # -----------------------------------------------------------------------------
82
# Loose email matcher used to pull addresses out of free-form strings.
EMAIL_RE = re.compile(r'[\w\.\+\-]+@[\w\.\-]+\.\w+')


def parse_emails(raw: Optional[object]) -> List[str]:
    """
    Extract a sorted, deduplicated, lower-cased list of email addresses.

    *raw* may be None, a comma/semicolon-separated string, a list of such
    strings (non-string list items are skipped), or any other object
    (stringified as a last resort).
    """
    if raw is None:
        return []
    found: List[str] = []
    if isinstance(raw, list):
        for item in raw or []:
            if item and isinstance(item, str):
                for chunk in re.split(r'[;,]', item):
                    found.extend(EMAIL_RE.findall(chunk))
    elif isinstance(raw, str):
        for chunk in re.split(r'[;,]', raw):
            found.extend(EMAIL_RE.findall(chunk))
    else:
        found.extend(EMAIL_RE.findall(str(raw)))
    return sorted({c.strip().lower() for c in found if c and c.strip()})
102
+
103
+ # -----------------------------------------------------------------------------
104
+ # Time helpers
105
+ # -----------------------------------------------------------------------------
106
+
107
+
108
+ def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
109
+ if dt.tzinfo is None:
110
+ dt = dt.replace(tzinfo=datetime.timezone.utc)
111
+ return dt.astimezone(datetime.timezone.utc)
112
+
113
+
114
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Truncate *dt* to 00:00:00 UTC on the same UTC day."""
    return _ensure_utc(dt).replace(hour=0, minute=0, second=0, microsecond=0)
117
+
118
+
119
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """Parse an ISO-8601 string (accepting a trailing 'Z') into a UTC datetime."""
    if value.endswith("Z"):
        value = f"{value[:-1]}+00:00"
    return _ensure_utc(datetime.datetime.fromisoformat(value))
124
+
125
+
126
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
    """Convert an ISO-8601 string or a datetime into epoch milliseconds (UTC).

    Raises TypeError for any other input type.
    """
    if isinstance(dt_or_str, datetime.datetime):
        dt = _ensure_utc(dt_or_str)
    elif isinstance(dt_or_str, str):
        dt = _parse_iso_like_to_dt(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    return int(dt.timestamp() * 1000)
134
+
135
+
136
def parse_hs_timestamp_ms(value: Optional[Union[str, int, float]]) -> Optional[int]:
    """
    Normalize a HubSpot hs_timestamp into epoch milliseconds.

    HubSpot CRM datetime properties often come back as strings. Accepts
    ms-epoch or sec-epoch (as str/int/float) and ISO-8601 strings; returns
    None for missing or unparseable values (logged as a warning).
    """
    if value is None:
        return None
    try:
        ts = int(str(value))
    except ValueError:
        # Not an integer literal — fall back to ISO-8601 parsing.
        try:
            return to_epoch_ms(str(value))
        except Exception:
            logging.warning("Could not parse hs_timestamp=%r", value)
            return None
    # Values below ~1e10 (< ~2001-09 when read as ms) are second-epochs;
    # up-convert to ms just in case.
    if ts < 10_000_000_000:
        ts = ts * 1000
    return ts
162
+
163
+ # -----------------------------------------------------------------------------
164
+ # Search IDs (ts > since_ms)
165
+ # -----------------------------------------------------------------------------
166
+
167
+
168
def search_email_ids_after_ms(since_ms: int) -> List[str]:
    """
    Page through the CRM search API collecting IDs of emails whose
    hs_timestamp is strictly greater than *since_ms*, sorted ascending so
    the maximum timestamp seen downstream is monotonic.
    """
    url = "https://api.hubapi.com/crm/v3/objects/emails/search"
    headers = {
        "Authorization": f"Bearer {HUBSPOT_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    base_body = {
        "filterGroups": [{
            "filters": [
                {"propertyName": "hs_timestamp",
                 "operator": "GT", "value": str(since_ms)}
            ]
        }],
        "limit": 100,
        "sorts": [{"propertyName": "hs_timestamp", "direction": "ASCENDING"}],
        # Only IDs are needed here; full records are read later.
    }

    found: List[str] = []
    page_cursor: Optional[str] = None
    with httpx.Client(timeout=30.0) as client:
        while True:
            request_body = dict(base_body)
            if page_cursor:
                request_body["after"] = page_cursor

            resp = client.post(url, headers=headers, json=request_body)
            if resp.status_code >= 400:
                # Log the structured error body when available before raising.
                try:
                    logging.error("Email search error: %s", resp.json())
                except Exception:
                    logging.error("Email search error: %s", resp.text)
                resp.raise_for_status()

            data = resp.json()
            found.extend(obj["id"] for obj in data.get("results", []) or [])

            page_cursor = (data.get("paging") or {}).get("next", {}).get("after")
            if not page_cursor:
                break
            time.sleep(0.1)  # stay friendly to the rate limiter

    logging.info("Found %d email IDs after %d.", len(found), since_ms)
    return found
218
+
219
+ # -----------------------------------------------------------------------------
220
+ # Read-by-ID (with associations)
221
+ # -----------------------------------------------------------------------------
222
+
223
+
224
def read_emails_by_ids(
    email_ids: List[str]
) -> Tuple[List[Dict], List[Dict], List[Dict], List[Dict], List[Dict], Optional[int]]:
    """
    Read emails by ID with properties and associations
    (companies/contacts/deals/tickets).

    Returns a 6-tuple:
        (email_metadata_data, email_company_links, email_contact_links,
         email_deal_links, email_ticket_links, max_hs_timestamp_ms)

    BUG FIX: the annotation previously declared a 5-element tuple and the
    empty-input path returned only 5 values, while the success path returned
    6 — callers unpacking six elements would crash on empty input.
    """
    if not email_ids:
        return [], [], [], [], [], None

    email_metadata_data: List[Dict] = []
    email_company_links: List[Dict] = []
    email_contact_links: List[Dict] = []
    email_deal_links: List[Dict] = []
    email_ticket_links: List[Dict] = []

    assoc_types = ["companies", "contacts", "deals", "tickets"]
    max_ts_ms: Optional[int] = None

    for i, email_id in enumerate(email_ids, start=1):
        try:
            record = hubspot_client.crm.objects.emails.basic_api.get_by_id(
                email_id, properties=EMAIL_PROPERTIES, associations=assoc_types, archived=False
            )
            props = record.properties or {}

            cleaned = clean_text(props.get("hs_email_text") or "")

            # Robust timestamp handling (ms/sec epoch or ISO string).
            hs_ts_ms = parse_hs_timestamp_ms(props.get("hs_timestamp"))
            if hs_ts_ms is not None:
                if (max_ts_ms is None) or (hs_ts_ms > max_ts_ms):
                    max_ts_ms = hs_ts_ms
                sent_at_iso = datetime.datetime.fromtimestamp(
                    hs_ts_ms / 1000, tz=datetime.timezone.utc).isoformat()
            else:
                sent_at_iso = None

            email_metadata_data.append({
                "email_id": record.id,
                "subject": props.get("hs_email_subject"),
                "from_email": props.get("hs_email_from_email") or "",
                "to_emails": parse_emails(props.get("hs_email_to_email")),
                "sent_at": sent_at_iso,
                "direction": props.get("hs_email_direction"),
                "email_content": cleaned,
            })

            # Association buckets → link rows (numeric IDs only).
            assoc = record.associations or {}
            if assoc.get("companies") and getattr(assoc["companies"], "results", None):
                for a in assoc["companies"].results:
                    if a.id and a.id.isdigit():
                        email_company_links.append(
                            {"email_id": record.id, "company_id": int(a.id)})
            if assoc.get("contacts") and getattr(assoc["contacts"], "results", None):
                for a in assoc["contacts"].results:
                    if a.id and a.id.isdigit():
                        email_contact_links.append(
                            {"email_id": record.id, "contact_id": int(a.id)})
            if assoc.get("deals") and getattr(assoc["deals"], "results", None):
                for a in assoc["deals"].results:
                    if a.id and a.id.isdigit():
                        email_deal_links.append(
                            {"email_id": record.id, "deal_id": int(a.id)})
            if assoc.get("tickets") and getattr(assoc["tickets"], "results", None):
                for a in assoc["tickets"].results:
                    if a.id and a.id.isdigit():
                        email_ticket_links.append(
                            {"email_id": record.id, "ticket_id": int(a.id)})

            # Periodic progress log + brief pause to respect rate limits.
            if i % 200 == 0:
                logging.info("Read %d emails...", i)
                time.sleep(0.05)

        except httpx.HTTPStatusError as e:
            logging.error("HTTP error reading email %s: %s", email_id, e)
        except (EmailApiException, httpx.HTTPError) as e:
            logging.error("Error reading email %s: %s", email_id, e)

    return (
        email_metadata_data, email_company_links, email_contact_links,
        email_deal_links, email_ticket_links, max_ts_ms
    )
308
+
309
+ # -----------------------------------------------------------------------------
310
+ # Upsert
311
+ # -----------------------------------------------------------------------------
312
+
313
+
314
def upsert_emails(
    email_metadata_data: List[Dict],
    email_company_links: List[Dict],
    email_contact_links: List[Dict],
    email_deal_links: List[Dict],
    email_ticket_links: List[Dict],
) -> None:
    """Batch-upsert email metadata and association link rows into Supabase."""
    if email_metadata_data:
        batched_insert(
            supabase_client,
            "hubspot_emails",
            email_metadata_data,
            batch_size=1000,
            # If your helper supports on_conflict:
            # on_conflict=["email_id"]
        )
        print(f"Upserted {len(email_metadata_data)} email metadata rows.")

    # Each link table shares the same insert shape; drive them from a spec.
    link_tables = [
        ("hubspot_email_companies", email_company_links, "company_id", "email-company"),
        ("hubspot_email_contacts", email_contact_links, "contact_id", "email-contact"),
        ("hubspot_email_deals", email_deal_links, "deal_id", "email-deal"),
        ("hubspot_email_tickets", email_ticket_links, "ticket_id", "email-ticket"),
    ]
    for table, rows, id_col, label in link_tables:
        if not rows:
            continue
        batched_insert(
            supabase_client,
            table,
            rows,
            batch_size=1000,
            on_conflict=["email_id", id_col],
        )
        print(f"Upserted {len(rows)} {label} links.")
371
+
372
+ # -----------------------------------------------------------------------------
373
+ # Main (timestamp cursor)
374
+ # -----------------------------------------------------------------------------
375
+
376
+
377
def main(since_ms: Optional[int] = None):
    """
    Run one incremental email sync:
      1) Search email IDs with hs_timestamp > since_ms
      2) Read full emails with associations (tracking the max timestamp)
      3) Upsert into Supabase
      4) Record sync metadata (last_sync_time; helper sets updated_at)

    since_ms resolution order: explicit argument → HUBSPOT_EMAILS_SINCE_MS
    env var → today at 00:00:00Z.
    """
    if since_ms is None and BOOTSTRAP_SINCE_MS_ENV:
        try:
            since_ms = int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            raise RuntimeError(
                "HUBSPOT_EMAILS_SINCE_MS must be an integer (ms) if set.")

    if since_ms is None:
        # First run: default to today@00:00:00Z
        since_ms = to_epoch_ms(
            floor_to_utc_midnight(datetime.datetime.now(datetime.timezone.utc)))

    print(f"Searching emails with hs_timestamp > {since_ms} ...")
    ids = search_email_ids_after_ms(since_ms)
    print(f"Found {len(ids)} email IDs.")

    if not ids:
        print("No emails beyond the cursor. Updating sync metadata and exiting.")
        # Record only last_sync_time; helper sets updated_at
        update_sync_metadata(
            supabase_client, "emails",
            datetime.datetime.now(datetime.timezone.utc).isoformat())
        return

    print("Reading emails (with associations)...")
    (
        email_metadata_data, email_company_links, email_contact_links,
        email_deal_links, email_ticket_links, max_ts_ms
    ) = read_emails_by_ids(ids)

    print("Upserting into Supabase...")
    upsert_emails(
        email_metadata_data,
        email_company_links,
        email_contact_links,
        email_deal_links,
        email_ticket_links,
    )

    # Advance cursor to the max timestamp actually ingested.
    # NOTE(review): new_cursor_ms is only reported, never persisted —
    # update_sync_metadata stores now_iso; confirm the orchestrator derives
    # its next since_ms from somewhere else.
    new_cursor_ms = max_ts_ms if max_ts_ms is not None else since_ms
    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()

    update_sync_metadata(supabase_client, "emails", now_iso)

    print(f"Emails sync complete. Advanced cursor to {new_cursor_ms}.")
432
+
433
+ # -----------------------------------------------------------------------------
434
+ # CLI
435
+ # -----------------------------------------------------------------------------
436
+
437
+
438
+ def _parse_cli_arg_to_ms(arg: str) -> int:
439
+ """
440
+ Accept:
441
+ - integer epoch ms
442
+ - ISO-8601 (Z or offset)
443
+ - YYYY-MM-DD (floors to 00:00Z for convenience/back-compat)
444
+ """
445
+ # epoch ms
446
+ if re.fullmatch(r"\d{10,13}", arg):
447
+ v = int(arg)
448
+ if v < 10_000_000_000_000: # seconds -> ms
449
+ v *= 1000
450
+ return v
451
+
452
+ # YYYY-MM-DD
453
+ if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
454
+ d = datetime.datetime.strptime(
455
+ arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
456
+ return to_epoch_ms(floor_to_utc_midnight(d))
457
+
458
+ # ISO-8601
459
+ return to_epoch_ms(arg)
460
+
461
+
462
if __name__ == "__main__":
    import sys

    # Optional single CLI argument: the starting cursor (epoch ms / ISO / date).
    if len(sys.argv) < 2:
        main()
    else:
        try:
            main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
        except Exception as e:
            print(
                f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
            sys.exit(1)
python/hubspot_tickets.py ADDED
@@ -0,0 +1,491 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HubSpot Tickets → Supabase (incremental since a millisecond cursor)
3
+
4
+ Usage from orchestrator:
5
+ import load_hubspot_tickets
6
+ load_hubspot_tickets.main(since_ms=<int milliseconds since epoch UTC>)
7
+
8
+ Direct CLI:
9
+ # epoch ms
10
+ python load_hubspot_tickets.py 1754025600000
11
+ # ISO-8601
12
+ python load_hubspot_tickets.py 2025-08-01T09:30:00Z
13
+ # Back-compat date (floors to 00:00Z)
14
+ python load_hubspot_tickets.py 2025-08-01
15
+ """
16
+
17
+ import os
18
+ import re
19
+ import time
20
+ import logging
21
+ import datetime
22
+ from typing import List, Dict, Optional, Tuple, Union
23
+
24
+ import httpx
25
+ import hubspot
26
+ from dotenv import load_dotenv
27
+ from supabase import create_client
28
+ from hubspot.crm.tickets import ApiException as TicketsApiException
29
+
30
+ from hubspot_utils import (
31
+ parse_ts, try_parse_int, deduplicate_by_key,
32
+ )
33
+ from supabase_utils import (
34
+ batched_insert, update_sync_metadata,
35
+ )
36
+
37
+ # -----------------------------------------------------------------------------
38
+ # Logging
39
+ # -----------------------------------------------------------------------------
40
+ logging.basicConfig(
41
+ filename=f"logs/hubspot_tickets_pipeline_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
42
+ filemode="a",
43
+ level=logging.INFO,
44
+ format="%(asctime)s [%(levelname)s] %(message)s",
45
+ )
46
+
47
+ # -----------------------------------------------------------------------------
48
+ # Environment
49
+ # -----------------------------------------------------------------------------
50
+ load_dotenv()
51
+ HUBSPOT_TOKEN = os.getenv("HUBSPOT_TOKEN")
52
+ SUPABASE_URL = os.getenv("SUPABASE_URL")
53
+ SUPABASE_SERVICE_ROLE_KEY = os.getenv("SUPABASE_SERVICE_ROLE_KEY")
54
+ # Optional bootstrap cursor if orchestrator doesn't provide one
55
+ BOOTSTRAP_SINCE_MS_ENV = os.getenv("HUBSPOT_TICKETS_SINCE_MS")
56
+
57
+ if not HUBSPOT_TOKEN:
58
+ raise RuntimeError("HUBSPOT_TOKEN is not set")
59
+ if not SUPABASE_URL or not SUPABASE_SERVICE_ROLE_KEY:
60
+ raise RuntimeError("Supabase env vars are not set")
61
+
62
+ hubspot_client = hubspot.Client.create(access_token=HUBSPOT_TOKEN)
63
+ supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
64
+
65
+ # -----------------------------------------------------------------------------
66
+ # Config
67
+ # -----------------------------------------------------------------------------
68
+ # Custom object association (CLI)
69
+ CLI_ASSOC_TYPE_ID = "2-25629083"
70
+ CLI_ASSOC_FALLBACK_KEY = "p8600202_cli_s"
71
+
72
+ TICKET_PROPERTIES = [
73
+ "closed_date",
74
+ "content",
75
+ "ticket_type",
76
+ "createdate",
77
+ "hs_created_by_user_id",
78
+ "hs_lastmodifieddate",
79
+ "hs_object_id",
80
+ "hs_pipeline",
81
+ "hs_pipeline_stage",
82
+ "hs_ticket_priority",
83
+ "subject",
84
+ "hubspot_owner_id",
85
+ "source_type",
86
+ ]
87
+
88
+ # -----------------------------------------------------------------------------
89
+ # Time helpers
90
+ # -----------------------------------------------------------------------------
91
+
92
+
93
+ def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
94
+ if dt.tzinfo is None:
95
+ dt = dt.replace(tzinfo=datetime.timezone.utc)
96
+ return dt.astimezone(datetime.timezone.utc)
97
+
98
+
99
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Truncate *dt* (coerced to UTC) down to 00:00:00.000000 UTC."""
    normalized = _ensure_utc(dt)
    return normalized.replace(hour=0, minute=0, second=0, microsecond=0)
102
+
103
+
104
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """Parse an ISO-8601 string (a trailing "Z" is allowed) into an aware UTC datetime."""
    # fromisoformat cannot digest the "Z" suffix, so rewrite it as an offset.
    normalized = value[:-1] + "+00:00" if value.endswith("Z") else value
    return _ensure_utc(datetime.datetime.fromisoformat(normalized))
109
+
110
+
111
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
    """
    Convert an ISO-8601 string or a datetime into epoch milliseconds (UTC).

    :param dt_or_str: aware/naive datetime (naive treated as UTC) or ISO string.
    :return: milliseconds since the Unix epoch.
    :raises TypeError: for any other input type.
    """
    if isinstance(dt_or_str, datetime.datetime):
        as_utc = _ensure_utc(dt_or_str)
    elif isinstance(dt_or_str, str):
        as_utc = _parse_iso_like_to_dt(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    return int(as_utc.timestamp() * 1000)
119
+
120
+
121
def parse_any_ts_ms(value: Optional[Union[str, int, float]]) -> Optional[int]:
    """
    Normalize a timestamp into milliseconds since the Unix epoch.

    Accepts integral epoch values given as str/int/float (seconds-scale
    values are auto-multiplied by 1000) or an ISO-8601 string. Returns
    None for None input or anything unparseable (a warning is logged).
    """
    if value is None:
        return None

    # Integral epoch path first; non-integral strings fall through to ISO.
    try:
        epoch = int(str(value))
    except ValueError:
        epoch = None
    if epoch is not None:
        # Anything below 10^10 is interpreted as seconds, not ms.
        return epoch * 1000 if epoch < 10_000_000_000 else epoch

    # ISO-8601 fallback.
    try:
        return to_epoch_ms(str(value))
    except Exception:
        logging.warning("Could not parse timestamp value=%r", value)
        return None
145
+
146
+ # -----------------------------------------------------------------------------
147
+ # Pipeline & stage mappings
148
+ # -----------------------------------------------------------------------------
149
+
150
+
151
def get_pipeline_and_stage_mappings() -> Tuple[Dict[str, str], Dict[str, str]]:
    """
    Fetch ticket pipelines from HubSpot and build id→label lookup tables.

    :return: (pipeline_id→label, stage_id→label); both empty on any API error.
    """
    try:
        response = hubspot_client.crm.pipelines.pipelines_api.get_all(
            object_type="tickets")
        pipelines: Dict[str, str] = {}
        stages: Dict[str, str] = {}
        for pipeline in response.results:
            pipelines[pipeline.id] = pipeline.label
            stages.update({stage.id: stage.label for stage in pipeline.stages})
        return pipelines, stages
    except Exception as e:
        logging.error("Failed to fetch pipeline/stage mappings: %s", e)
        return {}, {}
165
+
166
+ # -----------------------------------------------------------------------------
167
+ # Search IDs (ts > since_ms) with property fallback
168
+ # -----------------------------------------------------------------------------
169
+
170
+
171
def _search_ticket_ids_from(since_ms: int, prop: str) -> List[str]:
    """
    Collect IDs of tickets whose ``prop`` value is strictly greater than
    *since_ms*, paging through the CRM v3 search endpoint.

    Results are requested in ascending ``prop`` order so the caller can
    advance its cursor monotonically.

    :raises httpx.HTTPStatusError: on any 4xx/5xx response (body is logged first).
    """
    endpoint = "https://api.hubapi.com/crm/v3/objects/tickets/search"
    request_headers = {
        "Authorization": f"Bearer {HUBSPOT_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    base_payload = {
        "filterGroups": [{
            "filters": [
                {"propertyName": prop, "operator": "GT",
                 "value": str(since_ms)},
            ]
        }],
        "limit": 100,
        "sorts": [{"propertyName": prop, "direction": "ASCENDING"}],
    }

    collected: List[str] = []
    page_cursor: Optional[str] = None
    with httpx.Client(timeout=30.0) as http:
        while True:
            request_body = dict(base_payload)
            if page_cursor:
                request_body["after"] = page_cursor

            response = http.post(
                endpoint, headers=request_headers, json=request_body)
            if response.status_code >= 400:
                # Log the response body (as JSON when possible) before raising.
                try:
                    logging.error(
                        "Ticket search error for prop '%s': %s", prop, response.json())
                except Exception:
                    logging.error(
                        "Ticket search error for prop '%s': %s", prop, response.text)
                response.raise_for_status()

            page = response.json()
            collected.extend(obj["id"]
                             for obj in page.get("results", []) or [])

            page_cursor = (page.get("paging") or {}).get(
                "next", {}).get("after")
            if not page_cursor:
                break
            # Light throttle between pages.
            time.sleep(0.1)

    return collected
220
+
221
+
222
def search_ticket_ids_after_ms(since_ms: int) -> Tuple[List[str], str]:
    """
    Find ticket IDs whose cursor property is strictly greater than *since_ms*.

    Tries each candidate property in turn (currently only "createdate"),
    returning (ids, property_used) for the first one that succeeds. If every
    candidate fails with an HTTP error, the last error is re-raised.
    """
    candidate_props = ["createdate"]
    pending_error = None

    for candidate in candidate_props:
        try:
            found = _search_ticket_ids_from(since_ms, candidate)
        except httpx.HTTPStatusError as exc:
            pending_error = exc
            continue
        logging.info(
            "Ticket search with '%s' returned %d IDs.", candidate, len(found))
        return found, candidate

    if pending_error:
        raise pending_error
    return [], "createdate"
245
+
246
+ # -----------------------------------------------------------------------------
247
+ # Read-by-ID (with associations)
248
+ # -----------------------------------------------------------------------------
249
+
250
+
251
def read_tickets_by_ids(
    ticket_ids: List[str],
    cursor_prop: str,
) -> Tuple[List[Dict], List[Dict], List[Dict], List[Dict], Optional[int]]:
    """
    Read tickets one by one with properties and associations (CLI/deals/contacts).

    :param ticket_ids: HubSpot ticket IDs to fetch.
    :param cursor_prop: property whose maximum observed value is tracked so
                        the caller can advance its incremental-sync cursor.
    :return: (tickets, ticket_cli_links, ticket_deal_links,
              ticket_contact_links, max_ts_ms) where max_ts_ms is the largest
              cursor_prop value seen (ms since epoch), or None.
    """
    if not ticket_ids:
        return [], [], [], [], None

    tickets: List[Dict] = []
    ticket_cli_links: List[Dict] = []
    ticket_deal_links: List[Dict] = []
    ticket_contact_links: List[Dict] = []

    # Associations requested per ticket: CLI custom object plus deals/contacts.
    assoc_types = [CLI_ASSOC_TYPE_ID, "deals", "contacts"]
    pipeline_map, stage_map = get_pipeline_and_stage_mappings()

    max_ts_ms: Optional[int] = None

    for i, tid in enumerate(ticket_ids, start=1):
        try:
            record = hubspot_client.crm.tickets.basic_api.get_by_id(
                tid, properties=TICKET_PROPERTIES, associations=assoc_types, archived=False
            )
            p = record.properties or {}

            # Track max timestamp based on cursor_prop we searched on.
            cursor_val = p.get(cursor_prop)
            ts_ms = parse_any_ts_ms(cursor_val)
            if ts_ms is not None and (max_ts_ms is None or ts_ms > max_ts_ms):
                max_ts_ms = ts_ms

            # Flatten the HubSpot property bag into the Supabase row shape;
            # pipeline/stage internal IDs are also resolved to labels.
            tickets.append({
                "ticket_id": try_parse_int(record.id),
                "subject": p.get("subject"),
                "content": p.get("content"),
                "ticket_type": p.get("ticket_type"),
                "closed_date": parse_ts(p.get("closed_date")),
                "hubspot_modified_date": parse_ts(p.get("hs_lastmodifieddate")),
                "pipeline_id": try_parse_int(p.get("hs_pipeline")),
                "pipeline_label": pipeline_map.get(p.get("hs_pipeline"), ""),
                "ticket_status_id": try_parse_int(p.get("hs_pipeline_stage")),
                "ticket_status_label": stage_map.get(p.get("hs_pipeline_stage"), ""),
                "ticket_priority": p.get("hs_ticket_priority"),
                "source_type": p.get("source_type"),
                "hubspot_owner_id": try_parse_int(p.get("hubspot_owner_id")),
                "hubspot_created_at": parse_ts(p.get("createdate")),
                "hubspot_created_by": try_parse_int(p.get("hs_created_by_user_id")),
            })

            # Associations
            assoc = record.associations or {}

            # CLI (custom object) key may be the numeric type ID or the name key.
            cli_bucket = None
            if assoc.get(CLI_ASSOC_TYPE_ID):
                cli_bucket = assoc[CLI_ASSOC_TYPE_ID]
            elif assoc.get(CLI_ASSOC_FALLBACK_KEY):
                cli_bucket = assoc[CLI_ASSOC_FALLBACK_KEY]

            if cli_bucket and getattr(cli_bucket, "results", None):
                for a in cli_bucket.results:
                    # Non-numeric association IDs are skipped.
                    if a.id and a.id.isdigit():
                        ticket_cli_links.append({
                            "ticket_id": try_parse_int(record.id),
                            "cli_id": try_parse_int(a.id),
                        })

            if assoc.get("deals") and getattr(assoc["deals"], "results", None):
                for a in assoc["deals"].results:
                    if a.id and a.id.isdigit():
                        ticket_deal_links.append({
                            "ticket_id": try_parse_int(record.id),
                            "deal_id": try_parse_int(a.id),
                        })

            if assoc.get("contacts") and getattr(assoc["contacts"], "results", None):
                for a in assoc["contacts"].results:
                    if a.id and a.id.isdigit():
                        ticket_contact_links.append({
                            "ticket_id": try_parse_int(record.id),
                            "contact_id": try_parse_int(a.id),
                        })

            # Progress heartbeat for long runs.
            if i % 200 == 0:
                logging.info("Read %d tickets...", i)

            # Light throttle between per-ticket GETs.
            time.sleep(0.05)

        # Failed tickets are logged and skipped; the batch keeps going.
        except httpx.HTTPStatusError as e:
            logging.error("HTTP error reading ticket %s: %s", tid, e)
        except (TicketsApiException, httpx.HTTPError) as e:
            logging.error("Error reading ticket %s: %s", tid, e)

    return tickets, ticket_cli_links, ticket_deal_links, ticket_contact_links, max_ts_ms
348
+
349
+ # -----------------------------------------------------------------------------
350
+ # Upsert
351
+ # -----------------------------------------------------------------------------
352
+
353
+
354
def upsert_tickets(
    tickets: List[Dict],
    ticket_cli_links: List[Dict],
    ticket_deal_links: List[Dict],
    ticket_contact_links: List[Dict],
) -> None:
    """
    Deduplicate and upsert tickets plus their association link rows into
    Supabase. Empty input lists are skipped entirely.
    """
    # (rows, table, dedupe key, conflict columns, printed label) — the dedupe
    # key is a plain string for single-column keys, a tuple for composites.
    upsert_plan = [
        (tickets, "hubspot_tickets",
         "ticket_id", ["ticket_id"], "tickets"),
        (ticket_cli_links, "hubspot_ticket_clis",
         ("ticket_id", "cli_id"), ["ticket_id", "cli_id"],
         "ticket-cli associations"),
        (ticket_deal_links, "hubspot_ticket_deals",
         ("ticket_id", "deal_id"), ["ticket_id", "deal_id"],
         "ticket-deal associations"),
        (ticket_contact_links, "hubspot_ticket_contacts",
         ("ticket_id", "contact_id"), ["ticket_id", "contact_id"],
         "ticket-contact associations"),
    ]

    for rows, table, dedupe_key, conflict_cols, label in upsert_plan:
        if not rows:
            continue
        unique_rows = deduplicate_by_key(rows, key=dedupe_key)
        batched_insert(
            supabase_client, table, unique_rows,
            batch_size=1000, on_conflict=conflict_cols
        )
        print(f"Upserted {len(unique_rows)} {label}.")
395
+
396
+ # -----------------------------------------------------------------------------
397
+ # Main (timestamp cursor)
398
+ # -----------------------------------------------------------------------------
399
+
400
+
401
def _resolve_since_ms(since_ms: Optional[int]) -> int:
    """Resolve the starting cursor: explicit arg > env bootstrap > today@00:00Z."""
    if since_ms is not None:
        return since_ms
    if BOOTSTRAP_SINCE_MS_ENV:
        try:
            return int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            raise RuntimeError(
                "HUBSPOT_TICKETS_SINCE_MS must be an integer (ms) if set.")
    midnight = floor_to_utc_midnight(
        datetime.datetime.now(datetime.timezone.utc))
    return to_epoch_ms(midnight)


def main(since_ms: Optional[int] = None):
    """
    Run one incremental tickets sync:
      1) search ticket IDs with cursor_prop > since_ms,
      2) read full tickets with associations (tracking the max timestamp),
      3) upsert rows and association links into Supabase,
      4) record the sync time in the sync-metadata table.
    """
    cursor_ms = _resolve_since_ms(since_ms)

    print(f"Searching tickets with timestamp > {cursor_ms} ...")
    ticket_ids, cursor_prop = search_ticket_ids_after_ms(cursor_ms)
    print(f"Search property: {cursor_prop}. Found {len(ticket_ids)} ticket IDs.")

    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()

    if not ticket_ids:
        print("No tickets beyond the cursor. Updating sync metadata and exiting.")
        update_sync_metadata(supabase_client, "hubspot_tickets", now_iso)
        return

    print("Reading tickets (with associations)...")
    (rows, cli_links, deal_links,
     contact_links, max_ts_ms) = read_tickets_by_ids(ticket_ids, cursor_prop)

    print("Upserting into Supabase...")
    upsert_tickets(rows, cli_links, deal_links, contact_links)

    # NOTE(review): new_cursor_ms is computed and reported below, but only
    # now_iso is persisted via update_sync_metadata — confirm the millisecond
    # cursor itself is stored somewhere the next run can read.
    new_cursor_ms = max_ts_ms if max_ts_ms is not None else cursor_ms
    update_sync_metadata(supabase_client, "hubspot_tickets", now_iso)

    print(
        f"Tickets sync complete. Advanced cursor to {new_cursor_ms} using prop '{cursor_prop}'.")
451
+
452
+ # -----------------------------------------------------------------------------
453
+ # CLI
454
+ # -----------------------------------------------------------------------------
455
+
456
+
457
+ def _parse_cli_arg_to_ms(arg: str) -> int:
458
+ """
459
+ Accept:
460
+ - integer epoch ms
461
+ - ISO-8601 (Z or offset)
462
+ - YYYY-MM-DD (floors to 00:00Z)
463
+ """
464
+ # epoch ms or seconds
465
+ if re.fullmatch(r"\d{10,13}", arg):
466
+ v = int(arg)
467
+ if v < 10_000_000_000_000: # seconds -> ms
468
+ v *= 1000
469
+ return v
470
+
471
+ # YYYY-MM-DD
472
+ if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
473
+ d = datetime.datetime.strptime(
474
+ arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
475
+ return to_epoch_ms(floor_to_utc_midnight(d))
476
+
477
+ # ISO-8601
478
+ return to_epoch_ms(arg)
479
+
480
+
481
# CLI entry point: an optional first argument supplies the sync cursor.
if __name__ == "__main__":
    import sys
    if len(sys.argv) > 1:
        try:
            # NOTE(review): this try also wraps main(), so any runtime
            # failure during the sync is reported as "Invalid timestamp".
            main(since_ms=_parse_cli_arg_to_ms(sys.argv[1]))
        except Exception as e:
            print(
                f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}")
            sys.exit(1)
    else:
        # No argument: main() resolves its own default cursor.
        main()
python/hubspot_utils.py ADDED
@@ -0,0 +1,946 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This script contains utility functions for working with HubSpot data.
3
+ """
4
+ import time
5
+ import datetime
6
+ import re
7
+ import httpx
8
+ import html
9
+ import requests
10
+ import logging
11
+ from typing import Dict, List, Optional, Tuple
12
+ from collections import defaultdict
13
+ import pandas as pd
14
+ from hubspot.crm.objects import ApiException as ObjectApiException
15
+ from hubspot.crm.contacts.exceptions import ApiException as ContactsApiException
16
+ from hubspot.crm.companies.exceptions import ApiException as CompaniesApiException
17
+ from hubspot.crm.contacts import (
18
+ PublicObjectSearchRequest, Filter, FilterGroup, BatchInputSimplePublicObjectId
19
+ )
20
+ from hubspot.crm.companies import (
21
+ PublicObjectSearchRequest as CompanySearchRequest,
22
+ Filter as CompanyFilter,
23
+ FilterGroup as CompanyFilterGroup
24
+ )
25
+
26
+ _MISSING = {None, ""}
27
+
28
+
29
def serialize(obj):
    """
    Recursively convert *obj* into plain JSON-friendly Python values.

    Handling order: objects exposing ``to_dict`` → lists → dicts →
    datetimes (ISO-8601 strings) → objects with ``__dict__`` (underscore
    attributes are dropped) → everything else via ``str``.

    :param obj: the object to serialize (e.g. a HubSpot API response).
    :return: nested dicts/lists/strings safe to store in a DB or file.
    """
    if hasattr(obj, "to_dict"):
        return serialize(obj.to_dict())
    if isinstance(obj, list):
        return [serialize(element) for element in obj]
    if isinstance(obj, dict):
        return {name: serialize(inner) for name, inner in obj.items()}
    if isinstance(obj, datetime.datetime):
        return obj.isoformat()
    if hasattr(obj, "__dict__"):
        return {
            name: serialize(inner)
            for name, inner in obj.__dict__.items()
            if not name.startswith("_")
        }
    return str(obj)
63
+
64
+
65
def try_parse_int(value):
    """
    Convert *value* to ``int``, returning ``None`` on any failure.

    ``None`` input, non-numeric strings, and incompatible types all yield
    ``None`` instead of raising.
    """
    try:
        return int(value) if value is not None else None
    except (ValueError, TypeError):
        return None
81
+
82
+
83
def try_parse_float(value):
    """
    Convert *value* to ``float``, returning ``None`` on any failure.

    ``None`` input, non-numeric strings, and incompatible types all yield
    ``None`` instead of raising.
    """
    try:
        return float(value) if value is not None else None
    except (ValueError, TypeError):
        return None
98
+
99
+
100
def parse_ts(ts):
    """
    Normalize a HubSpot timestamp into an ISO-8601 string.

    Accepts an epoch value in milliseconds (int/float/digit string) or an
    ISO-8601 string (a trailing "Z" is treated as UTC).

    :param ts: the timestamp to normalize.
    :return: ISO-8601 string, or None for falsy input or unparseable values
             (parse failures are logged).
    """
    if not ts:
        return None
    try:
        is_epoch = isinstance(ts, (int, float)) or str(ts).isdigit()
        if is_epoch:
            # Epoch milliseconds → aware UTC datetime.
            parsed = datetime.datetime.fromtimestamp(
                int(ts) / 1000.0, tz=datetime.timezone.utc)
        else:
            # fromisoformat cannot digest the "Z" suffix directly.
            parsed = datetime.datetime.fromisoformat(
                str(ts).replace("Z", "+00:00"))
    except (ValueError, TypeError) as e:
        logging.error(
            "Failed to parse timestamp '%s': %s", ts, str(e)
        )
        return None
    return parsed.isoformat()
130
+
131
+
132
def deduplicate_by_key(rows, key):
    """
    Keep the first occurrence of each key value and drop the rest.

    :param rows: list of dicts to filter.
    :param key: a single key name (str) or a tuple of key names. For a tuple,
                the combined value-tuple is the dedup key and a missing key
                raises ``KeyError``; for a string, missing keys resolve to None.
    :return: rows with duplicates removed. NOTE: rows whose single-key value
             is falsy (None, "", 0) are dropped entirely, not kept.
    """
    composite = isinstance(key, tuple)
    seen_values = set()
    kept = []
    for row in rows:
        if composite:
            dedup_val = tuple(row[part] for part in key)
        else:
            dedup_val = row.get(key)
        if not dedup_val or dedup_val in seen_values:
            continue
        seen_values.add(dedup_val)
        kept.append(row)
    return kept
153
+
154
+
155
def fix_surrogates(text: str) -> str:
    """
    Repair surrogate code points left in *text*.

    Email bodies occasionally carry surrogates that break later UTF-8
    encoding; round-tripping through UTF-16 with ``surrogatepass`` re-pairs
    them into valid characters.

    :param text: possibly-broken string.
    :return: a string safe to encode as UTF-8.
    """
    as_utf16 = text.encode("utf-16", "surrogatepass")
    return as_utf16.decode("utf-16")
164
+
165
+
166
def strip_footer(text: str) -> str:
    """
    Strip common email footers (confidentiality notices, legal boilerplate)
    from *text*.

    The first match of any known footer pattern truncates the text at that
    point; if nothing matches, the text is returned unchanged.

    :param text: email body text.
    :return: text with the first recognized footer (and everything after it)
             removed, stripped of surrounding whitespace.
    """
    footer_cutoff_patterns = [
        # BUG FIX: was r"...\\d+..." — inside a raw string that matches a
        # literal backslash followed by "d", so the pattern could never
        # match a real registration number.
        r"(?i)Registration Number \d+[A-Z]?",
        r"(?i)this e[- ]?mail message contains confidential information",
        r"(?i)this email and any attachments.*?confidential",
        r"(?i)if you are not the intended recipient",
        r"(?i)please notify us immediately by return e[- ]?mail",
        r"(?i)no liability is accepted for any damage",
        r"(?i)this message is intended only for the use of the individual",
        r"(?i)confidentiality notice",
        r"(?i)please consider the environment before printing",
        r"(?i)registered in England and Wales",
        r"(?i)the views expressed in this email are those of the sender",
        r"(?i)accepts no liability",
        r"(?i)has taken steps to ensure",
    ]

    for pattern in footer_cutoff_patterns:
        match = re.search(pattern, text)
        if match:
            # Cut at the start of the first footer marker found.
            return text[:match.start()].strip()
    return text
191
+
192
+
193
def clean_text(raw_text: str) -> str:
    """
    Clean an email body: repair surrogates, strip HTML, unescape entities,
    and trim quoted/forwarded sections, footers, and signatures.

    :param raw_text: raw (possibly HTML) email body; None is treated as "".
    :return: plain-text body with boilerplate removed and blank runs collapsed.
    """
    # Defensive defaults
    if raw_text is None:
        raw_text = ""

    # Optional helper fallbacks — keeps this function usable even when the
    # sibling helpers are not importable.
    try:
        fixed = fix_surrogates(raw_text)
    except NameError:
        # If fix_surrogates isn't available, just pass through
        fixed = raw_text

    # Start from the corrected source
    text = fixed

    # Normalize Windows/Mac line endings early (before HTML handling)
    text = text.replace("\r\n", "\n").replace("\r", "\n")

    # Normalize common HTML line breaks to real newlines before removing tags
    text = re.sub(r"(?i)<br\s*/?>", "\n", text)

    # Strip HTML tags
    text = re.sub(r"<[^>]+>", "", text)

    # Unescape HTML entities
    text = html.unescape(text)

    # --- Trim quoted / forwarded blocks heuristics ---
    # Three reply/forward markers: "On ... wrote:", "From:/Sent:/..."
    # header lines, and explicit forwarded-message separators.
    on_wrote_pat = re.compile(r"(?im)^[>\s]*on\s+.+?wrote:")
    fwd_hdr_pat = re.compile(
        r"(?im)^[>\s]*(?:from|sent|date|subject|to)\s*:\s.+$")
    sep_pat = re.compile(
        r"(?im)^(?:[>\s]*-----\s*original message\s*-----|[>\s]*begin forwarded message)")

    # Cut at the earliest marker found (if any).
    cut_idx = None
    for m in (on_wrote_pat.search(text), fwd_hdr_pat.search(text), sep_pat.search(text)):
        if m:
            idx = m.start()
            cut_idx = idx if cut_idx is None or idx < cut_idx else cut_idx

    if cut_idx is not None:
        text = text[:cut_idx].rstrip("\n")

    # --- Line-by-line cleanup, footer & signature handling ---
    lines = text.split("\n")
    cleaned = []

    footer_regex = re.compile(
        r"(?i)\b(confidential|privacy policy|unsubscribe|follow us|visit our website|"
        r"please consider the environment|registered office|copyright|"
        r"this e-?mail and any attachments|do not print this email)\b"
    )
    sig_sep = re.compile(r"^--\s?$")  # standard sig delimiter

    for line in lines:
        stripped = line.strip()

        # Skip pure separators
        if re.match(r"^[-=_*]{3,}\s*$", stripped):
            continue

        # Skip common footer lines
        if footer_regex.search(stripped):
            continue

        # Stop including anything after a signature separator
        if sig_sep.match(stripped):
            break

        # Keep empty lines (collapse later)
        if stripped == "":
            cleaned.append("")
            continue

        cleaned.append(stripped)

    out = "\n".join(cleaned)
    # Collapse 3+ consecutive blank lines down to a single blank line.
    out = re.sub(r"\n{3,}", "\n\n", out).strip()

    # Optional final footer stripping
    try:
        out = strip_footer(out)
    except NameError:
        pass

    return out
283
+
284
+
285
def get_search_config(hubspot_client, object_type):
    """
    Return the filter/search classes and search API for a HubSpot object type.

    :param hubspot_client: configured HubSpot client.
    :param object_type: "contacts" or "companies".
    :return: dict with keys FilterCls, FilterGroupCls, SearchRequestCls,
             search_api, modified_prop.
    :raises ValueError: for any other object_type.
    """
    # Builders are lazy so attribute access on the client only happens for
    # the requested object type.
    builders = {
        "contacts": lambda: {
            "FilterCls": Filter,
            "FilterGroupCls": FilterGroup,
            "SearchRequestCls": PublicObjectSearchRequest,
            "search_api": hubspot_client.crm.contacts.search_api,
            "modified_prop": "createdate",
        },
        "companies": lambda: {
            "FilterCls": CompanyFilter,
            "FilterGroupCls": CompanyFilterGroup,
            "SearchRequestCls": CompanySearchRequest,
            "search_api": hubspot_client.crm.companies.search_api,
            "modified_prop": "createdate",
        },
    }

    if object_type not in builders:
        raise ValueError(f"Unsupported object_type '{object_type}'")
    return builders[object_type]()
313
+
314
+
315
def get_property_label_mapping(hubspot_client, object_type: str, property_name: str) -> dict:
    """
    Map a HubSpot enumeration property's internal values to display labels.

    :param hubspot_client: configured HubSpot client.
    :param object_type: "contacts" or "companies".
    :param property_name: internal property name (e.g. "industry").
    :return: {internal_value: label}; empty dict if the lookup fails
             (the failure is logged as a warning).
    """
    try:
        definition = hubspot_client.crm.properties.core_api.get_by_name(
            object_type, property_name)
        return {option.value: option.label for option in definition.options}
    except Exception as e:
        logging.warning("Failed to fetch mapping for %s: %s",
                        property_name, str(e))
        return {}
332
+
333
+
334
def build_filter_request(since, after):
    """
    Build a contact search request for records created on/after *since*.

    :param since: aware datetime lower bound applied to ``createdate`` (GTE).
    :param after: pagination offset token, or None for the first page.
    :return: PublicObjectSearchRequest with the standard contact properties
             and a page size of 100.
    """
    since_ms = int(since.timestamp() * 1000)
    created_filter = Filter(property_name="createdate",
                            operator="GTE", value=since_ms)
    contact_properties = [
        "full_name",
        "firstname",
        "lastname",
        "email",
        "phone",
        "createdate",
        "lastmodifieddate",
        "lastactivitydate",
        "associatedcompanyid",
    ]
    return PublicObjectSearchRequest(
        filter_groups=[FilterGroup(filters=[created_filter])],
        properties=contact_properties,
        limit=100,
        after=after
    )
363
+
364
+
365
def fetch_custom_object(hubspot_client, object_name="tickets", properties=None,
                        associations=None):
    """
    Fetches all records of a custom HubSpot object and returns them as a list of dictionaries.

    Pages through the CRM objects API 100 records at a time, normalizing each
    record into a flat dict. The record id is stored under the key
    "billing_id" (presumably because this helper serves the billing pipeline
    — confirm before reusing for other objects).

    :param hubspot_client: The HubSpot client object.
    :param object_name: The name of the custom object to fetch. Defaults to "tickets".
    :param properties: A list of properties to include in the response. If None, all properties are included.
    :param associations: A list of associations to include in the response. If None, no associations are included.
    :return: A list of dictionaries, where each dictionary represents a record of the custom object.
             On an ObjectApiException the error is logged and the records
             collected so far are returned (partial results, no raise).
    """
    all_objects = []
    after = None  # pagination cursor; None fetches the first page

    while True:
        try:
            response = hubspot_client.crm.objects.basic_api.get_page(
                object_type=object_name,
                properties=properties,
                limit=100,
                after=after,
                archived=False,
                associations=associations
            )

            if not response.results:
                break

            for record in response.results:
                props = record.properties
                object_id = record.id

                # exclude hs_object_id and optionally filter properties
                if properties:
                    filtered_props = {k: props.get(
                        k) for k in properties if k != "hs_object_id"}
                else:
                    filtered_props = {
                        k: v for k, v in props.items() if k != "hs_object_id"}

                object_dict = {"billing_id": object_id, **filtered_props}

                # cast ints if present
                for key in ["hs_created_by_user_id"]:
                    if key in object_dict:
                        object_dict[key] = try_parse_int(object_dict[key])

                # parse timestamps if present
                for key in ["hs_createdate", "hs_lastmodifieddate"]:
                    if key in object_dict:
                        object_dict[key] = parse_ts(object_dict[key])

                # include associations if requested
                if associations and record.associations:
                    assoc_data = {}
                    for assoc_type, assoc_records in record.associations.items():
                        assoc_data[assoc_type] = [
                            ar.id for ar in assoc_records.results]
                    object_dict["associations"] = assoc_data

                all_objects.append(object_dict)

            if response.paging and response.paging.next:
                after = response.paging.next.after
            else:
                break

            # Light throttle between pages to stay under HubSpot rate limits.
            time.sleep(0.1)

        except ObjectApiException as e:
            # Log and stop paging; partial results are still returned.
            logging.error("Exception when fetching %s: %s", object_name, e)
            break

    return all_objects
440
+
441
+
442
def fetch_total_objects(
    hubspot_client,
    object_type: str,
    modified_after: Optional[datetime.datetime] = None,
    archived: bool = False
) -> int:
    """
    Fetches the total number of HubSpot objects of the given type.

    Two strategies are used:
      * archived=True  -> the search API does not cover archived records,
        so the count is accumulated by paging the basic API 100 at a time.
      * archived=False -> a single search request with limit=1 is issued
        and the API-provided ``response.total`` is read.

    :param hubspot_client: HubSpot client
    :param object_type: "contacts" or "companies"
    :param modified_after: Only used for non-archived search_api queries;
                           filters on the config's "modified_prop"
                           (createdate) at epoch-ms granularity.
    :param archived: Whether to count archived records
    :return: Total number of matching objects, or 0 when a handled error
             (API/HTTP/ValueError) occurs.
    """
    try:
        if archived:
            total = 0
            after = None  # basic-API pagination cursor
            while True:
                if object_type == "contacts":
                    response = hubspot_client.crm.contacts.basic_api.get_page(
                        limit=100, archived=True, after=after
                    )
                elif object_type == "companies":
                    response = hubspot_client.crm.companies.basic_api.get_page(
                        limit=100, archived=True, after=after
                    )
                else:
                    raise ValueError(
                        f"Unsupported object_type '{object_type}' for archived=True")

                total += len(response.results)

                if response.paging and response.paging.next:
                    after = response.paging.next.after
                else:
                    break
                # Throttle between pages to respect rate limits.
                time.sleep(0.1)

            logging.info("Total %s (archived): %d", object_type, total)
            print(f"Total {object_type} (archived): {total}")
            return total

        # Non-archived path via search API
        config = get_search_config(hubspot_client, object_type)
        filters = []

        if modified_after:
            # Search filters take epoch milliseconds.
            value = int(modified_after.timestamp() * 1000)
            filters.append(config["FilterCls"](
                property_name=config["modified_prop"], operator="GTE", value=value
            ))

        # limit=1: we only need the "total" field, not the records.
        request_body = config["SearchRequestCls"](
            filter_groups=[config["FilterGroupCls"](
                filters=filters)] if filters else None,
            properties=["id"],
            limit=1
        )
        response = config["search_api"].do_search(request_body)
        total = response.total

        logging.info("Total %s in HubSpot: %d", object_type, total)
        print(f"Total {object_type} in HubSpot: {total}")
        return total

    except (ContactsApiException, CompaniesApiException) as api_err:
        logging.error("API error occurred while fetching %s: %s",
                      object_type, str(api_err))
    except httpx.HTTPError as http_err:
        logging.error("HTTP error occurred while fetching %s: %s",
                      object_type, str(http_err))
    except ValueError as val_err:
        logging.error("ValueError while fetching %s: %s",
                      object_type, str(val_err))
    except BaseException as critical_err:
        # Broad on purpose: log at critical level, then re-raise so
        # truly unexpected failures (incl. KeyboardInterrupt) propagate.
        logging.critical("Unexpected error fetching %s: %s",
                         object_type, str(critical_err))
        raise

    # Reached only after a handled (logged, non-critical) error above.
    return 0
525
+
526
+
527
def _coalesce(*vals):
    """
    Return the first value that is not considered "missing".

    Missing-ness is defined by membership in the module-level ``_MISSING``
    collection (None / empty string).

    :param vals: Candidate values, checked left to right.
    :return: The first non-missing candidate, or None when all are missing.
    """
    return next((candidate for candidate in vals if candidate not in _MISSING), None)
540
+
541
+
542
def parse_ts_dt(ts: Optional[str]) -> Optional[datetime.datetime]:
    """
    Parse an ISO 8601 timestamp string into a UTC-aware datetime.

    A trailing 'Z' suffix is accepted as an alias for '+00:00'.

    :param ts: The timestamp string to parse (None/empty allowed).
    :return: Timezone-aware datetime in UTC, or None when the input is
             missing or cannot be parsed.
    """
    if not ts:
        return None
    normalized = str(ts).replace("Z", "+00:00")
    try:
        parsed = datetime.datetime.fromisoformat(normalized)
        return parsed.astimezone(datetime.timezone.utc)
    except Exception:
        # Unparseable input is treated as "no timestamp".
        return None
555
+
556
+
557
def to_epoch_ms_from_utc_iso(date_str: str) -> int:
    """
    Convert a date string to milliseconds since the Unix epoch (UTC).

    Accepts either 'YYYY-MM-DD' (interpreted as midnight UTC) or an
    ISO 8601 timestamp (a trailing 'Z' is accepted). ISO timestamps
    without an explicit UTC offset are assumed to be UTC, so the result
    no longer depends on the host machine's local timezone.

    :param date_str: The date string to convert.
    :return: The number of milliseconds since the Unix epoch.
    :raises ValueError: If ``date_str`` is empty or unparseable.
    """
    if not date_str:
        raise ValueError("date_str is required")
    if len(date_str) == 10 and date_str[4] == "-" and date_str[7] == "-":
        # YYYY-MM-DD -> midnight UTC
        dt = datetime.datetime.strptime(
            date_str, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
    else:
        dt = datetime.datetime.fromisoformat(date_str.replace("Z", "+00:00"))
        if dt.tzinfo is None:
            # fix: a naive timestamp was previously converted through the
            # machine's local timezone by .timestamp(); treat it as UTC.
            dt = dt.replace(tzinfo=datetime.timezone.utc)
    return int(dt.timestamp() * 1000)
573
+
574
+
575
def request_with_retry(method: str,
                       url: str,
                       headers: Dict[str, str],
                       params: Dict[str, str],
                       initial_backoff: float = 1.0,
                       max_backoff: float = 16.0,
                       retries: int = 8) -> Dict:
    """
    Sends a request to the given URL with the given method, headers, and parameters, and
    retries the request up to the given number of times if it fails with a retryable
    HubSpot error (429, 500, 502, 503, 504) or a network error (connection/timeout).
    Backoff doubles after each retryable failure, capped at ``max_backoff``.

    Any other HTTP error raises via ``resp.raise_for_status()``; that exception
    (and a JSON-decode failure on a 2xx response) falls through to the generic
    handler below, which logs once more and re-raises.

    :param method: The HTTP method to use (e.g. GET, POST).
    :param url: The URL to send the request to.
    :param headers: A dictionary of HTTP headers to include in the request.
    :param params: A dictionary of URL parameters to include in the request.
    :param initial_backoff: The initial backoff time in seconds.
    :param max_backoff: The maximum backoff time in seconds.
    :param retries: The number of times to retry the request before giving up.
    :return: The JSON response from the request, decoded as a dictionary.
    :raises RuntimeError: If all retry attempts fail.
    """
    backoff = initial_backoff
    for attempt in range(1, retries + 1):
        try:
            resp = requests.request(
                method, url, headers=headers, params=params, timeout=60)
            if resp.status_code < 400:
                try:
                    return resp.json()
                except Exception as e:
                    # A 2xx body that is not JSON is unrecoverable; the
                    # re-raise lands in the generic handler below.
                    logging.error(
                        "Failed to decode JSON response from %s: %s", url, e)
                    raise

            # HubSpot rate limits or transient server error
            if resp.status_code in (429, 500, 502, 503, 504):
                logging.warning(
                    "HubSpot API %s (%d). Retrying in %.1fs (attempt %d/%d)...",
                    url, resp.status_code, backoff, attempt, retries
                )
                time.sleep(backoff)
                backoff = min(max_backoff, backoff * 2)
                continue

            # Permanent error
            logging.error("HubSpot API error %s: %s",
                          resp.status_code, resp.text)
            resp.raise_for_status()

        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout) as e:
            # Transient network failure: same exponential backoff schedule.
            logging.warning(
                "Network error on attempt %d/%d (%s). Retrying in %.1fs...",
                attempt, retries, str(e), backoff
            )
            time.sleep(backoff)
            backoff = min(max_backoff, backoff * 2)
            continue

        except Exception as e:
            # Permanent failures (HTTPError from raise_for_status, JSON
            # decode errors, anything unexpected) stop the retry loop.
            logging.error("Unexpected error during HubSpot request: %s", e)
            raise

    # If we exhausted retries
    raise RuntimeError(f"Failed after {retries} attempts calling {url}")
643
+
644
+
645
def page_account_activity(base_url: str,
                          token: str,
                          occurred_after_ms: int,
                          occurred_before_ms: int,
                          limit: int = 200,
                          max_pages: int = 1000) -> List[Dict]:
    """
    Fetches all account activity events from the given base URL that occurred
    within the given time window (milliseconds since epoch).

    The time-window parameter names differ per endpoint: audit-log URLs use
    occurredAfter/occurredBefore, security URLs use fromTimestamp/toTimestamp.
    Pagination stops on: empty page, no next cursor, a known terminal cursor
    value, a repeated cursor, or the max_pages cap.

    :param base_url: The base URL to fetch events from.
    :param token: The HubSpot API token to use for authentication.
    :param occurred_after_ms: The timestamp in milliseconds to fetch events after.
    :param occurred_before_ms: Optional upper bound in milliseconds; omitted
        from the request when None.
    :param limit: The maximum number of events to fetch per page (default=200).
    :param max_pages: The maximum number of pages to fetch before stopping pagination
        (default=1000).
    :return: A list of all fetched events.
    """
    headers = {"Authorization": f"Bearer {token}",
               "Accept": "application/json"}

    # The two activity endpoints use different query-parameter names.
    is_security = "activity/security" in base_url
    is_audit = "activity/audit-logs" in base_url

    params: Dict[str, str] = {"limit": str(limit)}
    if is_audit:
        params["occurredAfter"] = str(occurred_after_ms)
        if occurred_before_ms is not None:
            params["occurredBefore"] = str(occurred_before_ms)
    elif is_security:
        params["fromTimestamp"] = str(occurred_after_ms)
        if occurred_before_ms is not None:
            params["toTimestamp"] = str(occurred_before_ms)

    all_items: List[Dict] = []
    after: Optional[str] = None
    prev_after: Optional[str] = None
    pages = 0

    # seen on /security when exhausted
    TERMINAL_AFTER_VALUES = {"MC0w", "0-0", ""}

    while True:
        if after:
            params["after"] = after
        else:
            params.pop("after", None)

        data = request_with_retry("GET", base_url, headers, params)
        # Audit logs use "results"; some endpoints return "events" instead.
        results = data.get("results") or data.get("events") or []
        if not isinstance(results, list):
            logging.warning("Unexpected results shape at %s: %s",
                            base_url, type(results))
            results = []

        all_items.extend(results)

        paging = data.get("paging") or {}
        next_obj = paging.get("next") or {}
        next_after = next_obj.get("after")

        pages += 1
        logging.info("Fetched page %d from %s: %d items (total=%d). after=%s",
                     pages, base_url, len(results), len(all_items), str(next_after))

        if len(results) == 0:
            logging.info(
                "No results returned; stopping pagination for %s.", base_url)
            break

        if not next_after:
            logging.info(
                "No next cursor; stopping pagination for %s.", base_url)
            break

        if next_after in TERMINAL_AFTER_VALUES:
            logging.info(
                "Terminal 'after' (%s); stopping pagination for %s.", next_after, base_url)
            break

        if prev_after is not None and next_after == prev_after:
            # Defensive: a cursor that never advances would loop forever.
            logging.info(
                "Repeated 'after' cursor (%s); stopping pagination for %s.", next_after, base_url)
            break

        if pages >= max_pages:
            logging.warning(
                "Reached max_pages=%d for %s; stopping pagination.", max_pages, base_url)
            break

        prev_after, after = after, next_after

        # Throttle between pages to respect rate limits.
        time.sleep(0.1)

    return all_items
740
+
741
+
742
def safe_get_actor(ev: Dict) -> Dict:
    """
    Extract actor identity fields from an audit event, tolerating a missing
    or malformed "actingUser" entry.

    Top-level "userId"/"userEmail" fields take precedence; the nested actor
    dict is only consulted as a fallback (including its legacy "email" key).

    :param ev: The audit event to extract the actor from.
    :return: A dictionary with "userId" and "userEmail" keys (None when unknown).
    """

    nested = ev.get("actingUser")
    if not isinstance(nested, dict):
        nested = {}
    user_id = ev.get("userId") or nested.get("userId")
    user_email = (ev.get("userEmail")
                  or nested.get("userEmail")
                  or nested.get("email"))
    return {"userId": user_id, "userEmail": user_email}
760
+
761
+
762
def build_login_index(login_events: List[Dict]) -> Dict[Tuple[Optional[str], Optional[str]], List[Dict]]:
    """
    Build an in-memory index of login events keyed by (userId, email).

    Events whose "loginAt" cannot be parsed are dropped. Each kept event is
    annotated in place with a "_ts" UTC datetime, and every bucket is sorted
    by that timestamp in ascending order.

    :param login_events: List of login events to index.
    :return: Mapping of (userId-as-str-or-None, email) to a time-sorted
             list of login events.
    """
    index = defaultdict(list)
    for event in login_events:
        when = parse_ts_dt(event.get("loginAt"))
        if when is None:
            continue
        event["_ts"] = when
        raw_uid = event.get("userId")
        bucket = (str(raw_uid) if raw_uid is not None else None,
                  event.get("email"))
        index[bucket].append(event)
    for bucket in index:
        index[bucket].sort(key=lambda ev: ev["_ts"])
    return index
782
+
783
+
784
def build_security_index(security_events: List[Dict]) -> Dict[Tuple[Optional[str], Optional[str]], List[Dict]]:
    """
    Builds an in-memory index of security events by (userId,email) and sorts by createdAt (UTC).

    Events without a parseable "createdAt" are dropped; kept events are
    annotated in place with a "_ts" UTC datetime used for sorting.

    :param security_events: List of security events to index.
    :return: A dictionary containing the indexed security events,
        where each key is a tuple of (userId,email) and the value is a sorted list of security events for that key.
    """
    idx = defaultdict(list)
    for e in security_events:
        actor = safe_get_actor(e)
        user_id = actor.get("userId")
        # NOTE(review): when no email is found, this falls back to the raw
        # "actingUser" value, which safe_get_actor treats as a dict — so the
        # key may contain a dict-ish value instead of an email string.
        # Confirm whether e.get("actingUser") was meant to be an email here.
        email = actor.get("userEmail") or e.get("actingUser")
        ts = parse_ts_dt(e.get("createdAt"))
        if ts is None:
            continue
        e["_ts"] = ts
        idx[(str(user_id) if user_id is not None else None, email)].append(e)
    for k in idx:
        idx[k].sort(key=lambda r: r["_ts"])
    return idx
805
+
806
+
807
def find_best_time_match(ts: Optional[datetime.datetime],
                         key: Tuple[Optional[str], Optional[str]],
                         index: Dict[Tuple[Optional[str], Optional[str]], List[Dict]],
                         window_seconds: int) -> Optional[Dict]:
    """
    Pick the event in ``index[key]`` whose "_ts" lies closest to ``ts``,
    provided it falls within ``window_seconds`` of it.

    :param ts: Reference timestamp (None short-circuits to no match).
    :param key: The (user_id, email) bucket to search.
    :param index: Mapping of keys to event lists, each event carrying "_ts".
    :param window_seconds: Maximum allowed distance in seconds.
    :return: The closest matching event, or None when nothing qualifies.
    """
    if ts is None:
        return None
    events = index.get(key) or []
    # Pair each candidate with its absolute distance from the reference time,
    # keeping only those inside the window.
    eligible = [
        (abs((ts - event["_ts"]).total_seconds()), event)
        for event in events
    ]
    eligible = [pair for pair in eligible if pair[0] <= window_seconds]
    if not eligible:
        return None
    # min() returns the first minimal pair, matching the original
    # first-wins tie-breaking on equal distances.
    return min(eligible, key=lambda pair: pair[0])[1]
832
+
833
+
834
def fill_network_fields(row: Dict, src_event: Dict) -> None:
    """
    Backfill missing network metadata on ``row`` from ``src_event``.

    Only fields currently "missing" on the row (per the module-level
    ``_MISSING`` definition) are written; existing values are preserved.
    IP may come from several alternative keys; country/region fall back to
    the event's nested "context" dict.

    :param row: The normalized audit row to enrich (mutated in place).
    :param src_event: The source event to draw network fields from.
    :return: None
    """

    if not isinstance(src_event, dict):
        return

    context = src_event.get("context") or {}
    candidates = {
        "ip_address": _coalesce(
            src_event.get("ipAddress"),
            src_event.get("sourceIp"),
            src_event.get("ip"),
            context.get("ipAddress"),
        ),
        "country_code": _coalesce(
            src_event.get("countryCode"),
            context.get("countryCode"),
        ),
        "region_code": _coalesce(
            src_event.get("regionCode"),
            context.get("regionCode"),
        ),
    }

    for field, value in candidates.items():
        if row.get(field) in _MISSING and value not in _MISSING:
            row[field] = value
868
+
869
+
870
def normalize_audit_event(ev: Dict) -> Dict:
    """
    Normalize an audit event by extracting relevant fields into a standard format.

    The normalized format includes the following fields:

    - audit_id: The unique identifier for the audit event.
    - category: The category of the audit event.
    - sub_category: The sub-category of the audit event.
    - action: The action taken in the audit event.
    - target_object_id: The ID of the object targeted in the audit event.
    - user_id: The ID of the user who triggered the audit event.
    - user_email: The email address of the user who triggered the audit event.
    - hubspot_occured_at: When the event occurred in HubSpot. NOTE: the key
      is (mis)spelled "occured" on purpose here — downstream enrichment
      reads this exact key, so do not rename without migrating all readers.
    - ip_address: The IP address associated with the audit event.
    - country_code: The country code associated with the audit event.
    - region_code: The region code associated with the audit event.

    :param ev: The audit event to normalize.
    :return: A dictionary containing the normalized audit event fields.
    """
    actor = safe_get_actor(ev)
    return {"audit_id": ev.get("id"),
            "category": ev.get("category"),
            "sub_category": ev.get("subCategory"),
            "action": ev.get("action"),
            "target_object_id": ev.get("targetObjectId") or ev.get("objectId"),
            "user_id": actor.get("userId"),
            "user_email": actor.get("userEmail"),
            "hubspot_occured_at": ev.get("occurredAt") or ev.get("timestamp"),
            "ip_address": ev.get("ipAddress"),
            "country_code": ev.get("countryCode"),
            "region_code": ev.get("regionCode"),
            }
904
+
905
+
906
def enrich_audit_row_by_category(row: Dict,
                                 login_idx: Dict[Tuple[Optional[str], Optional[str]], List[Dict]],
                                 security_idx: Dict[Tuple[Optional[str], Optional[str]], List[Dict]],
                                 match_window_seconds: int = 300) -> Dict:
    """
    Enrich an audit row with network/user data from the closest matching
    login or critical-action event.

    Rows whose category is neither LOGIN nor CRITICAL_ACTION are returned
    untouched. Otherwise the (user_id, user_email) key and the row's
    "hubspot_occured_at" timestamp are used to find the nearest event in
    the matching index within ``match_window_seconds``.

    :param row: The audit event row to enrich (mutated in place).
    :param login_idx: Mapping of user keys to time-sorted login events.
    :param security_idx: Mapping of user keys to time-sorted critical-action events.
    :param match_window_seconds: Maximum match distance in seconds.
    :return: The (possibly enriched) audit event row.
    """
    category = (row.get("category") or "").upper()
    if category not in ("LOGIN", "CRITICAL_ACTION"):
        return row

    raw_uid = row.get("user_id")
    lookup_key = (str(raw_uid) if raw_uid is not None else None,
                  row.get("user_email"))
    occurred = parse_ts_dt(row.get("hubspot_occured_at"))

    source_index = login_idx if category == "LOGIN" else security_idx
    match = find_best_time_match(
        occurred, lookup_key, source_index, match_window_seconds)

    if match:
        fill_network_fields(row, match)
        # Optional backfill of user fields from the matched event.
        if row.get("user_email") in _MISSING and match.get("email") not in _MISSING:
            row["user_email"] = match.get("email")
        if row.get("user_id") in _MISSING and match.get("userId") not in _MISSING:
            row["user_id"] = match.get("userId")

    return row
python/load_hubspot_data.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Unified orchestrator for HubSpot → Supabase pipelines with a timestamp cursor.
3
+
4
+ CLI:
5
+ # epoch ms
6
+ python hubspot_orchestrator.py 1754025600000
7
+ # ISO-8601
8
+ python hubspot_orchestrator.py 2025-08-01T09:30:00Z
9
+ # Back-compat date (floors to 00:00Z)
10
+ python hubspot_orchestrator.py 2025-08-01
11
+ # No arg: defaults to today@00:00Z
12
+ """
13
+
14
+ import sys
15
+ import re
16
+ import logging
17
+ import datetime
18
+
19
+ # Pipelines (each exposes main(since_ms: Optional[int]) and can fall back to env)
20
+ import hubspot_deals
21
+ import hubspot_emails
22
+ import hubspot_tickets
23
+ import hubspot_contacts
24
+ import hubspot_companies
25
+ import hubspot_billing
26
+ import hubspot_audit
27
+
28
# Log to a per-day file under logs/; the directory must already exist
# (logging raises FileNotFoundError at import time otherwise — TODO confirm
# a logs/ directory is created by deployment).
logging.basicConfig(
    filename=f"logs/hubspot_unified_orchestrator_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
    filemode="a",
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
34
+
35
+ # ---------------------------
36
+ # Time parsing helpers
37
+ # ---------------------------
38
+
39
+
40
+ def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
41
+ if dt.tzinfo is None:
42
+ dt = dt.replace(tzinfo=datetime.timezone.utc)
43
+ return dt.astimezone(datetime.timezone.utc)
44
+
45
+
46
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """Return *dt* normalized to UTC and truncated to 00:00:00.000000."""
    normalized = _ensure_utc(dt)
    return normalized.replace(hour=0, minute=0, second=0, microsecond=0)
49
+
50
+
51
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """Parse an ISO-8601 string (a trailing 'Z' is accepted) into a UTC datetime."""
    text = value
    if isinstance(text, str) and text.endswith("Z"):
        # fromisoformat() on older Pythons rejects 'Z'; use the offset form.
        text = text[:-1] + "+00:00"
    return _ensure_utc(datetime.datetime.fromisoformat(text))
56
+
57
+
58
def to_epoch_ms(dt_or_str) -> int:
    """
    Convert an ISO-8601 string or datetime into epoch milliseconds (UTC).

    :param dt_or_str: ISO-8601 string or datetime (naive assumed UTC).
    :return: Milliseconds since the Unix epoch.
    :raises TypeError: For any other input type.
    """
    if isinstance(dt_or_str, datetime.datetime):
        as_utc = _ensure_utc(dt_or_str)
    elif isinstance(dt_or_str, str):
        as_utc = _parse_iso_like_to_dt(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")
    return int(as_utc.timestamp() * 1000)
66
+
67
+
68
def parse_since_arg_to_ms() -> int:
    """
    Parse the optional CLI cursor argument (sys.argv[1]) into epoch ms.

    Accepts:
      - integer epoch ms (or seconds; seconds auto *1000)
      - ISO-8601 (Z or offset)
      - YYYY-MM-DD (floors to 00:00Z)
    If not provided, defaults to today@00:00Z.

    :return: Cursor timestamp in epoch milliseconds.
    """
    if len(sys.argv) > 1:
        arg = sys.argv[1].strip()

        # epoch seconds or ms
        if re.fullmatch(r"\d{10,13}", arg):
            v = int(arg)
            # fix: the old threshold (1e13) was above every current epoch-ms
            # value (~1.7e12), so 13-digit ms inputs were wrongly multiplied
            # by 1000. Epoch seconds stay below 1e10 until year 2286, while
            # epoch ms have exceeded 1e10 since April 1970 — use that gap.
            if v < 10_000_000_000:  # seconds -> ms
                v *= 1000
            return v

        # YYYY-MM-DD
        if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
            d = datetime.datetime.strptime(
                arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
            return to_epoch_ms(floor_to_utc_midnight(d))

        # ISO-8601
        try:
            return to_epoch_ms(arg)
        except Exception:
            # fix: the argument is positional, not a --since flag.
            print("Invalid 'since' argument. Use epoch ms, ISO-8601, or YYYY-MM-DD.")
            sys.exit(1)

    # default: today@00:00Z
    today0 = floor_to_utc_midnight(
        datetime.datetime.now(datetime.timezone.utc))
    return to_epoch_ms(today0)
103
+
104
+ # ---------------------------
105
+ # Main
106
+ # ---------------------------
107
+
108
+
109
def main():
    """
    Runs pipelines in order with a shared timestamp cursor.
    Each pipeline may internally advance its own stored cursor.

    A failure in one pipeline is logged and does not stop the remaining
    pipelines from running.
    """
    since_ms = parse_since_arg_to_ms()
    print(f"=== Running HubSpot sync pipeline since_ms={since_ms} ===")

    # (label, module) pairs define the execution order.
    pipelines = [
        ("Companies", hubspot_companies),
        ("Contacts", hubspot_contacts),
        ("Deals", hubspot_deals),
        ("Tickets", hubspot_tickets),
        ("Emails", hubspot_emails),
        ("Billing", hubspot_billing),
        # fix: step 7 was mislabeled "Billing" and logged as the billing
        # pipeline even though it runs hubspot_audit.
        ("Audit", hubspot_audit),
    ]
    total = len(pipelines)
    for step, (label, module) in enumerate(pipelines, start=1):
        print(f"\n[{step}/{total}] {label}")
        try:
            module.main(since_ms=since_ms)
        except Exception as e:
            logging.exception("Error running %s pipeline: %s",
                              label.lower(), e)

    print("\n=== HubSpot sync complete ===")
160
+
161
+
162
# Script entry point: run the orchestrator only when invoked directly.
if __name__ == "__main__":
    main()
python/supabase_utils.py ADDED
@@ -0,0 +1,394 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This script contains utility functions for working with Supabase.
3
+ """
4
+ import time
5
+ import datetime
6
+ import json
7
+ import logging
8
+ import httpx
9
+ from storage3.exceptions import StorageApiError
10
+ from typing import List, Dict, Any, Optional
11
+
12
+ try:
13
+ from postgrest import APIError # supabase-py v2
14
+ except Exception: # pragma: no cover
15
+ APIError = Exception
16
+ try:
17
+ # supabase-py v2
18
+ from postgrest.types import ReturnMethod # Returning.MINIMAL
19
+ _RETURN_MINIMAL = ReturnMethod.MINIMAL
20
+ except Exception:
21
+ # supabase-py v1 fallback
22
+ _RETURN_MINIMAL = "minimal"
23
+
24
+
25
+ def insert_into_supabase_table(
26
+ supabase_client: Any,
27
+ table_name: str,
28
+ rows: List[Dict[str, Any]],
29
+ on_conflict: Optional[List[str]] = None,
30
+ *,
31
+ max_retries: int = 5,
32
+ backoff_base_seconds: float = 1.0,
33
+ use_returning_minimal: bool = True,
34
+ ) -> None:
35
+ """
36
+ Insert a list of rows into a Supabase table with optional conflict handling.
37
+
38
+ :param supabase_client: Supabase client
39
+ :param table_name: Name of the table to upsert into
40
+ :param rows: List of dictionaries to insert; each dictionary becomes a row
41
+ :param on_conflict: List of column names to use as conflict target
42
+ :param max_retries: Maximum number of times to retry on failure
43
+ :param backoff_base_seconds: Base amount of time to sleep between retries
44
+ :param use_returning_minimal: Whether to use returning=MINIMAL (faster but no inserted IDs)
45
+ :return: None. If no rows are provided, returns None immediately without attempting an insert.
46
+
47
+ This function will retry up to `max_retries` times on failure with an exponential
48
+ backoff schedule. If `on_conflict` is provided, it will use the given columns as the
49
+ conflict target. If `use_returning_minimal`, it will use the `returning=MINIMAL` parameter
50
+ to reduce response size (but won't get inserted IDs back).
51
+
52
+ NOTE: This function does not support transactions; it is intended for use in simple
53
+ data pipelines where retrying on failure is sufficient. If you need stronger
54
+ consistency guarantees, use transactions or another approach.
55
+ """
56
+
57
+ if not rows:
58
+ logging.info("No rows to insert for table %s.", table_name)
59
+ return None
60
+
61
+ q = supabase_client.table(table_name)
62
+ conflict_target = ",".join(on_conflict) if on_conflict else None
63
+
64
+ if conflict_target and use_returning_minimal:
65
+ query = q.upsert(rows, on_conflict=conflict_target,
66
+ returning=_RETURN_MINIMAL)
67
+ elif conflict_target:
68
+ query = q.upsert(rows, on_conflict=conflict_target)
69
+ elif use_returning_minimal:
70
+ query = q.upsert(rows, returning=_RETURN_MINIMAL)
71
+ else:
72
+ query = q.upsert(rows)
73
+
74
+ attempt = 0
75
+ while True:
76
+ try:
77
+ query.execute()
78
+ logging.info("Inserted %s records into %s.", len(rows), table_name)
79
+ return True
80
+
81
+ except APIError as e:
82
+ code = getattr(e, "code", None)
83
+ attempt += 1
84
+
85
+ if attempt > max_retries:
86
+ logging.error(
87
+ "Permanent failure upserting into %s after %d attempts: %s",
88
+ table_name, attempt, e
89
+ )
90
+ return False
91
+
92
+ sleep_for = backoff_base_seconds * (2 ** (attempt - 1))
93
+ if code == "57014":
94
+ logging.warning(
95
+ "Timeout (57014) upserting into %s (attempt %d/%d). Retrying in %.1fs.",
96
+ table_name, attempt, max_retries, sleep_for
97
+ )
98
+ else:
99
+ logging.warning(
100
+ "APIError upserting into %s (attempt %d/%d): %s. Retrying in %.1fs.",
101
+ table_name, attempt, max_retries, e, sleep_for
102
+ )
103
+ time.sleep(sleep_for)
104
+
105
+ except (httpx.RequestError, httpx.HTTPStatusError) as e:
106
+ attempt += 1
107
+ if attempt > max_retries:
108
+ logging.error(
109
+ "Permanent HTTP failure upserting into %s after %d attempts: %s",
110
+ table_name, attempt, e
111
+ )
112
+ return False
113
+
114
+ sleep_for = backoff_base_seconds * (2 ** (attempt - 1))
115
+ logging.warning(
116
+ "HTTP error upserting into %s (attempt %d/%d): %s. Retrying in %.1fs.",
117
+ table_name, attempt, max_retries, e, sleep_for
118
+ )
119
+ time.sleep(sleep_for)
120
+
121
+ except Exception as e:
122
+ attempt += 1
123
+ if attempt > max_retries:
124
+ logging.error(
125
+ "Permanent unexpected failure upserting into %s after %d attempts: %s",
126
+ table_name, attempt, e
127
+ )
128
+ return False
129
+
130
+ sleep_for = backoff_base_seconds * (2 ** (attempt - 1))
131
+ logging.warning(
132
+ "Unexpected error upserting into %s (attempt %d/%d): %s. Retrying in %.1fs.",
133
+ table_name, attempt, max_retries, e, sleep_for
134
+ )
135
+ time.sleep(sleep_for)
136
+
137
+
138
def batched_insert(
    supabase_client,
    table_name: str,
    rows: List[Dict[str, Any]],
    batch_size: int = 500,
    on_conflict: Optional[List[str]] = None,
    max_retries: int = 4,
    backoff_base_seconds: float = 1.0,
    min_batch_size: int = 50,
) -> list:
    """
    Upsert `rows` into a Supabase table in batches, with retries and adaptive sizing.

    :param supabase_client: Supabase client
    :param table_name: Name of the table to upsert into
    :param rows: List of dictionaries to insert; each dictionary becomes a row
    :param batch_size: Maximum number of rows to send in a single upsert request
    :param on_conflict: List of column names to use as the conflict target
    :param max_retries: Maximum number of retries per batch before re-raising
    :param backoff_base_seconds: Base sleep time for exponential backoff
    :param min_batch_size: Floor for the adaptive batch size; on a Postgres
        statement timeout (code 57014) the batch size is halved down to this floor
    :return: List of responses from Supabase (empty if no rows to insert)

    Each batch is retried up to `max_retries` times with exponential backoff.
    A statement timeout shrinks the working batch size (permanently, for all
    remaining batches) and retries the same window without consuming a retry
    attempt. If a batch still fails after all retries, the last exception is
    re-raised. An empty `rows` list returns [] without any requests.
    """

    if not rows:
        return []

    # Hoisted out of the retry loop: `on_conflict` in supabase-py v2 expects a
    # comma-separated string, and the target never changes between attempts.
    conflict_target = ",".join(on_conflict) if on_conflict else None

    results = []
    n = len(rows)
    i = 0

    while i < n:
        current_batch_size = min(batch_size, n - i)
        batch = rows[i: i + current_batch_size]

        # Attempt to insert this batch with retries.
        attempt = 0
        while True:
            try:
                # Build a fresh query each attempt; request builders are not
                # safely reusable across requests.
                q = supabase_client.table(table_name)
                if conflict_target:
                    resp = q.upsert(batch, on_conflict=conflict_target,
                                    returning=_RETURN_MINIMAL).execute()
                else:
                    resp = q.upsert(batch, returning=_RETURN_MINIMAL).execute()

                results.append(resp)
                logging.info(
                    f"Inserted batch rows {i}–{i + len(batch)} into {table_name}")
                i += len(batch)  # advance window
                break  # batch succeeded, move to next

            except APIError as e:
                if getattr(e, "code", None) == "57014" and current_batch_size > min_batch_size:
                    # Statement timeout: halve the batch (not below the floor)
                    # and retry the same window. This intentionally does not
                    # consume a retry attempt; `attempt` resets on re-entry.
                    half = max(min_batch_size, current_batch_size // 2)
                    logging.warning(
                        f"Timeout on {table_name} rows {i}–{i + current_batch_size}. "
                        f"Shrinking batch to {half} and retrying."
                    )
                    batch_size = half
                    time.sleep(backoff_base_seconds)
                    break

                # Other API errors, or batch already at min size: retry the
                # same window with exponential backoff.
                attempt += 1
                if attempt > max_retries:
                    logging.error(
                        f"Failed inserting batch rows {i}–{i + current_batch_size} into {table_name} "
                        f"after {max_retries} retries: {e}"
                    )
                    raise

                sleep_for = backoff_base_seconds * (2 ** (attempt - 1))
                logging.warning(
                    f"Error inserting batch rows {i}–{i + current_batch_size} into {table_name} "
                    f"(attempt {attempt}/{max_retries}): {e}. Retrying in {sleep_for:.1f}s."
                )
                time.sleep(sleep_for)

            except Exception as e:
                # Non-API errors (network, serialization, ...): same backoff.
                attempt += 1
                if attempt > max_retries:
                    logging.error(
                        f"Failed inserting batch rows {i}–{i + current_batch_size} into {table_name} "
                        f"after {max_retries} retries: {e}"
                    )
                    raise

                sleep_for = backoff_base_seconds * (2 ** (attempt - 1))
                logging.warning(
                    f"Unexpected error inserting batch rows {i}–{i + current_batch_size} into {table_name} "
                    f"(attempt {attempt}/{max_retries}): {e}. Retrying in {sleep_for:.1f}s."
                )
                time.sleep(sleep_for)

    return results
250
+
251
+
252
def upload_raw_json_to_supabase(supabase_client, json_data, object_type="contacts"):
    """
    Uploads raw JSON data to Supabase storage under a specified object type directory.

    :param supabase_client: The Supabase client object.
    :param json_data: The JSON data to be uploaded (must be JSON-serializable;
        non-serializable values are stringified via ``default=str``).
    :param object_type: Object type, used as the directory prefix of the storage path.
    :return: The path where the JSON file is stored in Supabase. Note: the path
        is returned even if the upload fails; failures are only logged.
    """

    now_str = datetime.datetime.now(
        datetime.timezone.utc).strftime("%Y%m%d_%H%M%S")
    path = f"{object_type}/{now_str}.json"
    file_bytes = json.dumps(json_data, indent=2, default=str).encode("utf-8")

    bucket = supabase_client.storage.from_("hubspot-raw-data")

    # Best-effort pre-delete so a same-second re-run can overwrite. A failure
    # here (object missing, transient network error) must not abort the upload,
    # which the previous unguarded call did.
    try:
        bucket.remove([path])
    except Exception as e:
        logging.debug("Pre-delete of %s failed (ignored): %s", path, e)

    try:
        bucket.upload(
            path, file=file_bytes, file_options={
                "content-type": "application/json"}
        )
    except StorageApiError as e:
        # NOTE(review): assumes StorageApiError exposes `status_code` — confirm
        # against the installed supabase/storage3 version.
        if e.status_code == 413:
            logging.warning(
                "Upload failed: payload too large for Supabase Storage.")
        else:
            logging.error("Storage API error during upload: %s", e)
    except httpx.RequestError as e:
        logging.error("Upload error: %s", e)
    return path
282
+
283
+
284
def get_last_sync_time(supabase_client, object_type):
    """
    Look up the most recent sync timestamp recorded for an object type.

    :param supabase_client: The Supabase client object.
    :param object_type: Object type whose last sync time should be fetched
        from the hubspot_sync_metadata table.
    :return: The stored ``last_sync_time`` parsed into a datetime, or None
        when no prior sync has been recorded (or the stored value is empty).
    """

    response = (
        supabase_client.table("hubspot_sync_metadata")
        .select("last_sync_time")
        .eq("object_type", object_type)
        .execute()
    )

    rows = response.data
    if not rows:
        return None

    raw_value = rows[0]["last_sync_time"]
    if not raw_value:
        return None

    return datetime.datetime.fromisoformat(raw_value)
302
+
303
+
304
def update_sync_metadata(supabase_client, object_type, sync_time):
    """
    Record the latest sync time for an object type in hubspot_sync_metadata.

    :param supabase_client: The Supabase client object.
    :param object_type: Object type whose sync metadata is being updated.
    :param sync_time: The last sync time to persist.
    :return: The result of the underlying ``insert_into_supabase_table`` upsert.
    """

    stamped_at = datetime.datetime.now(datetime.timezone.utc).isoformat()
    record = {
        "object_type": object_type,
        "last_sync_time": sync_time,
        "updated_at": stamped_at,
    }

    return insert_into_supabase_table(
        supabase_client, "hubspot_sync_metadata", [record])
321
+
322
+
323
def get_existing_email_content_ids(supabase_client):
    """
    Fetch the set of email_id values already present in hubspot_email_contents.

    :param supabase_client: Supabase client
    :return: A set of email_id values already in the database

    Pages through the table 1000 rows at a time, explicitly ordered by
    email_id: without an ORDER BY, Postgres gives no row-order guarantee,
    so offset pagination could silently skip or duplicate rows between pages.
    """
    existing_ids = set()
    page_size = 1000
    offset = 0

    while True:
        res = (
            supabase_client.table("hubspot_email_contents")
            .select("email_id")
            .order("email_id")
            .range(offset, offset + page_size - 1)
            .execute()
        )
        if not res.data:
            break
        existing_ids.update(row["email_id"] for row in res.data)
        if len(res.data) < page_size:
            # A short page means we've reached the end; skip the extra round trip.
            break
        offset += page_size

    return existing_ids
346
+
347
+
348
def fetch_supabase_table(
    supabase_client, table_name="hubspot_contacts",
    id_column="contact_id"
):
    """
    Fetches all rows from a Supabase table and returns them as a dict.

    :param supabase_client: Supabase client instance
    :param table_name: Name of the table to fetch from
    :param id_column: Unique ID column used as the key of the returned dict
        (keys are stringified) and as the pagination sort key
    :return: A dict mapping str(row[id_column]) -> full row

    Pages through the table 1000 rows at a time, explicitly ordered by
    `id_column`: without an ORDER BY, offset pagination has no stable row
    order in Postgres and could skip or duplicate rows between pages.
    """

    all_rows = {}
    page = 0
    page_size = 1000

    while True:
        res = (
            supabase_client.table(table_name)
            .select("*")
            .order(id_column)
            .range(page * page_size, (page + 1) * page_size - 1)
            .execute()
        )
        if not res.data:
            break
        for row in res.data:
            all_rows[str(row[id_column])] = row
        if len(res.data) < page_size:
            # Short page: last page reached, avoid one extra round trip.
            break
        page += 1

    return all_rows
377
+
378
+
379
def enrich_supabase_row(base_row: dict) -> dict:
    """
    Add the duplicate-detection bookkeeping columns to a Supabase row.

    Mutates `base_row` in place and also returns it for convenience.

    :param base_row: The row to enrich.
    :return: The same dict, with the duplicate_* columns reset to None,
        is_primary set to True, and updated_at stamped with the current UTC time.
    """

    stamp = datetime.datetime.now(datetime.timezone.utc).isoformat()
    base_row["duplicate_id"] = None
    base_row["duplicate_status"] = None
    base_row["duplicate_action"] = None
    base_row["is_primary"] = True
    base_row["updated_at"] = stamp
    return base_row
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ python-dotenv==1.1.0
2
+ hubspot-api-client==12.0.0
3
+ supabase==2.16.0
4
+ httpx==0.28.1
5
+ pandas==2.2.2
6
+ openpyxl==3.1.5
7
+ phonenumbers
8
+ tldextract
9
+ rapidfuzz