n8n / python /hubspot_contacts.py
niwayandm
Add new properties to contacts and billing
0b7ad7b
"""
HubSpot Contacts → Supabase (incremental since a millisecond cursor)
Usage from orchestrator:
import hubspot_contacts
hubspot_contacts.main(since_ms=<int milliseconds since epoch UTC>)
Direct CLI:
# epoch ms
python hubspot_contacts.py 1754025600000
# ISO-8601
python hubspot_contacts.py 2025-08-01T09:30:00Z
# Back-compat date (floors to 00:00Z)
python hubspot_contacts.py 2025-08-01
"""
import os
import re
import time
import logging
import datetime
from typing import List, Dict, Tuple, Optional, Union
import httpx
import hubspot
from dotenv import load_dotenv
from supabase import create_client
from hubspot.crm.contacts import ApiException as ContactsApiException
from hubspot_utils import (
try_parse_int, parse_ts, get_property_label_mapping,
)
from supabase_utils import (
fetch_supabase_table, update_sync_metadata, enrich_supabase_row,
upload_raw_json_to_supabase, batched_insert,
)
# -----------------------------------------------------------------------------
# Logging
# -----------------------------------------------------------------------------
# logging.basicConfig opens the log file immediately, so a missing "logs/"
# directory would abort the import with FileNotFoundError. Create it up front.
os.makedirs("logs", exist_ok=True)
logging.basicConfig(
    # One append-mode log file per calendar day (local time of the host).
    filename=f"logs/hubspot_contact_pipeline_{datetime.datetime.now().strftime('%Y-%m-%d')}.log",
    filemode="a",
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
# -----------------------------------------------------------------------------
# Environment
# -----------------------------------------------------------------------------
# Load a local .env file (if one exists) before reading configuration.
load_dotenv()

HUBSPOT_TOKEN = os.environ.get("HUBSPOT_TOKEN")
SUPABASE_URL = os.environ.get("SUPABASE_URL")
SUPABASE_SERVICE_ROLE_KEY = os.environ.get("SUPABASE_SERVICE_ROLE_KEY")

# Optional bootstrap cursor, used by main() when the caller passes no since_ms.
BOOTSTRAP_SINCE_MS_ENV = os.environ.get("HUBSPOT_CONTACTS_SINCE_MS")

# Fail fast at import time: nothing below can work without these credentials.
if not HUBSPOT_TOKEN:
    raise RuntimeError("HUBSPOT_TOKEN is not set")
if not (SUPABASE_URL and SUPABASE_SERVICE_ROLE_KEY):
    raise RuntimeError("Supabase env vars are not set")

# Module-level clients shared by every function in this file.
hubspot_client = hubspot.Client.create(access_token=HUBSPOT_TOKEN)
supabase_client = create_client(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY)
# -----------------------------------------------------------------------------
# Config
# -----------------------------------------------------------------------------
# HubSpot contact properties requested for every contact read and copied into
# the enriched contact dict. Order is kept exactly as originally declared.
CONTACT_PROPERTIES = [
    # Identity / contact details
    "full_name",
    "firstname",
    "lastname",
    "email",
    "phone",
    "job_title_level",
    # Timestamps
    "createdate",
    "lastmodifieddate",
    "hs_lastmodifieddate",
    "notes_last_updated",
    # Company link and job title
    "associatedcompanyid",
    "jobtitle",
    # Record / traffic source attribution
    "hs_object_source_label",
    "source_1__cont_",
    "source_2",
    "hs_object_source_detail_1",
    "hs_analytics_source",
    "hs_latest_source",
    # ar_* billing properties (presumably accounts-receivable aging buckets — confirm)
    "ar_1__30",
    "ar_31__60",
    "ar_61__90",
    "ar_91_and_over",
    "ar_total",
]
# -----------------------------------------------------------------------------
# Time helpers
# -----------------------------------------------------------------------------
def _ensure_utc(dt: datetime.datetime) -> datetime.datetime:
if dt.tzinfo is None:
dt = dt.replace(tzinfo=datetime.timezone.utc)
return dt.astimezone(datetime.timezone.utc)
def floor_to_utc_midnight(dt: datetime.datetime) -> datetime.datetime:
    """
    Return 00:00:00.000000 UTC of the UTC calendar day that *dt* falls on.
    """
    return _ensure_utc(dt).replace(hour=0, minute=0, second=0, microsecond=0)
def _parse_iso_like_to_dt(value: str) -> datetime.datetime:
    """
    Parse an ISO-8601-style string into a UTC datetime.
    A trailing "Z" is rewritten to "+00:00" before handing the text to
    datetime.fromisoformat; the result is normalised with _ensure_utc.
    """
    normalized = value
    if isinstance(normalized, str) and normalized.endswith("Z"):
        normalized = f"{normalized[:-1]}+00:00"
    return _ensure_utc(datetime.datetime.fromisoformat(normalized))
def to_epoch_ms(dt_or_str: Union[str, datetime.datetime]) -> int:
    """
    Convert an ISO-8601-style string or a datetime to whole milliseconds
    since the Unix epoch (UTC).

    Strings are parsed with _parse_iso_like_to_dt; datetimes are normalised
    with _ensure_utc (naive values are treated as UTC).

    Raises:
        TypeError: if the argument is neither a str nor a datetime.
        ValueError: if a string cannot be parsed (raised by fromisoformat).
    """
    if isinstance(dt_or_str, str):
        dt = _parse_iso_like_to_dt(dt_or_str)
    elif isinstance(dt_or_str, datetime.datetime):
        dt = _ensure_utc(dt_or_str)
    else:
        raise TypeError(f"Unsupported type for to_epoch_ms: {type(dt_or_str)}")

    # Exact integer arithmetic: going through float (timestamp() * 1000) can
    # land just below a whole millisecond and truncate to a value 1 ms too low.
    epoch = datetime.datetime(1970, 1, 1, tzinfo=datetime.timezone.utc)
    return (dt - epoch) // datetime.timedelta(milliseconds=1)
def parse_any_ts_ms(value: Optional[Union[str, int, float]]) -> Optional[int]:
    """
    Accepts ms-epoch / sec-epoch / ISO-8601; returns ms since epoch or None.

    Numeric input (int, float, or a string holding an integer) is read as an
    epoch value: magnitudes below 1e11 are taken as seconds and scaled to
    milliseconds, anything larger is taken as milliseconds already.
    Everything else is handed to to_epoch_ms as an ISO-8601 string.
    Unparseable input is logged as a warning and yields None.
    """
    if value is None:
        return None

    # 1e11 seconds is roughly the year 5138 and 1e11 ms is March 1973, so this
    # cutoff separates every realistic seconds value from every realistic ms
    # value. (The previous cutoff of 1e13 rescaled current 13-digit ms epochs.)
    seconds_cutoff = 100_000_000_000

    numeric: Optional[Union[int, float]] = None
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        numeric = value
    else:
        try:
            numeric = int(str(value))
        except ValueError:
            numeric = None  # not an integer string; try ISO-8601 below

    if numeric is not None:
        try:
            if abs(numeric) < seconds_cutoff:
                numeric *= 1000
            return int(numeric)
        except (ValueError, OverflowError):
            # float NaN / infinity cannot be converted to an integer.
            logging.warning("Could not parse timestamp value=%r", value)
            return None

    try:
        return to_epoch_ms(str(value))
    except Exception:
        # Best effort: callers treat None as "no usable timestamp".
        logging.warning("Could not parse timestamp value=%r", value)
        return None
# -----------------------------------------------------------------------------
# Mapping & helpers
# -----------------------------------------------------------------------------
def map_contact_data_for_db(contacts: List[Dict]) -> List[Dict]:
    """
    Turn enriched contact dicts (HubSpot property names) into row dicts keyed
    by the column names used for the "hubspot_contacts" table.

    Each contact must carry an "id" key; all other properties are optional.
    Every row is passed through enrich_supabase_row before being returned.
    """

    def _to_row(contact: Dict) -> Dict:
        # Prefer lastmodifieddate, falling back to hs_lastmodifieddate.
        modified_raw = contact.get("lastmodifieddate") or contact.get("hs_lastmodifieddate")
        return {
            "contact_id": try_parse_int(contact["id"]),
            "full_name": contact.get("full_name"),
            "first_name": contact.get("firstname"),
            "last_name": contact.get("lastname"),
            "email": contact.get("email"),
            "phone_number": contact.get("phone"),
            "job_title_level": contact.get("job_title_level"),
            # Any falsy parse result is stored as None.
            "hubspot_create_date": parse_ts(contact.get("createdate")) or None,
            "hubspot_modified_date": parse_ts(modified_raw) or None,
            "hubspot_last_activity_date": parse_ts(contact.get("notes_last_updated")) or None,
            "number_of_associated_deals": contact.get("number_of_associated_deals", 0),
            "associated_company_id": try_parse_int(contact.get("associatedcompanyid")),
            "record_source": contact.get("hs_object_source_label"),
            "source_1": contact.get("source_1__cont_"),
            "source_2": contact.get("source_2"),
            "record_source_detail_1": contact.get("hs_object_source_detail_1"),
            "original_traffic_source": contact.get("hs_analytics_source"),
            "latest_traffic_source": contact.get("hs_latest_source"),
            "ar_1_30": contact.get("ar_1__30"),
            "ar_31_60": contact.get("ar_31__60"),
            "ar_61_90": contact.get("ar_61__90"),
            "ar_91_and_over": contact.get("ar_91_and_over"),
            "ar_total": contact.get("ar_total"),
        }

    return [enrich_supabase_row(_to_row(contact)) for contact in contacts]
def contacts_are_different(new_row: Dict, old_row: Dict) -> bool:
    """
    Return True when any synced column differs between the two rows.

    Values are compared by their str() form so that, for example, an integer
    id and its string representation count as equal. Missing keys compare as
    None.

    The key list mirrors the data columns produced by map_contact_data_for_db
    (everything except contact_id). job_title_level and the ar_* columns used
    to be left out, so a row that differed only in those columns was never
    refreshed.
    """
    compare_keys = (
        "full_name", "first_name", "last_name", "email", "phone_number",
        "job_title_level",
        "hubspot_create_date", "hubspot_modified_date",
        "hubspot_last_activity_date", "number_of_associated_deals",
        "associated_company_id", "record_source", "source_1", "source_2",
        "record_source_detail_1", "original_traffic_source",
        "latest_traffic_source",
        "ar_1_30", "ar_31_60", "ar_61_90", "ar_91_and_over", "ar_total",
    )
    return any(
        str(new_row.get(key)) != str(old_row.get(key))
        for key in compare_keys
    )
# -----------------------------------------------------------------------------
# Search IDs (ts > since_ms) with property fallback
# -----------------------------------------------------------------------------
def _search_contact_ids_from(since_ms: int, prop: str) -> List[str]:
    """
    Collect the IDs of all contacts whose {prop} is greater than since_ms
    (epoch-ms) using the HubSpot CRM search endpoint.

    Results are requested in ascending {prop} order, 100 per page, and the
    "after" paging token is followed until no further page is reported.
    Any response with status >= 400 is logged and raised as
    httpx.HTTPStatusError.
    """
    endpoint = "https://api.hubapi.com/crm/v3/objects/contacts/search"
    request_headers = {
        "Authorization": f"Bearer {HUBSPOT_TOKEN}",
        "Content-Type": "application/json",
        "Accept": "application/json",
    }
    only_newer = {"propertyName": prop, "operator": "GT", "value": str(since_ms)}
    request_template = {
        "filterGroups": [{"filters": [only_newer]}],
        "limit": 100,
        "sorts": [{"propertyName": prop, "direction": "ASCENDING"}],
    }

    collected: List[str] = []
    page_token: Optional[str] = None
    with httpx.Client(timeout=30.0) as session:
        while True:
            request_body = dict(request_template)
            if page_token:
                request_body["after"] = page_token

            response = session.post(endpoint, headers=request_headers, json=request_body)
            if response.status_code >= 400:
                # Log the structured error when available, raw text otherwise.
                try:
                    detail = response.json()
                except Exception:
                    detail = response.text
                logging.error("Contacts search error for prop '%s': %s", prop, detail)
                response.raise_for_status()

            page = response.json()
            for item in page.get("results", []) or []:
                collected.append(item["id"])

            page_token = (page.get("paging") or {}).get("next", {}).get("after")
            if not page_token:
                return collected
            time.sleep(0.1)  # short pause between pages
def search_contact_ids_after_ms(since_ms: int) -> Tuple[List[str], str]:
    """
    Run the contact search against each candidate timestamp property in order
    and return (ids, prop_used) for the first search that succeeds.

    The candidate list currently contains only "createdate", so only contacts
    created after since_ms are found.
    NOTE(review): earlier documentation listed hs_lastmodifieddate and
    lastmodifieddate ahead of createdate — confirm the narrowing is intended.

    A search failing with httpx.HTTPStatusError moves on to the next
    candidate; if every candidate fails, the last error is re-raised.
    """
    candidate_props = ["createdate"]
    failure = None
    for candidate in candidate_props:
        try:
            found = _search_contact_ids_from(since_ms, candidate)
        except httpx.HTTPStatusError as exc:
            failure = exc
            continue
        logging.info("Contacts search with '%s' returned %d IDs.", candidate, len(found))
        return found, candidate

    if failure is not None:
        raise failure
    # Only reachable when the candidate list is empty.
    return [], "hs_lastmodifieddate"
# -----------------------------------------------------------------------------
# Read-by-ID (with associations) → enrich & track max cursor ts
# -----------------------------------------------------------------------------
def _enrich_contact_data_from_record(record, job_level_map: Optional[Dict[str, str]]) -> Dict:
    """
    Flatten a contact record into a plain dict.

    The result holds "id", one entry per name in CONTACT_PROPERTIES (None when
    the record lacks it), and "number_of_associated_deals".

    - full_name falls back to "<firstname> <lastname>" (stripped) when it is
      missing or blank.
    - number_of_associated_deals counts the distinct truthy ids under the
      record's "deals" association, or 0 when there is none.
    - When job_level_map is non-empty, job_title_level is replaced by its
      mapped label, or by None if the code is not in the map.
    """
    source_props = record.properties or {}
    enriched: Dict[str, Optional[str]] = {
        "id": record.id,
        **{name: source_props.get(name) for name in CONTACT_PROPERTIES},
    }

    # Full name fallback
    current_name = enriched.get("full_name")
    if not current_name or not str(current_name).strip():
        given = (enriched.get("firstname") or "").strip()
        family = (enriched.get("lastname") or "").strip()
        enriched["full_name"] = f"{given} {family}".strip()

    # Associations: count distinct deal ids
    deal_ids = set()
    associations = getattr(record, "associations", None)
    if associations and associations.get("deals"):
        deal_bucket = associations["deals"]
        if getattr(deal_bucket, "results", None):
            deal_ids = {link.id for link in deal_bucket.results if getattr(link, "id", None)}
    enriched["number_of_associated_deals"] = len(deal_ids)

    # Map job_title_level code → label (unknown codes become None)
    if job_level_map:
        enriched["job_title_level"] = job_level_map.get(enriched.get("job_title_level"))

    return enriched
def read_contacts_by_ids(contact_ids: List[str], cursor_prop: str) -> Tuple[List[Dict], Optional[int]]:
    """
    Fetch each contact by ID (with its "deals" associations) and enrich it.

    Returns (contacts, max_ts_ms): the enriched contact dicts that were read
    successfully, and the largest cursor_prop timestamp (epoch-ms) seen among
    them, or None if no timestamp could be parsed. An empty ID list returns
    ([], None) without any API call.

    A contact whose read raises ContactsApiException or an httpx error is
    logged and skipped; the remaining contacts are still processed.
    """
    if not contact_ids:
        return [], None

    association_types = ["deals"]

    # The label map is fetched once up front; failure only disables mapping.
    try:
        level_labels = get_property_label_mapping(hubspot_client, "contacts", "job_title_level")
    except Exception as e:
        logging.warning("Failed to fetch job_title_level map: %s", e)
        level_labels = None

    fetched: List[Dict] = []
    newest_ms: Optional[int] = None
    for position, contact_id in enumerate(contact_ids, start=1):
        try:
            record = hubspot_client.crm.contacts.basic_api.get_by_id(
                contact_id=contact_id,
                properties=CONTACT_PROPERTIES,
                associations=association_types,
                archived=False,
            )

            # Keep the largest timestamp of the cursor property
            stamp = parse_any_ts_ms((record.properties or {}).get(cursor_prop))
            if stamp is not None:
                newest_ms = stamp if newest_ms is None else max(newest_ms, stamp)

            fetched.append(_enrich_contact_data_from_record(record, level_labels))

            if position % 200 == 0:
                logging.info("Read %d contacts...", position)
            time.sleep(0.05)  # gentle pacing
        except httpx.HTTPStatusError as e:
            logging.error("HTTP error reading contact %s: %s", contact_id, e)
        except (ContactsApiException, httpx.HTTPError) as e:
            logging.error("Error reading contact %s: %s", contact_id, e)

    return fetched, newest_ms
# -----------------------------------------------------------------------------
# Upsert flow
# -----------------------------------------------------------------------------
def upsert_contacts(contacts: List[Dict]) -> None:
    """
    Write new or changed contacts to the "hubspot_contacts" table.

    Existing rows are loaded via fetch_supabase_table (keyed on "contact_id").
    Each contact is mapped to a DB row and kept only if no existing row is
    found under str(contact_id) or contacts_are_different reports a change.
    Contacts whose id does not parse to a truthy integer are skipped.
    The surviving rows are sent through batched_insert in batches of 1000.
    """
    if not contacts:
        print("No contacts to upsert.")
        return

    existing = fetch_supabase_table(supabase_client, "hubspot_contacts", "contact_id")

    pending: List[Dict] = []
    for contact in contacts:
        parsed_id = try_parse_int(contact.get("id"))
        if not parsed_id:
            continue

        candidate = map_contact_data_for_db([contact])[0]
        stored = existing.get(str(parsed_id))
        if stored and not contacts_are_different(candidate, stored):
            continue  # unchanged: nothing to write
        pending.append(candidate)

    print(f"{len(pending)} contacts to insert/update (out of {len(contacts)} read).")
    if not pending:
        return

    # Raw JSON upload is currently disabled:
    # upload_raw_json_to_supabase(supabase_client, pending, object_type="contacts")
    batched_insert(supabase_client, "hubspot_contacts", pending, batch_size=1000)
# -----------------------------------------------------------------------------
# Main (timestamp cursor)
# -----------------------------------------------------------------------------
def main(since_ms: Optional[int] = None):
    """
    Run one incremental contacts sync.

    Steps:
      1) Resolve the cursor: the since_ms argument, else the
         HUBSPOT_CONTACTS_SINCE_MS environment value, else today 00:00 UTC.
      2) Search contact IDs with <cursor_prop> > since_ms.
      3) Read the full contacts, tracking the max <cursor_prop> timestamp.
      4) Upsert into Supabase.
      5) Record the sync time through update_sync_metadata.

    Raises:
        RuntimeError: if HUBSPOT_CONTACTS_SINCE_MS is set but not an integer.
    """
    # Resolve since_ms
    if since_ms is None and BOOTSTRAP_SINCE_MS_ENV:
        try:
            since_ms = int(BOOTSTRAP_SINCE_MS_ENV)
        except ValueError:
            raise RuntimeError("HUBSPOT_CONTACTS_SINCE_MS must be an integer (ms) if set.")
    if since_ms is None:
        # First-run default: start of the current UTC day
        utc_now = datetime.datetime.now(datetime.timezone.utc)
        since_ms = to_epoch_ms(floor_to_utc_midnight(utc_now))

    print(f"Searching contacts with timestamp > {since_ms} ...")
    ids, cursor_prop = search_contact_ids_after_ms(since_ms)
    print(f"Search property: {cursor_prop}. Found {len(ids)} contact IDs.")

    # Sync time is captured after the search and before the per-contact reads.
    now_iso = datetime.datetime.now(datetime.timezone.utc).isoformat()

    if not ids:
        print("No contacts beyond the cursor. Updating sync metadata and exiting.")
        update_sync_metadata(supabase_client, "contacts", now_iso)
        return

    print("Reading contacts (with associations)...")
    contacts, max_ts_ms = read_contacts_by_ids(ids, cursor_prop)

    print("Upserting into Supabase...")
    upsert_contacts(contacts)

    # Largest timestamp actually ingested; falls back to the incoming cursor.
    # NOTE(review): this value is only printed — it is not passed to
    # update_sync_metadata, which receives now_iso. Confirm that is intended.
    new_cursor_ms = since_ms if max_ts_ms is None else max_ts_ms
    update_sync_metadata(supabase_client, "contacts", now_iso)
    print(f"Contacts sync complete. Advanced cursor to {new_cursor_ms} using prop '{cursor_prop}'.")
# -----------------------------------------------------------------------------
# CLI
# -----------------------------------------------------------------------------
def _parse_cli_arg_to_ms(arg: str) -> int:
"""
Accept:
- integer epoch ms
- ISO-8601 (Z or offset)
- YYYY-MM-DD (floors to 00:00Z)
"""
# epoch ms or seconds
if re.fullmatch(r"\d{10,13}", arg):
v = int(arg)
if v < 10_000_000_000_000: # seconds -> ms
v *= 1000
return v
# YYYY-MM-DD
if re.fullmatch(r"\d{4}-\d{2}-\d{2}", arg):
d = datetime.datetime.strptime(arg, "%Y-%m-%d").replace(tzinfo=datetime.timezone.utc)
return to_epoch_ms(floor_to_utc_midnight(d))
# ISO-8601
return to_epoch_ms(arg)
if __name__ == "__main__":
    import sys

    # With no argument, main() resolves the cursor itself; otherwise the first
    # argument is parsed as the cursor and any parse failure exits with 1.
    cli_args = sys.argv[1:]
    if not cli_args:
        main()
    else:
        try:
            since = _parse_cli_arg_to_ms(cli_args[0])
        except Exception as e:
            print(
                f"Invalid timestamp. Provide epoch ms, ISO-8601, or YYYY-MM-DD. Error: {e}"
            )
            sys.exit(1)
        main(since_ms=since)