Spaces:
Running
feat: Phase 1 — AI Client Acquisition System
Browse files
Complete finding pipeline with:
- MiniMax M2.7 + LLaMA multi-model AI engine (all FREE on NVIDIA NIM)
- 5-task Trigger.dev pipeline (scheduler → scrape → enrich → profile → digest)
- 7-layer email verification with credit optimization
- Pattern-based email generation (FREE, unlimited)
- Pain signal detection (heuristic + LLM)
- Deterministic 100-point scoring system
- Slack bot (alerts, commands, daily digest)
- Territory management (27 cities, auto-rotation)
- Full observability (trace IDs, checkpoints, LLM logs)
All API integrations:
- NVIDIA NIM (MiniMax M2.7, LLaMA 70B, LLaMA 8B)
- Serper.dev (Google search)
- Hunter.io (email finding)
- Reoon (email verification)
- Supabase (database)
- Slack (notifications)
- Trigger.dev (orchestration)
Total LLM cost: $0/day
- .env.example +40 -0
- .gitignore +35 -0
- CONTRIBUTING.md +83 -0
- README.md +179 -0
- docker-compose.yml +56 -0
- docs/setup-guide.md +118 -0
- package.json +33 -0
- src/discovery/lib/contact-enricher.ts +354 -0
- src/discovery/lib/deduplicator.ts +82 -0
- src/discovery/lib/email-classifier.ts +210 -0
- src/discovery/lib/email-pattern-generator.ts +249 -0
- src/discovery/lib/email-verifier.ts +338 -0
- src/discovery/lib/icp-filter.ts +133 -0
- src/discovery/lib/linkedin-person-finder.ts +205 -0
- src/discovery/lib/linkedin-scraper.ts +165 -0
- src/discovery/lib/normalizer.ts +145 -0
- src/discovery/lib/pain-signal-detector.ts +228 -0
- src/discovery/lib/rotation.ts +114 -0
- src/discovery/lib/social-finder.ts +202 -0
- src/discovery/lib/territory-manager.ts +259 -0
- src/discovery/lib/web-scraper.ts +225 -0
- src/discovery/providers/hunter.ts +155 -0
- src/discovery/providers/reoon.ts +108 -0
- src/discovery/providers/serper.ts +108 -0
- src/discovery/trigger-tasks/auto-discovery.ts +517 -0
- src/discovery/trigger-tasks/manual-discovery.ts +139 -0
- src/profiling/python-service/config.py +25 -0
- src/profiling/python-service/hallucination_guard.py +137 -0
- src/profiling/python-service/main.py +148 -0
- src/profiling/python-service/nvidia_client.py +254 -0
- src/profiling/python-service/profiler.py +212 -0
- src/profiling/python-service/requirements.txt +8 -0
- src/profiling/python-service/scorer.py +260 -0
- src/profiling/trigger-tasks/profiling-router.ts +158 -0
- src/shared/config/env.ts +66 -0
- src/shared/llm/grounding.ts +239 -0
- src/shared/llm/nvidia-client.ts +307 -0
- src/shared/llm/prompts.ts +277 -0
- src/shared/observability/tracer.ts +118 -0
- src/shared/pipeline/checkpoint.ts +143 -0
- src/shared/supabase/client.ts +15 -0
- src/shared/supabase/schema.ts +184 -0
- src/shared/utils/logger.ts +40 -0
- src/shared/utils/rate-limiter.ts +103 -0
- src/shared/utils/retry.ts +195 -0
- src/slack/slack-commands.ts +249 -0
- src/slack/slack-service.ts +273 -0
- src/trigger.ts +8 -0
- supabase/migrations/001_initial_schema.sql +279 -0
- supabase/migrations/002_phase1_enhancements.sql +242 -0
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ─── LLM (All on NVIDIA NIM — FREE) ───────────────────────────
|
| 2 |
+
NVIDIA_API_KEY=nvapi-your-key-here
|
| 3 |
+
NVIDIA_NIM_BASE_URL=https://integrate.api.nvidia.com/v1
|
| 4 |
+
|
| 5 |
+
# ─── Web Research ──────────────────────────────────────────────
|
| 6 |
+
SERPER_API_KEY=your-serper-key
|
| 7 |
+
|
| 8 |
+
# ─── Email Finding ─────────────────────────────────────────────
|
| 9 |
+
HUNTER_API_KEY=your-hunter-key
|
| 10 |
+
|
| 11 |
+
# ─── Email Verification ────────────────────────────────────────
|
| 12 |
+
REOON_API_KEY=your-reoon-key
|
| 13 |
+
|
| 14 |
+
# ─── Supabase ──────────────────────────────────────────────────
|
| 15 |
+
SUPABASE_URL=https://your-project.supabase.co
|
| 16 |
+
SUPABASE_SERVICE_ROLE_KEY=your-service-role-key
|
| 17 |
+
|
| 18 |
+
# ─── Slack ─────────────────────────────────────────────────────
|
| 19 |
+
SLACK_BOT_TOKEN=xoxb-your-bot-token
|
| 20 |
+
SLACK_SIGNING_SECRET=your-signing-secret
|
| 21 |
+
SLACK_ALERT_CHANNEL_ID=C0000000000
|
| 22 |
+
SLACK_REVIEW_CHANNEL_ID=C0000000000
|
| 23 |
+
|
| 24 |
+
# ─── Trigger.dev ───────────────────────────────────────────────
|
| 25 |
+
TRIGGER_DEV_API_KEY=tr_dev_your-key
|
| 26 |
+
TRIGGER_DEV_PROJECT_ID=your-project-id
|
| 27 |
+
|
| 28 |
+
# ─── Python AI Service (secret: create any random string) ─────
|
| 29 |
+
PYTHON_AI_SERVICE_URL=http://localhost:8000
|
| 30 |
+
PYTHON_AI_SERVICE_SECRET=create-a-random-16-char-string
|
| 31 |
+
|
| 32 |
+
# ─── System Config ─────────────────────────────────────────────
|
| 33 |
+
NODE_ENV=development
|
| 34 |
+
LOG_LEVEL=info
|
| 35 |
+
DAILY_LEAD_QUOTA=10
|
| 36 |
+
QUALITY_SCORE_THRESHOLD=70
|
| 37 |
+
HUMAN_REVIEW_ENABLED=true
|
| 38 |
+
DAILY_EMAIL_LIMIT=50
|
| 39 |
+
DAILY_LINKEDIN_LIMIT=25
|
| 40 |
+
SCHEDULE_START_HOUR_UTC=4
|
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Environment variables (NEVER commit)
|
| 2 |
+
.env
|
| 3 |
+
.env.local
|
| 4 |
+
.env.production
|
| 5 |
+
|
| 6 |
+
# Node
|
| 7 |
+
node_modules/
|
| 8 |
+
dist/
|
| 9 |
+
build/
|
| 10 |
+
*.tsbuildinfo
|
| 11 |
+
|
| 12 |
+
# Python
|
| 13 |
+
__pycache__/
|
| 14 |
+
*.pyc
|
| 15 |
+
*.pyo
|
| 16 |
+
.venv/
|
| 17 |
+
venv/
|
| 18 |
+
*.egg-info/
|
| 19 |
+
|
| 20 |
+
# IDE
|
| 21 |
+
.vscode/
|
| 22 |
+
.idea/
|
| 23 |
+
*.swp
|
| 24 |
+
*.swo
|
| 25 |
+
|
| 26 |
+
# OS
|
| 27 |
+
.DS_Store
|
| 28 |
+
Thumbs.db
|
| 29 |
+
|
| 30 |
+
# Logs
|
| 31 |
+
*.log
|
| 32 |
+
logs/
|
| 33 |
+
|
| 34 |
+
# Trigger.dev
|
| 35 |
+
.trigger/
|
|
@@ -0,0 +1,83 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Contributing to AI Client Acquisition System
|
| 2 |
+
|
| 3 |
+
Welcome! This guide will help you get started as a contributor.
|
| 4 |
+
|
| 5 |
+
## Getting Started
|
| 6 |
+
|
| 7 |
+
1. **Clone the repo**
|
| 8 |
+
```bash
|
| 9 |
+
git clone https://github.com/iDevBuddy/ai-client-acquisition.git
|
| 10 |
+
cd ai-client-acquisition
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
2. **Install dependencies**
|
| 14 |
+
```bash
|
| 15 |
+
npm install
|
| 16 |
+
cd src/profiling/python-service && pip install -r requirements.txt && cd ../../..
|
| 17 |
+
```
|
| 18 |
+
|
| 19 |
+
3. **Set up environment**
|
| 20 |
+
```bash
|
| 21 |
+
cp .env.example .env
|
| 22 |
+
# Fill in your API keys — ask @iDevBuddy for access
|
| 23 |
+
```
|
| 24 |
+
|
| 25 |
+
4. **Set up database**
|
| 26 |
+
- Create a Supabase project (free)
|
| 27 |
+
- Run migration files from `supabase/migrations/` in order
|
| 28 |
+
|
| 29 |
+
5. **Start development**
|
| 30 |
+
```bash
|
| 31 |
+
# Terminal 1: Trigger.dev tasks
|
| 32 |
+
npm run trigger:dev
|
| 33 |
+
|
| 34 |
+
# Terminal 2: Python AI service
|
| 35 |
+
cd src/profiling/python-service && python main.py
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
## Project Architecture
|
| 39 |
+
|
| 40 |
+
```
|
| 41 |
+
Phase 1: FINDING (current)
|
| 42 |
+
Discovery → Scraping → Pain Detection → Email Finding → AI Profiling → Scoring → Slack
|
| 43 |
+
|
| 44 |
+
Phase 2: OUTREACH (upcoming)
|
| 45 |
+
Email sequences → LinkedIn messaging → Follow-ups → Reply handling
|
| 46 |
+
```
|
| 47 |
+
|
| 48 |
+
## Code Conventions
|
| 49 |
+
|
| 50 |
+
- **TypeScript** for orchestration, discovery, and integrations
|
| 51 |
+
- **Python** for AI profiling service (FastAPI)
|
| 52 |
+
- **Zod** for runtime validation
|
| 53 |
+
- Use `logger` (pino) for all logging — no `console.log`
|
| 54 |
+
- Every LLM call must have a `traceId`
|
| 55 |
+
- Every external API call must go through `retry.ts`
|
| 56 |
+
|
| 57 |
+
## Branch Strategy
|
| 58 |
+
|
| 59 |
+
```
|
| 60 |
+
main → production-ready code
|
| 61 |
+
develop → integration branch
|
| 62 |
+
feature/* → new features
|
| 63 |
+
fix/* → bug fixes
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
## Pull Request Process
|
| 67 |
+
|
| 68 |
+
1. Create a feature branch: `git checkout -b feature/your-feature`
|
| 69 |
+
2. Make your changes
|
| 70 |
+
3. Test locally (run `npm run typecheck` and exercise your change with `npm run trigger:dev`)
|
| 71 |
+
4. Push and create a PR against `develop`
|
| 72 |
+
5. Get at least 1 review before merging
|
| 73 |
+
|
| 74 |
+
## Security Rules
|
| 75 |
+
|
| 76 |
+
⚠️ **NEVER commit API keys or secrets**
|
| 77 |
+
- `.env` is in `.gitignore` — keep it that way
|
| 78 |
+
- Use `.env.example` for templates (no real values)
|
| 79 |
+
- If you accidentally commit a key, rotate it IMMEDIATELY
|
| 80 |
+
|
| 81 |
+
## Questions?
|
| 82 |
+
|
| 83 |
+
Reach out to @iDevBuddy on GitHub or Slack.
|
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🤖 AI Client Acquisition System
|
| 2 |
+
|
| 3 |
+
> Enterprise-grade, hyper-intelligent lead discovery, profiling, and scoring pipeline.
|
| 4 |
+
> Built with production AI engineering practices — not n8n-style hype.
|
| 5 |
+
|
| 6 |
+
[]()
|
| 7 |
+
[]()
|
| 8 |
+
[]()
|
| 9 |
+
[]()
|
| 10 |
+
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
## What This System Does
|
| 14 |
+
|
| 15 |
+
Automatically discovers, qualifies, and profiles potential clients for an AI automation agency.
|
| 16 |
+
|
| 17 |
+
```
|
| 18 |
+
Every day at 9 AM PKT:
|
| 19 |
+
1. Pick next territory (city × industry) → 27 cities, auto-rotation
|
| 20 |
+
2. Search Google for companies → Serper API
|
| 21 |
+
3. Scrape each website → Playwright (headless)
|
| 22 |
+
4. Detect pain signals → "no chatbot", "phone booking only", etc.
|
| 23 |
+
5. Gate 2: Skip if < 2 pain signals
|
| 24 |
+
6. Find decision-maker emails → Hunter.io + Pattern Generation + SMTP
|
| 25 |
+
7. Verify emails → 7-layer verification (FREE)
|
| 26 |
+
8. Find personal LinkedIn + social profiles
|
| 27 |
+
9. AI profiling → MiniMax M2.7 (chain-of-thought reasoning)
|
| 28 |
+
10. Deterministic scoring → 100-point scale, zero hallucination
|
| 29 |
+
11. Alert on Slack → hot leads (85+) instant, daily digest for all
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
## Architecture
|
| 33 |
+
|
| 34 |
+
```
|
| 35 |
+
┌─────────────────────────────────────────────────────┐
|
| 36 |
+
│ CRON: daily-lead-discovery (4 AM UTC = 9 AM PKT) │
|
| 37 |
+
│ → Territory Manager → Google Search → Queue │
|
| 38 |
+
└──────────────────────┬──────────────────────────────┘
|
| 39 |
+
│
|
| 40 |
+
▼ (max 3 concurrent)
|
| 41 |
+
┌─────────────────────────────────────────────────────┐
|
| 42 |
+
│ TASK: process-company │
|
| 43 |
+
│ → Scrape → Pain Signals → Gate 2 │
|
| 44 |
+
└──────────────────────┬──────────────────────────────┘
|
| 45 |
+
│
|
| 46 |
+
▼
|
| 47 |
+
┌─────────────────────────────────────────────────────┐
|
| 48 |
+
│ TASK: enrich-and-profile │
|
| 49 |
+
│ → Hunter → Pattern Gen → SMTP → LinkedIn │
|
| 50 |
+
│ → Python AI Service → Save → Slack Alert │
|
| 51 |
+
└─────────────────────────────────────────────────────┘
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
## Model Chain (All FREE on NVIDIA NIM)
|
| 55 |
+
|
| 56 |
+
| Priority | Model | Parameters | Use Case |
|
| 57 |
+
|----------|-------|-----------|----------|
|
| 58 |
+
| 1st | MiniMax M2.7 | ~100B+ | Profiling, scoring, complex reasoning |
|
| 59 |
+
| 2nd | LLaMA 3.3 70B | 70B | Reliable fallback |
|
| 60 |
+
| 3rd | LLaMA 3.1 8B | 8B | Email classification, simple tasks |
|
| 61 |
+
| 4th | Deterministic | — | Zero hallucination fallback |
|
| 62 |
+
|
| 63 |
+
**Single API key. Single endpoint. $0/day.**
|
| 64 |
+
|
| 65 |
+
## Scoring System (100 points, fully deterministic)
|
| 66 |
+
|
| 67 |
+
```
|
| 68 |
+
Company Fit: 25 pts (industry + size match)
|
| 69 |
+
AI Readiness: 20 pts (tech stack + AI jobs)
|
| 70 |
+
Service Match: 20 pts (pain signals → our services)
|
| 71 |
+
Decision Maker: 20 pts (verified email + LinkedIn + authority)
|
| 72 |
+
Timing: 15 pts (growth signals + active website)
|
| 73 |
+
|
| 74 |
+
Tiers: hot (85+) | warm (70-84) | nurture (50-69) | archive (<50)
|
| 75 |
+
```
|
| 76 |
+
|
| 77 |
+
## Tech Stack
|
| 78 |
+
|
| 79 |
+
| Layer | Technology | Purpose |
|
| 80 |
+
|-------|-----------|---------|
|
| 81 |
+
| Orchestration | Trigger.dev | CRON, task chaining, retry, queuing |
|
| 82 |
+
| Database | Supabase (PostgreSQL) | Data storage, config, state |
|
| 83 |
+
| LLM | NVIDIA NIM (MiniMax + LLaMA) | AI profiling & analysis |
|
| 84 |
+
| Web Scraping | Playwright | Headless browser |
|
| 85 |
+
| Email | Hunter.io + SMTP | Finding & verification |
|
| 86 |
+
| Notifications | Slack Bot | Alerts, commands, digest |
|
| 87 |
+
| AI Service | Python FastAPI | Profiling, scoring, hallucination guard |
|
| 88 |
+
| Language | TypeScript + Python | Core logic |
|
| 89 |
+
|
| 90 |
+
## Project Structure
|
| 91 |
+
|
| 92 |
+
```
|
| 93 |
+
src/
|
| 94 |
+
├── discovery/ # Phase 1: Finding pipeline
|
| 95 |
+
│ ├── lib/ # Core logic
|
| 96 |
+
│ │ ├── contact-enricher.ts # 6-step email pipeline
|
| 97 |
+
│ │ ├── email-classifier.ts # Tier 1/2/3 classification
|
| 98 |
+
│ │ ├── email-verifier.ts # 7-layer verification
|
| 99 |
+
│ │ ├── email-pattern-generator.ts # FREE Snov replacement
|
| 100 |
+
│ │ ├── linkedin-person-finder.ts # Personal LinkedIn
|
| 101 |
+
│ │ ├── social-finder.ts # Instagram, Facebook, Twitter
|
| 102 |
+
│ │ ├── pain-signal-detector.ts # Heuristic + LLM
|
| 103 |
+
│ │ ├── territory-manager.ts # City×industry grid
|
| 104 |
+
│ │ └── web-scraper.ts # Playwright scraper
|
| 105 |
+
│ ├── providers/ # External APIs
|
| 106 |
+
│ │ ├── hunter.ts # Hunter.io integration
|
| 107 |
+
│ │ ├── serper.ts # Google search
|
| 108 |
+
│ │ └── reoon.ts # Email verification
|
| 109 |
+
│ └── trigger-tasks/ # Trigger.dev tasks
|
| 110 |
+
│ ├── auto-discovery.ts # 5 chained tasks
|
| 111 |
+
│ └── manual-discovery.ts # Slack-triggered runs
|
| 112 |
+
├── profiling/ # AI profiling service
|
| 113 |
+
│ └── python-service/ # FastAPI
|
| 114 |
+
│ ├── main.py # /profile endpoint
|
| 115 |
+
│ ├── profiler.py # Chain-of-thought profiling
|
| 116 |
+
│ ├── scorer.py # Signal extraction + deterministic math
|
| 117 |
+
│ ├── hallucination_guard.py # Evidence-based cross-check
|
| 118 |
+
│ ├── nvidia_client.py # Multi-model LLM client
|
| 119 |
+
│ └── config.py # Settings
|
| 120 |
+
├── shared/ # Shared utilities
|
| 121 |
+
│ ├── config/env.ts # Environment validation (Zod)
|
| 122 |
+
│ ├── llm/nvidia-client.ts # Multi-model LLM (MiniMax primary)
|
| 123 |
+
│ ├── llm/prompts.ts # Production prompts
|
| 124 |
+
│ ├── llm/grounding.ts # Evidence-based verification
|
| 125 |
+
│ ├── observability/tracer.ts # Trace IDs + token tracking
|
| 126 |
+
│ ├── pipeline/checkpoint.ts # Crash recovery
|
| 127 |
+
│ ├── supabase/client.ts # DB client
|
| 128 |
+
│ └── utils/ # Retry, rate limiter, logger
|
| 129 |
+
└── slack/ # Slack integration
|
| 130 |
+
├── slack-service.ts # 3-layer delivery
|
| 131 |
+
└── slack-commands.ts # /discover, /leads, /status, etc.
|
| 132 |
+
```
|
| 133 |
+
|
| 134 |
+
## Quick Start
|
| 135 |
+
|
| 136 |
+
See [Setup Guide](docs/setup-guide.md) for detailed instructions.
|
| 137 |
+
|
| 138 |
+
```bash
|
| 139 |
+
# 1. Clone
|
| 140 |
+
git clone https://github.com/iDevBuddy/ai-client-acquisition.git
|
| 141 |
+
cd ai-client-acquisition
|
| 142 |
+
|
| 143 |
+
# 2. Install
|
| 144 |
+
npm install
|
| 145 |
+
cd src/profiling/python-service && pip install -r requirements.txt && cd ../../..
|
| 146 |
+
|
| 147 |
+
# 3. Configure
|
| 148 |
+
cp .env.example .env
|
| 149 |
+
# Fill in your API keys (see docs/setup-guide.md)
|
| 150 |
+
|
| 151 |
+
# 4. Database
|
| 152 |
+
# Run supabase/migrations/*.sql on your Supabase project, in numeric order (001, then 002)
|
| 153 |
+
|
| 154 |
+
# 5. Run
|
| 155 |
+
npm run trigger:dev # Start Trigger.dev (task orchestration)
|
| 156 |
+
cd src/profiling/python-service && python main.py # Start AI service
|
| 157 |
+
```
|
| 158 |
+
|
| 159 |
+
## API Keys Required
|
| 160 |
+
|
| 161 |
+
| Service | Cost | What It Does |
|
| 162 |
+
|---------|------|-------------|
|
| 163 |
+
| NVIDIA NIM | FREE | AI models (MiniMax + LLaMA) |
|
| 164 |
+
| Serper.dev | FREE (2500/mo) | Google search |
|
| 165 |
+
| Hunter.io | FREE (25/mo) | Email finding |
|
| 166 |
+
| Reoon | FREE (20/day) | Email verification |
|
| 167 |
+
| Supabase | FREE | Database |
|
| 168 |
+
| Slack | FREE | Notifications |
|
| 169 |
+
| Trigger.dev | FREE (50K runs/mo) | Job orchestration |
|
| 170 |
+
|
| 171 |
+
**Total cost: $0/month**
|
| 172 |
+
|
| 173 |
+
## Contributing
|
| 174 |
+
|
| 175 |
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for guidelines.
|
| 176 |
+
|
| 177 |
+
## License
|
| 178 |
+
|
| 179 |
+
Private — All rights reserved.
|
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version: "3.9"
|
| 2 |
+
|
| 3 |
+
services:
|
| 4 |
+
# ─── Node.js Orchestration Service ──────────────────────────
|
| 5 |
+
node-service:
|
| 6 |
+
build:
|
| 7 |
+
context: .
|
| 8 |
+
dockerfile: Dockerfile.node
|
| 9 |
+
ports:
|
| 10 |
+
- "3000:3000"
|
| 11 |
+
environment:
|
| 12 |
+
- NODE_ENV=development
|
| 13 |
+
env_file:
|
| 14 |
+
- .env
|
| 15 |
+
depends_on:
|
| 16 |
+
- python-service
|
| 17 |
+
- redis
|
| 18 |
+
restart: unless-stopped
|
| 19 |
+
|
| 20 |
+
# ─── Python AI Profiling Service ────────────────────────────
|
| 21 |
+
python-service:
|
| 22 |
+
build:
|
| 23 |
+
context: ./src/profiling/python-service
|
| 24 |
+
dockerfile: Dockerfile.python
|
| 25 |
+
ports:
|
| 26 |
+
- "8000:8000"
|
| 27 |
+
env_file:
|
| 28 |
+
- .env
|
| 29 |
+
volumes:
|
| 30 |
+
- ./src/profiling/python-service:/app
|
| 31 |
+
restart: unless-stopped
|
| 32 |
+
healthcheck:
|
| 33 |
+
test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
|
| 34 |
+
interval: 30s
|
| 35 |
+
timeout: 10s
|
| 36 |
+
retries: 3
|
| 37 |
+
|
| 38 |
+
# ─── Redis (queue + cache) ───────────────────────────────────
|
| 39 |
+
redis:
|
| 40 |
+
image: redis:7-alpine
|
| 41 |
+
ports:
|
| 42 |
+
- "6379:6379"
|
| 43 |
+
restart: unless-stopped
|
| 44 |
+
|
| 45 |
+
# ─── Ollama (local LLM) ──────────────────────────────────────
|
| 46 |
+
# Uncomment this service to run Ollama in Docker; leave it commented out if running Ollama natively on host
|
| 47 |
+
# ollama:
|
| 48 |
+
# image: ollama/ollama:latest
|
| 49 |
+
# ports:
|
| 50 |
+
# - "11434:11434"
|
| 51 |
+
# volumes:
|
| 52 |
+
# - ollama_data:/root/.ollama
|
| 53 |
+
# restart: unless-stopped
|
| 54 |
+
|
| 55 |
+
volumes:
|
| 56 |
+
ollama_data:
|
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Setup Guide
|
| 2 |
+
|
| 3 |
+
Complete step-by-step guide to get the system running.
|
| 4 |
+
|
| 5 |
+
## Prerequisites
|
| 6 |
+
|
| 7 |
+
- **Node.js** 18+ (recommended: 20 LTS)
|
| 8 |
+
- **Python** 3.11+
|
| 9 |
+
- **npm** 9+
|
| 10 |
+
- **Git**
|
| 11 |
+
|
| 12 |
+
## Step 1: API Keys
|
| 13 |
+
|
| 14 |
+
Create accounts and get keys from these services (all FREE):
|
| 15 |
+
|
| 16 |
+
### 1.1 NVIDIA NIM (LLM — MiniMax + LLaMA)
|
| 17 |
+
1. Go to https://build.nvidia.com
|
| 18 |
+
2. Sign up / login
|
| 19 |
+
3. Click any model → "Get API Key"
|
| 20 |
+
4. Copy key (starts with `nvapi-`)
|
| 21 |
+
5. Free: 1000+ requests/day
|
| 22 |
+
|
| 23 |
+
### 1.2 Serper.dev (Google Search)
|
| 24 |
+
1. Go to https://serper.dev
|
| 25 |
+
2. Sign up with Google
|
| 26 |
+
3. Dashboard → copy API key
|
| 27 |
+
4. Free: 2,500 searches/month
|
| 28 |
+
|
| 29 |
+
### 1.3 Hunter.io (Email Finding)
|
| 30 |
+
1. Go to https://hunter.io
|
| 31 |
+
2. Sign up → Dashboard → API
|
| 32 |
+
3. Copy API key
|
| 33 |
+
4. Free: 25 searches/month
|
| 34 |
+
|
| 35 |
+
### 1.4 Reoon (Email Verification)
|
| 36 |
+
1. Go to https://emailverifier.reoon.com
|
| 37 |
+
2. Sign up → Dashboard → API
|
| 38 |
+
3. Copy API key
|
| 39 |
+
4. Free: 20 verifications/day
|
| 40 |
+
5. NOTE: System optimizes usage (SMTP probe first, Reoon fallback)
|
| 41 |
+
|
| 42 |
+
### 1.5 Supabase (Database)
|
| 43 |
+
1. Go to https://supabase.com
|
| 44 |
+
2. Create project
|
| 45 |
+
3. Project Settings → API
|
| 46 |
+
4. Copy **Project URL** and **service_role key** (not anon key!)
|
| 47 |
+
5. Free: 500MB database
|
| 48 |
+
|
| 49 |
+
### 1.6 Slack Bot
|
| 50 |
+
1. Go to https://api.slack.com/apps → Create New App
|
| 51 |
+
2. Name: "Lead Finder"
|
| 52 |
+
3. OAuth & Permissions → Add scopes: `chat:write`, `commands`, `channels:read`
|
| 53 |
+
4. Install to Workspace → copy Bot Token (`xoxb-...`)
|
| 54 |
+
5. Basic Information → copy Signing Secret
|
| 55 |
+
6. Create 2 channels: `#leads` and `#review`
|
| 56 |
+
7. Get channel IDs: right-click channel → View details → copy ID
|
| 57 |
+
|
| 58 |
+
### 1.7 Trigger.dev (Job Orchestration)
|
| 59 |
+
1. Go to https://trigger.dev → Sign up
|
| 60 |
+
2. Create project
|
| 61 |
+
3. Dashboard → API Keys → copy
|
| 62 |
+
4. Project ID from URL: `trigger.dev/orgs/.../projects/[PROJECT_ID]`
|
| 63 |
+
5. Free: 50,000 runs/month
|
| 64 |
+
|
| 65 |
+
## Step 2: Environment Setup
|
| 66 |
+
|
| 67 |
+
```bash
|
| 68 |
+
cp .env.example .env
|
| 69 |
+
```
|
| 70 |
+
|
| 71 |
+
Edit `.env` and fill in all keys from Step 1.
|
| 72 |
+
|
| 73 |
+
## Step 3: Database Migration
|
| 74 |
+
|
| 75 |
+
Option A — Supabase Dashboard:
|
| 76 |
+
1. Open Supabase → SQL Editor
|
| 77 |
+
2. Paste contents of `supabase/migrations/001_initial_schema.sql` → Run
|
| 78 |
+
3. Paste contents of `supabase/migrations/002_phase1_enhancements.sql` → Run
|
| 79 |
+
|
| 80 |
+
Option B — Supabase CLI:
|
| 81 |
+
```bash
|
| 82 |
+
npx supabase migration up
|
| 83 |
+
```
|
| 84 |
+
|
| 85 |
+
## Step 4: Install & Run
|
| 86 |
+
|
| 87 |
+
```bash
|
| 88 |
+
# Install Node.js dependencies
|
| 89 |
+
npm install
|
| 90 |
+
|
| 91 |
+
# Install Python dependencies
|
| 92 |
+
cd src/profiling/python-service
|
| 93 |
+
pip install -r requirements.txt
|
| 94 |
+
cd ../../..
|
| 95 |
+
|
| 96 |
+
# Terminal 1: Start Trigger.dev
|
| 97 |
+
npm run trigger:dev
|
| 98 |
+
|
| 99 |
+
# Terminal 2: Start Python AI service
|
| 100 |
+
cd src/profiling/python-service
|
| 101 |
+
python main.py
|
| 102 |
+
```
|
| 103 |
+
|
| 104 |
+
## Step 5: Verify
|
| 105 |
+
|
| 106 |
+
The system runs automatically every day at 9 AM PKT (4 AM UTC). To test manually:
|
| 107 |
+
- Use Slack `/discover` command
|
| 108 |
+
- Or trigger from Trigger.dev dashboard
|
| 109 |
+
|
| 110 |
+
## Troubleshooting
|
| 111 |
+
|
| 112 |
+
| Issue | Solution |
|
| 113 |
+
|-------|---------|
|
| 114 |
+
| `NVIDIA_API_KEY` error | Check key starts with `nvapi-` |
|
| 115 |
+
| MiniMax 429 rate limit | System auto-retries after wait |
|
| 116 |
+
| Hunter returns empty | Free tier: 25/month limit reached |
|
| 117 |
+
| SMTP verification fails | Some mail servers block port 25 |
|
| 118 |
+
| Supabase connection error | Check `SUPABASE_URL` has `https://` |
|
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "ai-client-acquisition-system",
|
| 3 |
+
"version": "1.0.0",
|
| 4 |
+
"description": "Enterprise-grade AI Client Acquisition System — Quality-first lead pipeline",
|
| 5 |
+
"main": "dist/index.js",
|
| 6 |
+
"scripts": {
|
| 7 |
+
"build": "tsc",
|
| 8 |
+
"dev": "ts-node-dev --respawn --transpile-only src/index.ts",
|
| 9 |
+
"trigger:dev": "npx trigger.dev@latest dev",
|
| 10 |
+
"typecheck": "tsc --noEmit",
|
| 11 |
+
"lint": "eslint . --ext .ts"
|
| 12 |
+
},
|
| 13 |
+
"dependencies": {
|
| 14 |
+
"@supabase/supabase-js": "^2.43.0",
|
| 15 |
+
"@trigger.dev/sdk": "^3.0.0",
|
| 16 |
+
"playwright": "^1.44.0",
|
| 17 |
+
"zod": "^3.23.0",
|
| 18 |
+
"axios": "^1.7.0",
|
| 19 |
+
"dotenv": "^16.4.0",
|
| 20 |
+
"pino": "^9.2.0",
|
| 21 |
+
"pino-pretty": "^11.2.0",
|
| 22 |
+
"fastest-levenshtein": "^1.0.16",
|
| 23 |
+
"p-limit": "^5.0.0",
|
| 24 |
+
"p-retry": "^6.2.0"
|
| 25 |
+
},
|
| 26 |
+
"devDependencies": {
|
| 27 |
+
"@types/node": "^20.0.0",
|
| 28 |
+
"typescript": "^5.4.0",
|
| 29 |
+
"ts-node-dev": "^2.0.0",
|
| 30 |
+
"eslint": "^9.0.0",
|
| 31 |
+
"@typescript-eslint/parser": "^7.0.0"
|
| 32 |
+
}
|
| 33 |
+
}
|
|
@@ -0,0 +1,354 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Contact Enricher v2 — Full Pipeline
|
| 3 |
+
*
|
| 4 |
+
* Step 1: Find emails (Hunter.io + Pattern Generator + SMTP verify)
|
| 5 |
+
* Snov.io REMOVED — replaced by FREE email pattern generation
|
| 6 |
+
* Step 2: Classify emails (Tier 1/2/3)
|
| 7 |
+
* Step 3: Verify emails (7-layer deep)
|
| 8 |
+
* Step 4: Find personal LinkedIn
|
| 9 |
+
* Step 5: Find social profiles
|
| 10 |
+
* Step 6: Filter for decision-makers only
|
| 11 |
+
*
|
| 12 |
+
* Output: Verified, classified contacts ready for Phase 2
|
| 13 |
+
*/
|
| 14 |
+
|
| 15 |
+
import { searchHunterContacts, type HunterContact } from "../providers/hunter";
|
| 16 |
+
import { generateAndVerifyEmails, findEmailForPerson } from "./email-pattern-generator";
|
| 17 |
+
import { classifyEmail, type ClassificationResult } from "./email-classifier";
|
| 18 |
+
import { verifyEmailDeep, type VerificationResult } from "./email-verifier";
|
| 19 |
+
import { findPersonalLinkedIn, type PersonalLinkedIn } from "./linkedin-person-finder";
|
| 20 |
+
import { findSocialProfiles, type SocialProfiles } from "./social-finder";
|
| 21 |
+
import { getSupabaseClient } from "../../shared/supabase/client";
|
| 22 |
+
import { logger } from "../../shared/utils/logger";
|
| 23 |
+
import { randomUUID } from "crypto";
|
| 24 |
+
|
| 25 |
+
/**
 * One contact as returned by enrichContacts, which resolves to EnrichedContact[].
 *
 * Bundles the contact's identity, the email classification and verification
 * results, the personal LinkedIn match, social profiles, the authority flag,
 * and the provider the contact came from.
 */
export interface EnrichedContact {
|
| 26 |
+
id: string;
|
| 27 |
+
companyId: string;
|
| 28 |
+
fullName: string;
|
| 29 |
+
title: string | null;
|
| 30 |
+
seniority: string | null;
|
| 31 |
+
|
| 32 |
+
// Email intelligence
|
| 33 |
+
email: string | null;
|
| 34 |
+
emailTier: string; // one of: 'personal' | 'authority' | 'context_verified' | 'rejected' (typed as plain string, so not compiler-enforced)
|
| 35 |
+
emailVerification: VerificationResult | null;
|
| 36 |
+
emailClassification: ClassificationResult | null;
|
| 37 |
+
|
| 38 |
+
// LinkedIn (personal profile only — no company-page field is declared on this interface)
|
| 39 |
+
linkedinPersonalUrl: string | null;
|
| 40 |
+
linkedinPersonalConfidence: number; // NOTE(review): numeric scale (0–1 vs 0–100) is not visible here — confirm
|
| 41 |
+
|
| 42 |
+
// Social
|
| 43 |
+
socialProfiles: SocialProfiles | null;
|
| 44 |
+
|
| 45 |
+
// Authority
|
| 46 |
+
authorityConfirmed: boolean;
|
| 47 |
+
authorityReason: string;
|
| 48 |
+
|
| 49 |
+
// Source tracking
|
| 50 |
+
source: "hunter" | "pattern" | "combined";
|
| 51 |
+
providerConfidence: number; // NOTE(review): numeric scale is not visible here — confirm against the provider modules
|
| 52 |
+
}
|
| 53 |
+
|
| 54 |
+
/**
 * Full contact enrichment pipeline for a company.
 *
 * For every raw contact returned by `findAllContacts` the pipeline:
 *   1. classifies the address with `classifyEmail` and drops "rejected" verdicts,
 *   2. runs `verifyEmailDeep` and drops "rejected_invalid" results,
 *   3. looks up a personal LinkedIn profile when a real person name is known,
 *   4. derives the authority flag via `checkAuthority`.
 *
 * Survivors are sorted (authority-confirmed first, then by verification
 * confidence), stamped with the company-level social profiles, written with
 * `saveContacts`, and returned.
 *
 * @param companyId          Company the contacts are attached to.
 * @param domain             Company domain used for contact discovery and verification.
 * @param companyName        Company name, passed to the classifier and the LinkedIn/social lookups.
 * @param employeeCount      Company size handed to the classifier (may be unknown).
 * @param industry           Industry handed to the classifier.
 * @param websiteSnippet     Website excerpt handed to the classifier.
 * @param websiteHtml        Website HTML handed to the social-profile finder.
 * @param companyLinkedInUrl Company LinkedIn URL handed to the personal-profile finder.
 * @param traceId            Trace identifier forwarded to the classifier.
 * @returns The enriched contacts in priority order; empty when nothing was found.
 */
export async function enrichContacts(
  companyId: string,
  domain: string,
  companyName: string,
  employeeCount: number | null,
  industry: string,
  websiteSnippet: string,
  websiteHtml: string,
  companyLinkedInUrl: string | null,
  traceId: string
): Promise<EnrichedContact[]> {
  logger.info({ domain, companyName }, "Starting contact enrichment pipeline");

  // ── Step 1: Find emails from all providers ─────────────────
  const rawContacts = await findAllContacts(domain);

  if (rawContacts.length === 0) {
    logger.info({ domain }, "No contacts found from any provider");
    return [];
  }

  logger.info({ domain, found: rawContacts.length }, "Raw contacts from providers");

  // ── Steps 2-6: Process each contact ────────────────────────
  const enriched: EnrichedContact[] = [];

  for (const raw of rawContacts) {
    if (!raw.email) continue;

    // Step 2: Classify email (Tier 1/2/3)
    const classification = await classifyEmail(
      raw.email,
      { name: companyName, employeeCount, industry, websiteSnippet },
      traceId
    );

    // Rejected by classifier → skip entirely
    if (classification.verdict === "rejected") {
      logger.debug({ email: raw.email, reason: classification.reason }, "Email rejected by classifier");
      continue;
    }

    // Step 3: Deep verification (7 layers)
    const verification = await verifyEmailDeep(raw.email, domain, raw.confidence);

    // Hard invalid → skip
    if (verification.status === "rejected_invalid") {
      logger.debug({ email: raw.email }, "Email rejected by 7-layer verifier");
      continue;
    }

    // Step 4: Find personal LinkedIn.
    // Generic-inbox fallbacks carry the placeholder name "Unknown"; it is longer
    // than three characters, so it must be excluded explicitly or a people
    // search would be issued for a person literally called "Unknown".
    let linkedin: PersonalLinkedIn | null = null;
    const hasRealName =
      !!raw.fullName &&
      raw.fullName.length > 3 &&
      raw.fullName.trim().toLowerCase() !== "unknown";
    if (hasRealName) {
      linkedin = await findPersonalLinkedIn(raw.fullName, companyName, domain, companyLinkedInUrl);
    }

    // Step 5 (social profiles) is company-level and handled once, after the loop.

    // Step 6: Authority check
    const { confirmed, reason } = checkAuthority(raw, classification);

    const contact: EnrichedContact = {
      id: randomUUID(),
      companyId,
      fullName: raw.fullName,
      title: raw.title,
      seniority: raw.seniority,
      email: raw.email,
      emailTier: classification.verdict,
      emailVerification: verification,
      emailClassification: classification,
      linkedinPersonalUrl: linkedin?.url ?? null,
      linkedinPersonalConfidence: linkedin?.confidence ?? 0,
      socialProfiles: null, // set at company level below
      authorityConfirmed: confirmed,
      authorityReason: reason,
      source: raw.source,
      providerConfidence: raw.confidence,
    };

    enriched.push(contact);
  }

  // Sort: authority-confirmed first, then by verification confidence (descending)
  enriched.sort((a, b) => {
    if (a.authorityConfirmed !== b.authorityConfirmed) return a.authorityConfirmed ? -1 : 1;
    return (b.emailVerification?.overallConfidence ?? 0) - (a.emailVerification?.overallConfidence ?? 0);
  });

  // Step 5: Social profiles for the company — fetched once, only if anything survived
  if (enriched.length > 0) {
    const social = await findSocialProfiles(domain, companyName, websiteHtml);
    for (const c of enriched) {
      c.socialProfiles = social;
    }
  }

  logger.info({
    domain,
    rawFound: rawContacts.length,
    afterClassification: enriched.length,
    authorityConfirmed: enriched.filter(c => c.authorityConfirmed).length,
    withLinkedIn: enriched.filter(c => c.linkedinPersonalUrl).length,
  }, "Contact enrichment pipeline complete");

  // Save to database
  await saveContacts(enriched);

  return enriched;
}
|
| 178 |
+
|
| 179 |
+
// ─── Find contacts from all providers ─────────────────────────
|
| 180 |
+
// Strategy: Hunter.io (free 25/mo) for names+titles+emails
|
| 181 |
+
// Pattern Generator (FREE, unlimited) to find more emails
|
| 182 |
+
// Snov.io REMOVED — replaced by pattern generation
|
| 183 |
+
|
| 184 |
+
/** Unprocessed contact as collected by `findAllContacts`, before classification. */
interface RawContact {
  /** Display name; the generic-inbox fallback uses the placeholder "Unknown". */
  fullName: string;
  email: string;
  title: string | null;
  seniority: string | null;
  /** Provider confidence (Hunter's value, or the pattern confidence scaled by 100). */
  confidence: number;
  /** Which discovery path produced the address. */
  source: "hunter" | "pattern";
}
|
| 192 |
+
|
| 193 |
+
/**
 * Collects raw contacts for a domain from every available source.
 *
 *   1. Hunter.io — names, titles and emails. A failure is logged and ignored.
 *   2. Pattern generator — for people Hunter named but for whom no contact
 *      (with an email) was recorded, try to derive a deliverable address.
 *   3. Generic inboxes (info@, contact@, ...) — only when nothing else was found.
 *
 * Emails are de-duplicated case-insensitively across all three sources.
 *
 * @param domain Company domain to search.
 * @returns Raw contacts in discovery order (possibly empty only if a source list is empty).
 */
async function findAllContacts(domain: string): Promise<RawContact[]> {
  const contacts: RawContact[] = [];
  const seenEmails = new Set<string>();
  // Lower-cased full names that already have a contact entry.
  const namesWithEmail = new Set<string>();
  const namesFromHunter: { firstName: string; lastName: string; title: string | null; seniority: string | null }[] = [];

  // ── Source 1: Hunter.io (25 free/month) ─────────────────────
  // Hunter gives us NAMES + TITLES + EMAILS
  try {
    const hunterResults = await searchHunterContacts(domain);
    for (const h of hunterResults) {
      const email = h.value?.toLowerCase();
      const firstName = h.first_name ?? "";
      const lastName = h.last_name ?? "";
      const fullName = `${firstName} ${lastName}`.trim();

      // Save name for pattern generation later
      if (firstName && lastName) {
        namesFromHunter.push({
          firstName,
          lastName,
          title: h.position ?? null,
          seniority: h.seniority ?? null,
        });
      }

      if (email && !seenEmails.has(email)) {
        seenEmails.add(email);
        namesWithEmail.add(fullName.toLowerCase());
        contacts.push({
          fullName,
          email,
          title: h.position ?? null,
          seniority: h.seniority ?? null,
          confidence: h.confidence ?? 0,
          source: "hunter",
        });
      }
    }
  } catch (err) {
    logger.warn({ domain, err }, "Hunter search failed — falling back to pattern generation");
  }

  // ── Source 2: Pattern Generator (FREE, UNLIMITED) ──────────
  // Only runs for names Hunter returned without a usable email; when Hunter
  // returned nothing there are no names to work from and this loop is a no-op.
  for (const person of namesFromHunter) {
    const personName = `${person.firstName} ${person.lastName}`;

    // Exact (case-insensitive) name match. A substring test would treat
    // "Al Li" as already covered by "Alice Lin" and skip the lookup.
    if (namesWithEmail.has(personName.trim().toLowerCase())) continue;

    // Generate email patterns and SMTP verify (FREE)
    const generated = await findEmailForPerson(personName, domain);
    if (!generated || generated.smtpStatus !== "deliverable") continue;

    // Hunter addresses are stored lower-cased, so compare like with like.
    const generatedEmail = generated.email.toLowerCase();
    if (seenEmails.has(generatedEmail)) continue;

    seenEmails.add(generatedEmail);
    namesWithEmail.add(personName.trim().toLowerCase());
    contacts.push({
      fullName: personName,
      email: generatedEmail,
      title: person.title,
      seniority: person.seniority,
      confidence: generated.confidence * 100, // generator reports 0.0-1.0
      source: "pattern",
    });
  }

  // ── Source 3: If still no contacts, try generic inbox prefixes ─
  if (contacts.length === 0) {
    const ownerPatterns = ["info", "contact", "hello", "admin"];
    for (const prefix of ownerPatterns) {
      const email = `${prefix}@${domain}`.toLowerCase();
      if (!seenEmails.has(email)) {
        seenEmails.add(email);
        contacts.push({
          fullName: "Unknown",
          email,
          title: null,
          seniority: null,
          confidence: 20,
          source: "pattern",
        });
      }
    }
  }

  logger.info({
    domain,
    hunterContacts: contacts.filter(c => c.source === "hunter").length,
    patternContacts: contacts.filter(c => c.source === "pattern").length,
    total: contacts.length,
  }, "Contact finding complete (Hunter + Pattern Generator)");

  return contacts;
}
|
| 294 |
+
|
| 295 |
+
// ─── Authority check ─────────────────────────────────────────
|
| 296 |
+
|
| 297 |
+
/**
 * Decides whether a contact is likely to hold decision-making authority,
 * based purely on the classifier verdict (plus the job title for the reason text).
 *
 * @param contact        Raw contact; `title` and `email` are read.
 * @param classification Classifier output; `verdict`, `confidence` and `reason` are read.
 * @returns `confirmed` flag and a human-readable `reason`.
 */
function checkAuthority(
  contact: RawContact,
  classification: ClassificationResult
): { confirmed: boolean; reason: string } {
  const SENIOR_TITLE_RE = /\b(ceo|cto|coo|cfo|cmo|founder|co-founder|owner|partner|director|vp|vice\s*president|president|head|principal|managing|general\s*manager)\b/i;

  switch (classification.verdict) {
    case "personal": {
      // Personal addresses are always confirmed; a senior title only changes the wording.
      const jobTitle = contact.title;
      const reason =
        jobTitle && SENIOR_TITLE_RE.test(jobTitle)
          ? `Personal email + senior title: ${jobTitle}`
          : "Personal email format — likely individual decision maker";
      return { confirmed: true, reason };
    }

    case "authority": {
      const localPart = contact.email.split("@")[0];
      return { confirmed: true, reason: `Authority email prefix: ${localPart}` };
    }

    case "context_verified":
      // Only trust the context check when it is reasonably confident.
      if (classification.confidence >= 0.7) {
        return { confirmed: true, reason: classification.reason };
      }
      break;

    case "outsourcing":
      return { confirmed: false, reason: "Outsourcing/vendor email — may reach procurement, not decision maker" };
  }

  return { confirmed: false, reason: "Authority not confirmed" };
}
|
| 326 |
+
|
| 327 |
+
// ─── Save to database ────────────────────────────────────────
|
| 328 |
+
|
| 329 |
+
/**
 * Upserts each enriched contact into the `contacts` table, keyed on
 * (company_id, email). Saving is best-effort: a failure for one contact is
 * logged and the remaining contacts are still written.
 *
 * The Supabase client reports database failures through the returned `error`
 * field rather than by throwing, so that field is checked explicitly; the
 * surrounding try/catch only covers unexpected exceptions.
 *
 * NOTE(review): every row carries a freshly generated `id`, so an upsert that
 * hits the conflict target presumably overwrites the stored id — confirm that
 * this is intended.
 *
 * @param contacts Contacts to persist.
 */
async function saveContacts(contacts: EnrichedContact[]): Promise<void> {
  const db = getSupabaseClient();

  for (const c of contacts) {
    try {
      const { error } = await db.from("contacts").upsert({
        id: c.id,
        company_id: c.companyId,
        full_name: c.fullName,
        title: c.title,
        seniority: c.seniority,
        email: c.email,
        email_verified: c.emailVerification?.status === "verified_deliverable",
        email_tier: c.emailTier,
        email_verification_layers: c.emailVerification?.layers ?? {},
        linkedin_personal_url: c.linkedinPersonalUrl,
        social_profiles: c.socialProfiles ?? {},
        authority_confirmed: c.authorityConfirmed,
        confidence: c.emailVerification?.overallConfidence ?? c.providerConfidence,
        source: c.source,
      }, { onConflict: "company_id,email" });

      if (error) {
        logger.warn({ email: c.email, err: error }, "Contact save failed — continuing");
      }
    } catch (err) {
      logger.warn({ email: c.email, err }, "Contact save failed — continuing");
    }
  }
}
|
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { distance } from "fastest-levenshtein";
|
| 2 |
+
import { getSupabaseClient } from "../../shared/supabase/client";
|
| 3 |
+
import { logger } from "../../shared/utils/logger";
|
| 4 |
+
|
| 5 |
+
/**
 * Checks whether a company already exists in Supabase.
 *
 * An exact match on the normalized domain is tried first; if none is found,
 * the name is fuzzy-matched (normalized Levenshtein similarity >= 0.88)
 * against the 500 most recently discovered companies.
 *
 * Query failures are logged and treated as "no match", so a database error
 * never blocks processing — it only weakens de-duplication.
 *
 * @param domain Company domain (normalized before comparison).
 * @param name   Company name used for the fuzzy fallback.
 * @returns `{ isDupe: true, existingId }` for a duplicate, otherwise `{ isDupe: false }`.
 */
export async function isDuplicate(
  domain: string,
  name: string
): Promise<{ isDupe: boolean; existingId?: string }> {
  const db = getSupabaseClient();

  // ── 1. Exact domain match (fastest) ─────────────────────────
  const { data: byDomain, error: domainError } = await db
    .from("companies")
    .select("id, domain, name")
    .eq("domain", normalizeDomain(domain))
    .maybeSingle();

  if (domainError) {
    logger.warn({ domain, err: domainError }, "Duplicate check: domain lookup failed");
  }

  if (byDomain) {
    logger.debug({ domain, existingId: byDomain.id }, "Duplicate: exact domain match");
    return { isDupe: true, existingId: byDomain.id };
  }

  // ── 2. Fuzzy name match against recent records ──────────────
  const { data: recent, error: recentError } = await db
    .from("companies")
    .select("id, name")
    .order("discovered_at", { ascending: false })
    .limit(500);

  if (recentError) {
    logger.warn({ domain, err: recentError }, "Duplicate check: recent-companies lookup failed");
  }

  if (!recent) return { isDupe: false };

  const normalizedInput = normalizeName(name);
  // A name that normalizes to nothing cannot reach the similarity threshold.
  if (!normalizedInput) return { isDupe: false };

  for (const existing of recent) {
    // Rows without a name cannot be compared (and would crash normalizeName).
    if (!existing.name) continue;

    const normalizedExisting = normalizeName(existing.name);
    if (!normalizedExisting) continue;

    const dist = distance(normalizedInput, normalizedExisting);
    const maxLen = Math.max(normalizedInput.length, normalizedExisting.length);
    const similarity = 1 - dist / maxLen;

    if (similarity >= 0.88) {
      logger.debug(
        { input: name, existing: existing.name, similarity: similarity.toFixed(2) },
        "Duplicate: fuzzy name match"
      );
      return { isDupe: true, existingId: existing.id };
    }
  }

  return { isDupe: false };
}
|
| 56 |
+
|
| 57 |
+
/**
 * Checks the suppression list before any processing.
 *
 * The domain is normalized the same way `isDuplicate` normalizes it, so that
 * "WWW.Example.com/" and "example.com" hit the same suppression entry.
 * NOTE(review): assumes `suppression_list.domain` stores normalized domains — confirm.
 *
 * A failed lookup is logged and reported as "not suppressed".
 *
 * @param domain Company domain to look up.
 * @returns `true` when a suppression entry exists for the domain.
 */
export async function isSuppressed(domain: string): Promise<boolean> {
  const db = getSupabaseClient();
  const { data, error } = await db
    .from("suppression_list")
    .select("id")
    .eq("domain", normalizeDomain(domain))
    .maybeSingle();

  if (error) {
    logger.warn({ domain, err: error }, "Suppression list lookup failed");
  }

  return !!data;
}
|
| 69 |
+
|
| 70 |
+
// ─── Helpers ─────────────────────────────────────────────────
|
| 71 |
+
|
| 72 |
+
/**
 * Canonicalizes a domain for comparison: trims whitespace, lower-cases,
 * drops a leading "www." and a single trailing slash.
 *
 * Whitespace is trimmed FIRST — the `^www\.` and `\/$` anchors would otherwise
 * not match input that carries leading or trailing spaces.
 */
function normalizeDomain(domain: string): string {
  return domain
    .trim()
    .toLowerCase()
    .replace(/^www\./, "")
    .replace(/\/$/, "")
    .trim();
}
|
| 75 |
+
|
| 76 |
+
/**
 * Canonicalizes a company name for fuzzy comparison: lower-cases, removes
 * common legal suffixes (inc, ltd, llc, ...), strips punctuation and collapses
 * whitespace runs to a single space.
 *
 * Collapsing matters because removing a suffix or punctuation from the middle
 * of a name leaves a double space, which would otherwise count as an edit in
 * the Levenshtein distance.
 */
function normalizeName(name: string): string {
  return name
    .toLowerCase()
    .replace(/\b(inc|ltd|llc|corp|co|limited|plc|gmbh|pty|pvt|srl|bv|ag|sa)\b\.?/gi, "")
    .replace(/[^a-z0-9\s]/g, "")
    .replace(/\s+/g, " ")
    .trim();
}
|
|
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Email Classifier — 3-Tier Decision System
|
| 3 |
+
*
|
| 4 |
+
* Tier 1: Hard REJECT (noreply, support, jobs → instant discard)
|
| 5 |
+
* Tier 2: LLM Context Check (operations, admin, system → depends on company size/industry)
|
| 6 |
+
* Tier 3: High confidence KEEP (personal format, ceo@, partnerships@)
|
| 7 |
+
*
|
| 8 |
+
* Key insight: admin@ at a 5-person dental clinic reaches the owner.
|
| 9 |
+
* admin@ at a 500-person corp reaches an assistant. Context matters.
|
| 10 |
+
*/
|
| 11 |
+
|
| 12 |
+
import { callLLM } from "../../shared/llm/nvidia-client";
|
| 13 |
+
import { SYSTEM_PROMPTS, buildEmailClassifyPrompt } from "../../shared/llm/prompts";
|
| 14 |
+
import { MODELS } from "../../shared/llm/nvidia-client";
|
| 15 |
+
import { logger } from "../../shared/utils/logger";
|
| 16 |
+
|
| 17 |
+
/** Which decision tier produced the classification. */
export type EmailTier =
  | "reject"
  | "context_check"
  | "keep";

/** Final verdict attached to an address. */
export type EmailVerdict =
  | "personal"
  | "authority"
  | "context_verified"
  | "outsourcing"
  | "rejected";

/** Outcome of classifying a single email address. */
export interface ClassificationResult {
  /** The address that was classified. */
  email: string;
  tier: EmailTier;
  verdict: EmailVerdict;
  /** Confidence in the verdict; the rule-based paths use values between 0 and 1. */
  confidence: number;
  /** Short description of who the address probably reaches. */
  likelyReaches: string;
  /** Explanation of why the verdict was chosen. */
  reason: string;
}
|
| 28 |
+
|
| 29 |
+
// ─── Tier 1: ALWAYS REJECT ──────────────────────────────────
// Lookups use the local part with every non-letter removed, so the hyphen and
// underscore variants below are covered by their stripped forms
// ("noreply", "donotreply"); they are kept for readability.

const HARD_REJECT_PREFIXES = new Set([
  // Automated / system
  "noreply", "no-reply", "no_reply", "donotreply", "do-not-reply",
  "notifications", "automated", "bounces", "mailer",
  "postmaster", "unsubscribe", "spam", "abuse",
  // Support (never reaches decision-maker)
  "support", "helpdesk", "tickets", "complaints", "feedback",
  // Jobs (irrelevant)
  "jobs", "careers", "apply", "recruitment", "hiring", "talent",
]);

// ─── Tier 2: CONTEXT-DEPENDENT (LLM decides) ────────────────

const CONTEXT_CHECK_PREFIXES = new Set([
  "operations", "admin", "system", "info", "office",
  "hello", "contact", "enquiries", "general", "team",
  "accounts", "finance", "billing", "sales", "marketing",
  "hr", "legal", "compliance", "reception", "manager",
]);

// ─── Tier 3: HIGH CONFIDENCE KEEP ───────────────────────────

const AUTHORITY_PREFIXES = new Set([
  "ceo", "owner", "founder", "president", "cto", "coo",
  "partner", "principal", "director", "md", "gm", "head",
]);

const OUTSOURCING_PREFIXES = new Set([
  "partnerships", "vendors", "procurement", "outsource",
  "collaborate", "projects", "business", "growth",
]);

// ─── Personal email pattern (firstname, firstname.lastname) ─
const PERSONAL_PATTERN = /^[a-z]{2,}(\.[a-z]{2,})?$/;
const INITIAL_PATTERN = /^[a-z]\.[a-z]{2,}$/; // j.smith

/**
 * Main classifier — determines if an email is worth pursuing.
 *
 * Decision order:
 *   1. hard-reject prefixes            → "rejected"
 *   2. authority prefixes (ceo@, ...)  → "authority"
 *   3. outsourcing prefixes            → "outsourcing"
 *   4. context-dependent prefixes      → LLM context check
 *   5. personal-looking local part     → "personal"
 *   6. anything else                   → LLM context check
 *
 * The role tables MUST be consulted before the personal pattern: that pattern
 * accepts any run of two or more letters, so "ceo", "info" or "admin" would
 * otherwise all be classified as personal addresses and the authority,
 * outsourcing and context-check tiers would never be reached.
 *
 * @param email          Address to classify.
 * @param companyContext Company facts forwarded to the LLM context check.
 * @param traceId        Trace identifier forwarded to the LLM context check.
 * @returns The classification for the address.
 */
export async function classifyEmail(
  email: string,
  companyContext: {
    name: string;
    employeeCount: number | null;
    industry: string;
    websiteSnippet: string;
  },
  traceId: string
): Promise<ClassificationResult> {
  const fullPrefix = email.split("@")[0].toLowerCase();
  // Letters only, so "no-reply" and "support2" hit the same table entries.
  const prefix = fullPrefix.replace(/[^a-z]/g, "");

  // ── Tier 1: Hard reject ────────────────────────────────────
  if (HARD_REJECT_PREFIXES.has(prefix)) {
    return {
      email,
      tier: "reject",
      verdict: "rejected",
      confidence: 1.0,
      likelyReaches: "automated inbox or department queue",
      reason: `"${fullPrefix}@" is a known non-personal email type`,
    };
  }

  // ── Tier 3: Authority prefix → instant keep ────────────────
  if (AUTHORITY_PREFIXES.has(prefix)) {
    return {
      email,
      tier: "keep",
      verdict: "authority",
      confidence: 0.90,
      likelyReaches: `${prefix.toUpperCase()} or equivalent executive`,
      reason: `"${fullPrefix}@" is a known decision-maker prefix`,
    };
  }

  // ── Tier 3: Outsourcing signal → keep ──────────────────────
  if (OUTSOURCING_PREFIXES.has(prefix)) {
    return {
      email,
      tier: "keep",
      verdict: "outsourcing",
      confidence: 0.80,
      likelyReaches: "vendor/partnership manager (purchasing authority likely)",
      reason: `"${fullPrefix}@" signals company outsources services`,
    };
  }

  // ── Tier 2: Context check needed → ask LLM ─────────────────
  if (CONTEXT_CHECK_PREFIXES.has(prefix)) {
    return contextCheckWithLLM(email, companyContext, traceId);
  }

  // ── Tier 3: Personal format → instant keep ─────────────────
  if (PERSONAL_PATTERN.test(fullPrefix) || INITIAL_PATTERN.test(fullPrefix)) {
    return {
      email,
      tier: "keep",
      verdict: "personal",
      confidence: 0.95,
      likelyReaches: "individual person (personal email format)",
      reason: `"${fullPrefix}@" matches personal email pattern`,
    };
  }

  // ── Unknown prefix → default to LLM context check ──────────
  return contextCheckWithLLM(email, companyContext, traceId);
}
|
| 139 |
+
|
| 140 |
+
/**
 * LLM-powered context check for ambiguous email prefixes.
 * Uses the FAST model, since this is a simple keep/reject classification.
 *
 * Outcomes:
 *   - parsed LLM answer      → "context_verified" or "rejected" per the `keep` flag,
 *                              with the reported confidence clamped to [0, 1]
 *                              (0.5 when it is missing or not a number);
 *   - unparseable LLM answer → kept as "context_verified" with confidence 0.5;
 *   - LLM call throws        → company-size heuristic (fewer than 30 employees, or
 *                              unknown size, keeps; otherwise rejects), confidence 0.4.
 *
 * @param email   Address being classified.
 * @param context Company facts used to build the prompt and the fallback heuristic.
 * @param traceId Trace identifier forwarded to the LLM client.
 * @returns The classification for the address.
 */
async function contextCheckWithLLM(
  email: string,
  context: {
    name: string;
    employeeCount: number | null;
    industry: string;
    websiteSnippet: string;
  },
  traceId: string
): Promise<ClassificationResult> {
  try {
    const response = await callLLM({
      operation: "email_classify",
      model: MODELS.FAST, // 8B model — fast + cheap for simple classification
      systemPrompt: SYSTEM_PROMPTS.EMAIL_CLASSIFIER,
      userPrompt: buildEmailClassifyPrompt({
        email,
        company_name: context.name,
        company_size: context.employeeCount,
        industry: context.industry,
        website_snippet: context.websiteSnippet,
      }),
      temperature: 0.1,
      maxTokens: 200,
      jsonMode: true,
      traceId,
    });

    if (response.parsed) {
      const keep = response.parsed.keep === true;

      // The model's output is untrusted: it may omit the field, return text,
      // or return a value outside the 0-1 range that downstream checks expect.
      const reported = Number(response.parsed.confidence ?? 0.5);
      const confidence = Number.isFinite(reported)
        ? Math.min(1, Math.max(0, reported))
        : 0.5;

      return {
        email,
        tier: "context_check",
        verdict: keep ? "context_verified" : "rejected",
        confidence,
        likelyReaches: String(response.parsed.likely_reaches ?? "unknown"),
        reason: String(response.parsed.reason ?? "LLM context check"),
      };
    }

    // LLM failed to respond properly → conservative: keep it, low confidence
    return {
      email,
      tier: "context_check",
      verdict: "context_verified",
      confidence: 0.5,
      likelyReaches: "unknown — LLM parse failed",
      reason: "LLM context check failed — keeping with low confidence",
    };

  } catch (err) {
    // The fallback may reject as well as keep, so the message must not claim "keeping".
    logger.warn({ email, err }, "Email LLM classify failed — using company-size fallback");

    // Fallback: rule-based size heuristic (unknown size counts as small)
    const isSmall = (context.employeeCount ?? 0) < 30;
    return {
      email,
      tier: "context_check",
      verdict: isSmall ? "context_verified" : "rejected",
      confidence: 0.4,
      likelyReaches: isSmall ? "likely owner/manager (small company)" : "likely department inbox (large company)",
      reason: `Fallback: company size ${context.employeeCount ?? "unknown"} → ${isSmall ? "small=keep" : "large=reject"}`,
    };
  }
}
|
|
@@ -0,0 +1,249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Email Pattern Generator — Snov.io Replacement (FREE, UNLIMITED)
|
| 3 |
+
*
|
| 4 |
+
* How it works:
|
| 5 |
+
* 1. Take a person's name: "John Smith"
|
| 6 |
+
* 2. Generate ALL common email patterns: john@, smith@, john.smith@, j.smith@, etc.
|
| 7 |
+
* 3. Verify each via SMTP handshake (Layer 5 in our verifier — FREE)
|
| 8 |
+
* 4. First one that passes SMTP = real email
|
| 9 |
+
*
|
| 10 |
+
* This is what tools like Hunter/Snov ACTUALLY do internally.
|
| 11 |
+
* We're cutting out the middleman.
|
| 12 |
+
*
|
| 13 |
+
* Cost: $0 forever
|
| 14 |
+
* Daily limit: unlimited
|
| 15 |
+
* Accuracy: Higher than Snov (we verify each guess ourselves)
|
| 16 |
+
*/
|
| 17 |
+
|
| 18 |
+
import { logger } from "../../shared/utils/logger";
|
| 19 |
+
import dns from "dns/promises";
|
| 20 |
+
import net from "net";
|
| 21 |
+
|
| 22 |
+
/** One candidate address produced by the pattern generator. */
export interface GeneratedEmail {
  /** Candidate address, e.g. the built local part joined to the domain. */
  email: string;
  /** Name of the pattern that built it ("firstname.lastname", "firstinitial.lastname", ...). */
  pattern: string;
  /** Outcome of the SMTP probe; "unknown" when no probe could decide. */
  smtpStatus: "deliverable" | "undeliverable" | "unknown";
  /** Confidence in the candidate, from 0.0 to 1.0. */
  confidence: number;
}
|
| 28 |
+
|
| 29 |
+
// ─── Common email patterns (ordered by frequency) ────────────
|
| 30 |
+
// Source: Analysis of 1M+ business emails worldwide
|
| 31 |
+
|
| 32 |
+
// Each entry pairs a pattern name with a builder that turns the sanitized
// first and last name into an email local part. Order is significant:
// candidates are tried from the top.
const PATTERNS = [
  // Most common (70% of businesses)
  { name: "firstname", build: (first: string, last: string) => first },
  { name: "firstname.lastname", build: (first: string, last: string) => first + "." + last },
  { name: "firstinitial.lastname", build: (first: string, last: string) => first[0] + "." + last },
  { name: "firstinitial_lastname", build: (first: string, last: string) => first[0] + last },
  { name: "firstname_lastname", build: (first: string, last: string) => first + "_" + last },

  // Common (20% of businesses)
  { name: "lastname.firstname", build: (first: string, last: string) => last + "." + first },
  { name: "lastname", build: (first: string, last: string) => last },
  { name: "firstname_lastinitial", build: (first: string, last: string) => first + last[0] },
  { name: "firstinitial_lastinitial", build: (first: string, last: string) => first[0] + last[0] },

  // Less common but valid (10%)
  { name: "firstname-lastname", build: (first: string, last: string) => first + "-" + last },
  { name: "first2_lastname", build: (first: string, last: string) => first.slice(0, 2) + last },
];
|
| 50 |
+
|
| 51 |
+
/**
 * Builds candidate addresses for a person at a domain and probes them over SMTP.
 *
 * Flow:
 *   - names are lower-cased and reduced to a-z; too-short names yield `[]`;
 *   - a domain without MX records yields `[]`;
 *   - on a catch-all domain the first three candidates are returned unprobed
 *     with status "unknown" and confidence 0.5;
 *   - otherwise candidates are probed in order until one is deliverable; that
 *     one is placed first in the result, preceded candidates follow as
 *     "undeliverable", and later candidates are not probed or returned.
 *
 * @param firstName Person's first name (e.g., "John")
 * @param lastName  Person's last name (e.g., "Smith")
 * @param domain    Company domain (e.g., "abcdental.com")
 * @returns Generated emails with their verification status
 */
export async function generateAndVerifyEmails(
  firstName: string,
  lastName: string,
  domain: string
): Promise<GeneratedEmail[]> {
  if (!firstName || !lastName || !domain) return [];

  const lettersOnly = (value: string) => value.toLowerCase().replace(/[^a-z]/g, "");
  const f = lettersOnly(firstName);
  const l = lettersOnly(lastName);

  if (f.length < 2 || l.length < 1) return [];

  // Step 1: the domain must be able to receive mail at all
  if (!(await checkMXRecord(domain))) {
    logger.debug({ domain }, "No MX records — skipping pattern generation");
    return [];
  }

  // Step 2: a catch-all domain accepts everything, so probing proves nothing
  const isCatchAll = await checkCatchAll(domain);

  // Step 3: one candidate per pattern, in pattern order
  const candidates = PATTERNS.map(({ name, build }) => ({
    email: `${build(f, l)}@${domain}`,
    pattern: name,
    smtpStatus: "unknown" as const,
    confidence: 0,
  }));

  // Step 4: catch-all → return the top patterns unverified, medium confidence
  if (isCatchAll) {
    logger.debug({ domain }, "Catch-all domain — returning top patterns without SMTP");
    return candidates.slice(0, 3).map(candidate => ({
      ...candidate,
      smtpStatus: "unknown" as const,
      confidence: 0.5,
    }));
  }

  // Step 5: probe in order and stop at the first deliverable address
  const results: GeneratedEmail[] = [];
  let deliverableCount = 0;

  for (const candidate of candidates) {
    const probe = await smtpVerify(candidate.email, domain);

    if (probe.deliverable) {
      deliverableCount += 1;
      // The hit goes to the front; nothing after it needs checking.
      results.unshift({ ...candidate, smtpStatus: "deliverable", confidence: 0.92 });
      break;
    }

    results.push({ ...candidate, smtpStatus: "undeliverable", confidence: 0.1 });
  }

  logger.info({ domain, generated: candidates.length, deliverable: deliverableCount }, "Pattern generation complete");

  return results;
}
|
| 127 |
+
|
| 128 |
+
/**
 * Convenience lookup for a person whose full name is already known
 * (for example, a name returned by Hunter).
 *
 * The first and last whitespace-separated tokens of `fullName` are used as
 * the first and last name. Names with fewer than two tokens yield `null`.
 * The first candidate whose SMTP status is "deliverable" wins; otherwise the
 * first candidate returned by the generator is used, or `null` if there is none.
 */
export async function findEmailForPerson(
  fullName: string,
  domain: string
): Promise<GeneratedEmail | null> {
  const tokens = fullName.trim().split(/\s+/);
  if (tokens.length < 2) {
    return null;
  }

  const givenName = tokens[0];
  const familyName = tokens[tokens.length - 1];
  const candidates = await generateAndVerifyEmails(givenName, familyName, domain);

  for (const candidate of candidates) {
    if (candidate.smtpStatus === "deliverable") {
      return candidate;
    }
  }
  return candidates[0] ?? null;
}
|
| 145 |
+
|
| 146 |
+
// ─── MX Record Check (FREE) ─────────────────────────────────
|
| 147 |
+
|
| 148 |
+
/**
 * Reports whether DNS returns at least one MX record for `domain`.
 * Any lookup failure is treated as "no MX records".
 */
async function checkMXRecord(domain: string): Promise<boolean> {
  let mxHosts;
  try {
    mxHosts = await dns.resolveMx(domain);
  } catch {
    return false;
  }
  return mxHosts.length > 0;
}
|
| 156 |
+
|
| 157 |
+
// ─── Catch-all Detection (FREE — uses random probe) ─────────
|
| 158 |
+
|
| 159 |
+
/**
 * Probes the domain with a made-up, timestamped address.
 * If the SMTP check reports that address as deliverable, the domain is
 * considered catch-all.
 */
async function checkCatchAll(domain: string): Promise<boolean> {
  const probeAddress = `xq7z9k2m4n${Date.now()}@${domain}`;
  const { deliverable } = await smtpVerify(probeAddress, domain);
  return deliverable;
}
|
| 166 |
+
|
| 167 |
+
// ─── SMTP Verification (FREE, UNLIMITED) ─────────────────────
|
| 168 |
+
// Direct SMTP handshake — no third-party API needed
|
| 169 |
+
|
| 170 |
+
/**
 * Checks whether a mailbox is accepted by the domain's highest-priority MX
 * host by running EHLO → MAIL FROM → RCPT TO over port 25. No message is sent.
 *
 * SMTP replies are parsed line by line. A reply may span several lines
 * ("250-…" continuation lines followed by a final "250 …") and may arrive
 * split across TCP chunks, so only the final line of each reply advances the
 * conversation. Treating every chunk as a reply could misread the MAIL FROM
 * acknowledgement as the RCPT TO verdict.
 *
 * The returned promise never rejects. Every failure path resolves with
 * `deliverable: false` and a short diagnostic in `response`.
 *
 * @param email  Address passed to RCPT TO.
 * @param domain Domain whose MX records are resolved.
 * @returns `deliverable` is true only when RCPT TO is answered with 250.
 */
async function smtpVerify(
  email: string,
  domain: string
): Promise<{ deliverable: boolean; response: string }> {
  // CR/LF or angle brackets in the address would let it inject SMTP commands.
  if (/[\r\n<>]/.test(email)) {
    return { deliverable: false, response: "invalid_address" };
  }

  return new Promise((resolve) => {
    let socket: net.Socket | null = null;
    let settled = false;

    // Single exit point: resolves once, stops the timer and releases the socket.
    const finish = (deliverable: boolean, response: string): void => {
      if (settled) return;
      settled = true;
      clearTimeout(overallTimer);
      socket?.destroy();
      resolve({ deliverable, response });
    };

    // Hard cap for the whole check, DNS lookup included.
    const overallTimer = setTimeout(() => finish(false, "timeout"), 8_000);

    const runHandshake = (mxHost: string): void => {
      const conn = new net.Socket();
      socket = conn;
      let step = 0;
      let pending = "";

      // Handles the final line of one complete SMTP reply.
      const onReply = (reply: string): void => {
        const code = reply.slice(0, 3);
        if (step === 0 && code === "220") {
          // Server greeting → send EHLO
          conn.write("EHLO verify.local\r\n");
          step = 1;
        } else if (step === 1 && code === "250") {
          // EHLO accepted → send MAIL FROM
          conn.write("MAIL FROM:<verify@verify.local>\r\n");
          step = 2;
        } else if (step === 2 && code === "250") {
          // MAIL FROM accepted → send RCPT TO (the real check)
          conn.write(`RCPT TO:<${email}>\r\n`);
          step = 3;
        } else if (step === 3) {
          conn.write("QUIT\r\n");
          if (code === "250") {
            finish(true, "250_accepted");
          } else {
            // 550/551/553 (unknown user) and every other code count as not deliverable.
            finish(false, reply.trim().slice(0, 100));
          }
        } else {
          // Unexpected reply before RCPT TO: fail now instead of waiting for a timeout.
          finish(false, reply.trim().slice(0, 100));
        }
      };

      conn.setTimeout(7_000);
      conn.on("timeout", () => finish(false, "socket_timeout"));
      conn.on("error", () => finish(false, "connection_error"));
      conn.on("close", () => finish(false, "connection_closed"));
      conn.on("data", (chunk) => {
        pending += chunk.toString();
        let eol = pending.indexOf("\n");
        while (eol !== -1 && !settled) {
          const line = pending.slice(0, eol).replace(/\r$/, "");
          pending = pending.slice(eol + 1);
          // "NNN-" marks a continuation line; the reply ends on "NNN ".
          if (line !== "" && !/^\d{3}-/.test(line)) {
            onReply(line);
          }
          eol = pending.indexOf("\n");
        }
      });

      conn.connect(25, mxHost);
    };

    Promise.resolve()
      .then(() => dns.resolveMx(domain))
      .then((mxRecords) => {
        if (settled) return;
        if (mxRecords.length === 0) {
          finish(false, "no_mx");
          return;
        }
        // Lowest priority number = most preferred exchange.
        const [preferred] = [...mxRecords].sort((a, b) => a.priority - b.priority);
        runHandshake(preferred.exchange);
      })
      .catch((err) => finish(false, String(err).slice(0, 100)));
  });
}
|
|
@@ -0,0 +1,338 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* 7-Layer Email Verification
|
| 3 |
+
*
|
| 4 |
+
* Layer 1: RFC 5322 format check (instant, free)
|
| 5 |
+
* Layer 2: Domain ownership — email domain = company domain (instant, free)
|
| 6 |
+
* Layer 3: MX record lookup (free, DNS)
|
| 7 |
+
* Layer 4: Catch-all detection (free SMTP probe first, Reoon API as fallback)
|
| 8 |
+
* Layer 5: SMTP handshake — ask mail server "does this user exist?" (free, direct)
|
| 9 |
+
* Layer 6: Disposable email check (free, local list)
|
| 10 |
+
* Layer 7: Provider confidence score (Hunter/Snov score)
|
| 11 |
+
*
|
| 12 |
+
* Each layer records its outcome (a boolean, null when it could not be determined, or a 0-100 score for layer 7). Final status is computed from all 7.
|
| 13 |
+
*/
|
| 14 |
+
|
| 15 |
+
import dns from "dns/promises";
|
| 16 |
+
import net from "net";
|
| 17 |
+
import axios from "axios";
|
| 18 |
+
import { getEnv } from "../../shared/config/env";
|
| 19 |
+
import { logger } from "../../shared/utils/logger";
|
| 20 |
+
|
| 21 |
+
/** Final classification assigned to an email after verification. */
export type EmailStatus =
  /** All layers pass. */
  | "verified_deliverable"
  /** Valid, but the domain is catch-all. */
  | "verified_catch_all"
  /** Pattern-generated address confirmed over SMTP. */
  | "pattern_smtp_confirmed"
  /** Some layers pass, some are unknown. */
  | "uncertain"
  /** Hard failure. */
  | "rejected_invalid";

/** Outcome of a verification run, including the per-layer results. */
export interface VerificationResult {
  email: string;
  status: EmailStatus;
  layers: {
    format: boolean;
    domainMatch: boolean;
    mxRecord: boolean;
    /** `null` means it could not be determined. */
    catchAll: boolean | null;
    smtpHandshake: boolean | null;
    /** `true` means the domain IS disposable (bad). */
    disposable: boolean;
    /** 0-100, as reported by Hunter/Snov. */
    providerConfidence: number;
  };
  /** 0-100, computed from the layers. */
  overallConfidence: number;
}
|
| 42 |
+
|
| 43 |
+
/**
 * Runs the verification layers on an email and classifies the result.
 *
 * The free, local checks (format, domain ownership, disposable list) run
 * before any network I/O. A hard failure in one of them returns
 * "rejected_invalid" without DNS lookups, SMTP connections or Reoon credits.
 * In that case the layers that were not reached keep their initial values
 * (`false` or `null`), meaning "not checked".
 *
 * @param email              Address to verify.
 * @param companyDomain      Domain the address is expected to belong to.
 * @param providerConfidence 0-100 score from the provider; 0 when none is available.
 * @returns The final status, per-layer results and overall confidence.
 */
export async function verifyEmailDeep(
  email: string,
  companyDomain: string,
  providerConfidence: number = 0
): Promise<VerificationResult> {
  const layers = {
    format: false,
    domainMatch: false,
    mxRecord: false,
    catchAll: null as boolean | null,
    smtpHandshake: null as boolean | null,
    disposable: false,
    providerConfidence,
  };

  const emailDomain = email.split("@")[1]?.toLowerCase();
  if (!emailDomain) {
    return makeResult(email, "rejected_invalid", layers, 0);
  }

  // ── Layer 1: Format check ──────────────────────────────────
  layers.format = isValidFormat(email);
  if (!layers.format) {
    return makeResult(email, "rejected_invalid", layers, 0);
  }

  // ── Layer 2: Domain ownership ──────────────────────────────
  layers.domainMatch = isDomainMatch(emailDomain, companyDomain);
  if (!layers.domainMatch) {
    logger.warn({ email, emailDomain, companyDomain }, "Domain mismatch — rejecting");
    return makeResult(email, "rejected_invalid", layers, 0);
  }

  // ── Layer 6: Disposable check ──────────────────────────────
  // Runs before the network layers: it is a local lookup, and a disposable
  // domain is rejected whatever MX, catch-all or SMTP would report.
  layers.disposable = isDisposable(emailDomain);
  if (layers.disposable) {
    return makeResult(email, "rejected_invalid", layers, 0);
  }

  // ── Layer 3: MX record ────────────────────────────────────
  layers.mxRecord = await hasMxRecord(emailDomain);
  if (!layers.mxRecord) {
    return makeResult(email, "rejected_invalid", layers, 5);
  }

  // ── Layer 4: Catch-all detection ───────────────────────────
  layers.catchAll = await checkCatchAll(emailDomain);

  // ── Layer 5: SMTP handshake ─────────────────────────────────
  layers.smtpHandshake = await smtpHandshake(email, emailDomain);

  // ── Layer 7: Provider confidence ──────────────────────────
  // Already stored in `layers` from the caller-supplied score.

  // ── Compute final status ───────────────────────────────────
  return computeFinalStatus(email, layers);
}
|
| 103 |
+
|
| 104 |
+
// ─── Layer 1: RFC 5322 Format ────────────────────────────────
|
| 105 |
+
|
| 106 |
+
/**
 * Syntax check for an email address.
 *
 * Passes when the address matches the pattern below, is at most 254
 * characters long, and its local part is at most 64 characters, neither
 * starts nor ends with a dot, and contains no consecutive dots.
 */
function isValidFormat(email: string): boolean {
  const addressShape = /^[a-zA-Z0-9.!#$%&'*+/=?^_`{|}~-]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*$/;
  const localPart = email.split("@")[0];

  return (
    addressShape.test(email) &&
    email.length <= 254 &&
    localPart.length <= 64 &&
    !localPart.startsWith(".") &&
    !localPart.endsWith(".") &&
    !localPart.includes("..")
  );
}
|
| 120 |
+
|
| 121 |
+
// ─── Layer 2: Domain Match ──────────────────────────────────
|
| 122 |
+
|
| 123 |
+
/**
 * Decides whether an email's domain belongs to the company domain.
 *
 * Both inputs are trimmed, lowercased and stripped of a leading "www."
 * before comparison. Trimming happens first so that surrounding whitespace
 * cannot hide the "www." prefix.
 *
 * @returns true for an exact match or when the email domain is a subdomain
 *          of the company domain (mail.company.com → company.com); false
 *          otherwise, including when either domain is empty.
 */
function isDomainMatch(emailDomain: string, companyDomain: string): boolean {
  const normalize = (d: string) => d.trim().toLowerCase().replace(/^www\./, "");
  const eDomain = normalize(emailDomain);
  const cDomain = normalize(companyDomain);

  // An empty domain carries no ownership information; never treat it as a match.
  if (!eDomain || !cDomain) return false;

  // Exact match
  if (eDomain === cDomain) return true;

  // Subdomain match; the leading dot keeps "notcompany.com" from matching "company.com".
  return eDomain.endsWith(`.${cDomain}`);
}
|
| 138 |
+
|
| 139 |
+
// ─── Layer 3: MX Record ─────────────────────────────────────
|
| 140 |
+
|
| 141 |
+
/**
 * Reports whether DNS returns at least one MX record for `domain`.
 * Any lookup failure counts as "no MX record".
 */
async function hasMxRecord(domain: string): Promise<boolean> {
  let exchanges;
  try {
    exchanges = await dns.resolveMx(domain);
  } catch {
    return false;
  }
  return exchanges.length > 0;
}
|
| 149 |
+
|
| 150 |
+
// ─── Layer 4: Catch-All Detection (CREDIT-OPTIMIZED) ────────
// The free SMTP probe is tried first. Reoon is consulted only when the probe
// is inconclusive, which keeps the limited Reoon credits (only 20/day) for
// the cases that need them.

const REOON_DAILY_LIMIT = 18; // keep 2 credits as buffer
let _reoonResetDate = new Date().toDateString();
let _reoonUsedToday = 0;

/**
 * Determines whether `domain` accepts mail for any address.
 *
 * @returns `true` for catch-all, `false` for not catch-all, and `null` when
 *          neither the SMTP probe nor Reoon produced an answer (including
 *          when the in-process daily Reoon budget is used up).
 */
async function checkCatchAll(domain: string): Promise<boolean | null> {
  // ── Attempt 1: FREE SMTP probe with a gibberish recipient ──
  try {
    const probeAddress = `xqz7k2m4n_test_${Date.now() % 10000}@${domain}`;
    const probeVerdict = await smtpHandshake(probeAddress, domain);

    switch (probeVerdict) {
      case true:
        // The server accepted a nonsense mailbox.
        logger.debug({ domain }, "Catch-all detected via FREE SMTP probe (Reoon credit saved)");
        return true;
      case false:
        // The server rejected the nonsense mailbox.
        logger.debug({ domain }, "NOT catch-all — confirmed via FREE SMTP probe");
        return false;
      default:
        // null → inconclusive; continue with Reoon.
        break;
    }
  } catch {
    // Probe failed; continue with Reoon.
  }

  // ── Attempt 2: Reoon API ───────────────────────────────────
  // The usage counter lives in this process and restarts when the date changes.
  const todayStamp = new Date().toDateString();
  if (todayStamp !== _reoonResetDate) {
    _reoonResetDate = todayStamp;
    _reoonUsedToday = 0;
  }

  if (_reoonUsedToday >= REOON_DAILY_LIMIT) {
    logger.warn({ domain, used: _reoonUsedToday }, "Reoon daily limit reached — skipping");
    return null;
  }

  try {
    const settings = getEnv();
    // Counted before the request, so a failed call still consumes budget.
    _reoonUsedToday += 1;

    // NOTE(review): this treats status "valid" for a made-up mailbox as
    // catch-all — confirm against Reoon's quick-mode semantics.
    const reply = await axios.get("https://emailverifier.reoon.com/api/v1/verify", {
      params: {
        email: `definitely_not_real_${Date.now()}@${domain}`,
        key: settings.REOON_API_KEY,
        mode: "quick",
      },
      timeout: 8_000,
    });

    logger.debug({ domain, reoonUsed: _reoonUsedToday }, "Reoon credit used for catch-all check");
    return reply.data?.status === "valid";
  } catch {
    return null;
  }
}
|
| 214 |
+
|
| 215 |
+
// ─── Layer 5: SMTP Handshake ────────────────────────────────
|
| 216 |
+
|
| 217 |
+
/**
 * Asks the domain's highest-priority MX host whether it accepts `email`
 * (EHLO → MAIL FROM → RCPT TO on port 25). No message is sent.
 *
 * SMTP replies are parsed line by line. A reply may span several lines
 * ("250-…" continuation lines, then a final "250 …") and may arrive split
 * across TCP chunks, so only the final line of each reply advances the
 * conversation. Otherwise the MAIL FROM acknowledgement could be misread as
 * the RCPT TO verdict.
 *
 * @param email  Address passed to RCPT TO.
 * @param domain Domain whose MX records are resolved.
 * @returns `true` when RCPT TO is answered with 250; `false` when it is
 *          answered with a 5xx code; `null` when no verdict could be
 *          obtained (DNS failure, no MX, connection error, timeout,
 *          unexpected reply, or a non-5xx RCPT reply such as 4xx).
 */
async function smtpHandshake(email: string, domain: string): Promise<boolean | null> {
  // CR/LF or angle brackets in the address would let it inject SMTP commands.
  if (/[\r\n<>]/.test(email)) return null;

  let mailServer: string;
  try {
    const mxRecords = await dns.resolveMx(domain);
    if (!mxRecords.length) return null;
    // Pick highest priority (lowest number)
    mailServer = [...mxRecords].sort((a, b) => a.priority - b.priority)[0].exchange;
  } catch {
    return null; // can't determine
  }

  return new Promise<boolean | null>((resolve) => {
    const socket = new net.Socket();
    let step = 0;
    let pending = "";
    let settled = false;

    // Single exit point: resolves once, stops the timer and releases the socket.
    const finish = (verdict: boolean | null): void => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      socket.destroy();
      resolve(verdict);
    };

    const timer = setTimeout(() => finish(null), 10_000);

    // Handles the final line of one complete SMTP reply.
    const onReply = (reply: string): void => {
      const code = reply.slice(0, 3);
      if (step === 0 && code === "220") {
        // Server greeting → send EHLO
        socket.write("EHLO verify.local\r\n");
        step = 1;
      } else if (step === 1 && code === "250") {
        // EHLO accepted → send MAIL FROM
        socket.write("MAIL FROM:<verify@verify.local>\r\n");
        step = 2;
      } else if (step === 2 && code === "250") {
        // MAIL FROM accepted → send RCPT TO (the actual check)
        socket.write(`RCPT TO:<${email}>\r\n`);
        step = 3;
      } else if (step === 3) {
        socket.write("QUIT\r\n");
        if (code === "250") {
          finish(true); // recipient accepted
        } else if (code.startsWith("5")) {
          finish(false); // permanent rejection, e.g. 550 unknown user
        } else {
          finish(null); // not a permanent answer (e.g. 4xx) → undetermined
        }
      } else {
        // Unexpected reply before RCPT TO: no verdict; don't wait for the timer.
        finish(null);
      }
    };

    socket.on("data", (chunk) => {
      pending += chunk.toString();
      let eol = pending.indexOf("\n");
      while (eol !== -1 && !settled) {
        const line = pending.slice(0, eol).replace(/\r$/, "");
        pending = pending.slice(eol + 1);
        // "NNN-" marks a continuation line; the reply ends on "NNN ".
        if (line !== "" && !/^\d{3}-/.test(line)) {
          onReply(line);
        }
        eol = pending.indexOf("\n");
      }
    });
    socket.on("error", () => finish(null)); // can't determine
    socket.on("close", () => finish(null)); // closed before a verdict

    socket.connect(25, mailServer);
  });
}
|
| 277 |
+
|
| 278 |
+
// ─── Layer 6: Disposable Email ──────────────────────────────
|
| 279 |
+
|
| 280 |
+
// Local list of known throwaway-mail domains (lowercase).
const DISPOSABLE_DOMAINS = new Set<string>([
  "mailinator.com",
  "tempmail.com",
  "throwaway.email",
  "guerrillamail.com",
  "guerrillamail.info",
  "yopmail.com",
  "trashmail.com",
  "maildrop.cc",
  "10minutemail.com",
  "temp-mail.org",
  "fakeinbox.com",
  "sharklasers.com",
  "guerrillamail.net",
  "grr.la",
  "dispostable.com",
  "tempr.email",
  "mohmal.com",
  "burpcollaborator.net",
  "mailnesia.com",
]);

/** Case-insensitive exact-match lookup of `domain` in the disposable list. */
function isDisposable(domain: string): boolean {
  const needle = domain.toLowerCase();
  return DISPOSABLE_DOMAINS.has(needle);
}
|
| 291 |
+
|
| 292 |
+
// ─── Final Status Computation ────────────────────────────────
|
| 293 |
+
|
| 294 |
+
/**
 * Maps the per-layer results to a final status and confidence.
 * Rules are evaluated top to bottom; the first one that matches wins.
 */
function computeFinalStatus(
  email: string,
  layers: VerificationResult["layers"]
): VerificationResult {
  const {
    format,
    domainMatch,
    mxRecord,
    catchAll,
    smtpHandshake: smtpVerdict,
    disposable,
    providerConfidence,
  } = layers;
  const smtpConfirmed = smtpVerdict === true;

  // Rule 1: every layer passes, SMTP included, and the domain is not known to be catch-all.
  if (format && domainMatch && mxRecord && smtpConfirmed && !disposable && !catchAll) {
    // 60 base points plus 35% of the provider score, or a flat 15 without one; capped at 95.
    const providerBonus = providerConfidence > 0 ? Math.round(providerConfidence * 0.35) : 15;
    return makeResult(email, "verified_deliverable", layers, Math.min(95, 60 + providerBonus));
  }

  // Rule 2: catch-all domain — uncertain but not invalid.
  if (catchAll === true && mxRecord) {
    return makeResult(email, "verified_catch_all", layers, 45);
  }

  // Rule 3: SMTP confirmed but no provider data.
  // NOTE(review): when format, domainMatch and mxRecord are true and
  // disposable is false, an SMTP-confirmed address is already claimed by
  // rule 1 or rule 2, so this rule only fires for inputs that did not pass
  // those gates. Confirm whether that is the intended ordering.
  if (smtpConfirmed && providerConfidence === 0) {
    return makeResult(email, "pattern_smtp_confirmed", layers, 70);
  }

  // Rule 4: MX exists, provider score is good, SMTP unknown.
  if (mxRecord && providerConfidence >= 70 && smtpVerdict === null) {
    return makeResult(email, "verified_deliverable", layers, providerConfidence);
  }

  // Rule 5: MX exists but everything else is uncertain.
  if (mxRecord && !disposable) {
    return makeResult(email, "uncertain", layers, 30);
  }

  return makeResult(email, "rejected_invalid", layers, 0);
}
|
| 330 |
+
|
| 331 |
+
/** Bundles the four fields into a `VerificationResult`; `layers` is kept by reference. */
function makeResult(
  email: string,
  status: EmailStatus,
  layers: VerificationResult["layers"],
  overallConfidence: number
): VerificationResult {
  const outcome: VerificationResult = {
    email,
    status,
    layers,
    overallConfidence,
  };
  return outcome;
}
|
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { getSupabaseClient } from "../../shared/supabase/client";
|
| 2 |
+
import { IcpConfig } from "../../shared/supabase/schema";
|
| 3 |
+
import { ScrapedCompany } from "./web-scraper";
|
| 4 |
+
import { logger } from "../../shared/utils/logger";
|
| 5 |
+
|
| 6 |
+
/** Outcome of one filter gate. */
export interface FilterResult {
  /** True when the gate recorded no failure. */
  passed: boolean;
  /** Reason codes for each failed check. */
  failReasons: string[];
  /** Codes for each check or signal that passed. */
  passedSignals: string[];
  /** 0-4 — how many growth signals detected. */
  signalScore: number;
}
|
| 12 |
+
|
| 13 |
+
/**
 * Fetches the ICP config row flagged `is_active` from the `icp_config` table.
 * If the query reports an error or returns no data, the failure is logged
 * and the built-in DEFAULT_ICP is returned instead.
 */
export async function loadIcpConfig(): Promise<IcpConfig> {
  const activeConfigQuery = getSupabaseClient()
    .from("icp_config")
    .select("*")
    .eq("is_active", true)
    .single();

  const { data, error } = await activeConfigQuery;

  if (!error && data) {
    return data as IcpConfig;
  }

  logger.error({ error }, "Failed to load ICP config — using defaults");
  return DEFAULT_ICP;
}
|
| 30 |
+
|
| 31 |
+
/**
 * FILTER GATE 1 — Hard rules only.
 *
 * Every rule is evaluated and all failures are collected, so the result
 * lists each reason a company was rejected rather than only the first.
 *
 * Rules:
 *  - Employee count: fails only when a count is known and below
 *    `icp.min_employees`; an unknown count passes.
 *  - Industry: fails when the company's industry contains any entry of
 *    `icp.exclude_industries`, compared case-insensitively. Blank entries
 *    are ignored, because an empty string is a substring of everything and
 *    would exclude every company.
 *  - Website: fails without a domain or with fewer than 100 characters of
 *    scraped text; missing text counts as empty.
 *
 * @param company Scraped company data.
 * @param icp     Active ICP configuration.
 * @param region  Accepted for interface compatibility; not consulted by any rule here.
 * @returns `passed` is true when no rule failed; `signalScore` is always 0 for this gate.
 */
export function applyHardFilters(
  company: ScrapedCompany,
  icp: IcpConfig,
  region: string
): FilterResult {
  const fail: string[] = [];
  const pass: string[] = [];

  // ── Employee count ───────────────────────────────────────────
  if (company.employeeCount !== null && company.employeeCount < icp.min_employees) {
    fail.push(`employees_too_few:${company.employeeCount}`);
  } else {
    pass.push("employee_count_ok");
  }

  // ── Industry check ───────────────────────────────────────────
  const industryLower = (company.industry ?? "").toLowerCase();
  const inExcluded = icp.exclude_industries
    .map((ex) => ex.trim().toLowerCase())
    .some((ex) => ex.length > 0 && industryLower.includes(ex));
  if (inExcluded) {
    fail.push(`excluded_industry:${company.industry}`);
  } else {
    pass.push("industry_ok");
  }

  // ── Website exists ───────────────────────────────────────────
  const websiteText = company.websiteText ?? "";
  if (!company.domain || websiteText.length < 100) {
    fail.push("no_valid_website");
  } else {
    pass.push("website_ok");
  }

  return {
    passed: fail.length === 0,
    failReasons: fail,
    passedSignals: pass,
    signalScore: 0,
  };
}
|
| 73 |
+
|
| 74 |
+
/**
 * FILTER GATE 2 — Growth & AI signal check.
 * Company needs ≥ 2 positive signals to proceed.
 *
 * Signals (one point each, 0-4 in total):
 *  - at least one job posting flagged `hasAiSignal`;
 *  - at least one tech-stack entry listed in `icp.tech_signals`
 *    (case-insensitive on both sides);
 *  - at least two ICP keywords found in the website text. Keywords of three
 *    characters or fewer (e.g. "AI") must appear as a whole word, since as a
 *    substring they would match words like "email". Longer keywords are
 *    matched as substrings. Blank keywords are ignored;
 *  - three or more job postings in total.
 *
 * Missing `jobPostings`, `techStack` or `websiteText` are treated as empty.
 */
export function applySignalFilters(
  company: ScrapedCompany,
  icp: IcpConfig
): FilterResult {
  const pass: string[] = [];
  const fail: string[] = [];

  const jobPostings = company.jobPostings ?? [];
  const techStack = company.techStack ?? [];

  // ── AI-related job postings ──────────────────────────────────
  const aiJobs = jobPostings.filter((j) => j.hasAiSignal);
  if (aiJobs.length > 0) pass.push(`ai_job_postings:${aiJobs.length}`);

  // ── Tech stack signals ───────────────────────────────────────
  const wantedTech = new Set(icp.tech_signals.map((s) => s.trim().toLowerCase()));
  const stackSignals = techStack.filter((t) => wantedTech.has(t.toLowerCase()));
  if (stackSignals.length > 0) pass.push(`tech_stack:${stackSignals.join(",")}`);

  // ── ICP keywords in website text ────────────────────────────
  const textLower = (company.websiteText ?? "").toLowerCase();
  const escapeRegExp = (s: string) => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const kwHits = icp.keywords.filter((kw) => {
    const needle = kw.trim().toLowerCase();
    if (!needle) return false;
    if (needle.length <= 3) {
      // Short keyword: require that it is not embedded in a longer word.
      return new RegExp(`(?<![a-z0-9])${escapeRegExp(needle)}(?![a-z0-9])`).test(textLower);
    }
    return textLower.includes(needle);
  });
  if (kwHits.length >= 2) pass.push(`keyword_hits:${kwHits.join(",")}`);

  // ── Active job hiring (general) ──────────────────────────────
  if (jobPostings.length >= 3) pass.push(`active_hiring:${jobPostings.length}`);

  const signalScore = pass.length;

  if (signalScore < 2) {
    fail.push(`insufficient_signals:${signalScore}`);
    logger.debug({ domain: company.domain, signalScore }, "Gate 2 failed: low signals");
  }

  return {
    passed: fail.length === 0,
    failReasons: fail,
    passedSignals: pass,
    signalScore,
  };
}
|
| 117 |
+
|
| 118 |
+
// ─── Default ICP (if DB read fails) ─────────────────────────
|
| 119 |
+
|
| 120 |
+
// Fallback ICP returned by loadIcpConfig when the database read fails.
// Both timestamps are taken when this module is evaluated.
const DEFAULT_ICP: IcpConfig = {
  id: "default",
  name: "default",
  min_employees: 50,
  industries: [
    "technology",
    "manufacturing",
    "logistics",
    "healthcare",
    "finance",
  ],
  exclude_industries: [
    "government",
    "non-profit",
    "education",
  ],
  geographies: [
    "US",
    "UK",
    "AU",
    "UAE",
    "SA",
  ],
  keywords: [
    "automation",
    "digital transformation",
    "AI",
    "operations",
  ],
  tech_signals: [
    "salesforce",
    "hubspot",
    "sap",
    "legacy_erp",
  ],
  score_threshold: 70,
  is_active: true,
  created_at: new Date().toISOString(),
  updated_at: new Date().toISOString(),
};
|
|
@@ -0,0 +1,205 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Personal LinkedIn Finder
|
| 3 |
+
*
|
| 4 |
+
* Finds linkedin.com/in/person-name (personal profile)
|
| 5 |
+
* NOT linkedin.com/company/ (company page — already have that)
|
| 6 |
+
*
|
| 7 |
+
* Methods in priority order:
|
| 8 |
+
* 1. Google search: "name" "company" site:linkedin.com/in
|
| 9 |
+
* 2. Company's LinkedIn people page scrape
|
| 10 |
+
* 3. Hunter.io linkedin_url field (sometimes returned)
|
| 11 |
+
*
|
| 12 |
+
* MANDATORY — every qualified lead must have a LinkedIn attempt.
|
| 13 |
+
*/
|
| 14 |
+
|
| 15 |
+
import { searchCompanies, SerperResult } from "../providers/serper";
|
| 16 |
+
import { serperLimiter } from "../../shared/utils/rate-limiter";
|
| 17 |
+
import { logger } from "../../shared/utils/logger";
|
| 18 |
+
import axios from "axios";
|
| 19 |
+
import { getEnv } from "../../shared/config/env";
|
| 20 |
+
|
| 21 |
+
/** A personal LinkedIn profile match for a decision maker. */
export interface PersonalLinkedIn {
  /** Profile URL, e.g. linkedin.com/in/john-smith-abc123 */
  url: string;
  /** How sure we are this is the right person. */
  confidence: number;
  /** Lookup method that produced the match. */
  source: "google_search" | "company_people_page" | "hunter_field";
  /** URL format is valid and accessible. */
  verified: boolean;
}
|
| 27 |
+
|
| 28 |
+
/**
 * Looks up the personal LinkedIn profile of a decision maker.
 *
 * Tries the Google search first. If that finds nothing and a company
 * LinkedIn URL is available, tries the company people-page search. Returns
 * `null` when neither produces a match; that is a normal "not found"
 * outcome, not an error.
 */
export async function findPersonalLinkedIn(
  fullName: string,
  companyName: string,
  companyDomain: string,
  companyLinkedInUrl: string | null
): Promise<PersonalLinkedIn | null> {
  // Method 1: Google search (highest accuracy)
  const viaGoogle = await searchViaGoogle(fullName, companyName);
  if (viaGoogle) {
    return viaGoogle;
  }

  // Method 2: company LinkedIn people page, only when the company URL is known
  const viaPeoplePage = companyLinkedInUrl
    ? await searchViaPeoplePage(fullName, companyLinkedInUrl)
    : null;
  if (viaPeoplePage) {
    return viaPeoplePage;
  }

  logger.info({ fullName, companyName }, "LinkedIn personal not found — all methods tried");
  return null;
}
|
| 51 |
+
|
| 52 |
+
// ─── Method 1: Google Search ─────────────────────────────────
|
| 53 |
+
|
| 54 |
+
/**
 * Searches Google (via Serper) for `"name" "company" site:linkedin.com/in`
 * and returns the first personal-profile result whose title or snippet
 * mentions the person.
 *
 * Name check: among the name parts longer than two characters, both the
 * first and the last must appear in the result text. Requiring only one
 * part would accept a profile for "John Doe" when looking for "John Smith".
 * A name with no part longer than two characters never matches.
 *
 * Confidence is 0.92 when a company-name word (longer than three characters)
 * also appears in the result text, and 0.70 otherwise.
 *
 * @returns The match, or `null` when nothing qualifies or the request fails
 *          (failures are logged, not thrown).
 */
async function searchViaGoogle(
  fullName: string,
  companyName: string
): Promise<PersonalLinkedIn | null> {
  try {
    await serperLimiter.consume("serper");

    const env = getEnv();
    // A double quote inside a term would end the exact-phrase operator early.
    const nameTerm = fullName.replace(/"/g, "");
    const companyTerm = companyName.replace(/"/g, "");
    const query = `"${nameTerm}" "${companyTerm}" site:linkedin.com/in`;

    const response = await axios.post(
      "https://google.serper.dev/search",
      { q: query, num: 5 },
      {
        headers: {
          "X-API-KEY": env.SERPER_API_KEY,
          "Content-Type": "application/json",
        },
        timeout: 8_000,
      }
    );

    const organic = response.data?.organic ?? [];

    // Both inputs are fixed for the whole result list, so tokenize them once.
    const nameParts = fullName
      .toLowerCase()
      .split(/\s+/)
      .filter((part) => part.length > 2);
    const requiredNameParts =
      nameParts.length > 0 ? [nameParts[0], nameParts[nameParts.length - 1]] : [];
    const companyParts = companyName
      .toLowerCase()
      .split(/\s+/)
      .filter((part) => part.length > 3);

    for (const result of organic) {
      const url = result.link;
      if (typeof url !== "string" || !isLinkedInPersonalUrl(url)) continue;

      const snippet = (result.snippet ?? "").toLowerCase();
      const title = (result.title ?? "").toLowerCase();
      const combined = `${snippet} ${title}`;

      const hasName =
        requiredNameParts.length > 0 &&
        requiredNameParts.every((part) => combined.includes(part));
      const hasCompany = companyParts.some((part) => combined.includes(part));

      if (hasName) {
        return {
          url: cleanLinkedInUrl(url),
          confidence: hasCompany ? 0.92 : 0.70,
          source: "google_search",
          verified: true,
        };
      }
    }

    return null;
  } catch (err) {
    logger.warn({ fullName, err }, "Google LinkedIn search failed");
    return null;
  }
}
|
| 109 |
+
|
| 110 |
+
// ─── Method 2: Company People Page ──────────────────────────
|
| 111 |
+
|
| 112 |
+
/**
 * Fallback lookup: Google search for the person's name restricted to
 * linkedin.com/in, with the company's LinkedIn slug as an extra keyword.
 *
 * The first personal-profile URL in the results is returned as-is; the
 * title/snippet is not checked against the name, hence the fixed 0.75
 * confidence.
 *
 * @param fullName            Person's full name.
 * @param companyLinkedInUrl  Company LinkedIn URL containing "/company/<slug>".
 * @returns A profile, or null when the slug cannot be extracted, nothing
 *          matches, or the request fails (failures are logged, never thrown).
 */
async function searchViaPeoplePage(
  fullName: string,
  companyLinkedInUrl: string
): Promise<PersonalLinkedIn | null> {
  try {
    await serperLimiter.consume("serper");

    const env = getEnv();
    // The slug is the path segment right after "/company/"
    const companySlug = companyLinkedInUrl.match(/company\/([^/?]+)/)?.[1];
    if (!companySlug) return null;

    const query = `"${fullName}" site:linkedin.com/in ${companySlug}`;

    const response = await axios.post(
      "https://google.serper.dev/search",
      { q: query, num: 3 },
      {
        headers: {
          "X-API-KEY": env.SERPER_API_KEY,
          "Content-Type": "application/json",
        },
        timeout: 8_000,
      }
    );

    const organic = Array.isArray(response.data?.organic) ? response.data.organic : [];

    for (const result of organic) {
      const link = result?.link;
      if (typeof link === "string" && isLinkedInPersonalUrl(link)) {
        return {
          url: cleanLinkedInUrl(link),
          confidence: 0.75,
          source: "company_people_page",
          verified: true,
        };
      }
    }

    return null;
  } catch (err) {
    // Best-effort lookup: report the failure instead of hiding it, then give up.
    logger.warn({ fullName, companyLinkedInUrl, err }, "Company people-page LinkedIn search failed");
    return null;
  }
}
|
| 156 |
+
|
| 157 |
+
// ─── Helpers ─────────────────────────────────────────────────
|
| 158 |
+
|
| 159 |
+
/**
 * Tell whether a URL points at a personal LinkedIn profile (linkedin.com/in/<slug>).
 *
 * The host must be linkedin.com or one of its subdomains (www, uk, ...),
 * so look-alike hosts and URLs that merely carry "linkedin.com/in/" in a
 * query string are rejected. /company/, /jobs/ and other paths are rejected.
 *
 * @param url  Candidate URL; non-string values yield false.
 * @returns true only for personal-profile URLs.
 */
function isLinkedInPersonalUrl(url: string): boolean {
  if (typeof url !== "string") return false;
  // The scheme is optional so bare "linkedin.com/in/x" inputs keep working.
  return /^(?:https?:\/\/)?(?:[a-z0-9-]+\.)*linkedin\.com\/in\/[^/?#\s]+/i.test(url.trim());
}
|
| 163 |
+
|
| 164 |
+
/**
 * Reduce a LinkedIn profile URL to "<scheme>://<host>/in/<slug>", dropping
 * trailing slashes, query parameters and fragments.
 *
 * The slug is everything up to the next "/", "?", "#", "&" or whitespace,
 * so slugs with underscores or percent-encoded characters are kept whole
 * instead of being cut at the first such character. Country subdomains
 * (e.g. uk.linkedin.com) are handled too.
 *
 * @param url  URL to clean.
 * @returns The cleaned URL, or the input unchanged when it holds no
 *          LinkedIn profile URL.
 */
function cleanLinkedInUrl(url: string): string {
  const match = url.match(/(https?:\/\/(?:[a-z0-9-]+\.)*linkedin\.com\/in\/[^/?#&\s]+)/i);
  return match ? match[1] : url;
}
|
| 169 |
+
|
| 170 |
+
/**
 * Find LinkedIn profiles for several decision makers of one company.
 *
 * Only the first 5 people are looked up, to conserve search API calls.
 * Lookups run sequentially with a 1.5 s pause between them; there is no
 * pause after the last one.
 *
 * @param people              Candidates; only `fullName` is used for the lookup.
 * @param companyName         Company name passed to each lookup.
 * @param companyDomain       Passed through to findPersonalLinkedIn.
 * @param companyLinkedInUrl  Company LinkedIn URL, or null.
 * @returns Map keyed by `fullName` holding only the people that were found.
 *          People sharing a full name overwrite each other.
 */
export async function batchFindLinkedIn(
  people: { fullName: string; title: string }[],
  companyName: string,
  companyDomain: string,
  companyLinkedInUrl: string | null
): Promise<Map<string, PersonalLinkedIn>> {
  const results = new Map<string, PersonalLinkedIn>();
  const maxLookups = Math.min(people.length, 5);

  for (let i = 0; i < maxLookups; i++) {
    const person = people[i];
    const result = await findPersonalLinkedIn(
      person.fullName,
      companyName,
      companyDomain,
      companyLinkedInUrl
    );

    if (result) {
      results.set(person.fullName, result);
    }

    // Be polite between searches, but don't stall the caller after the final one.
    if (i < maxLookups - 1) {
      await new Promise((r) => setTimeout(r, 1500));
    }
  }

  logger.info(
    { company: companyName, found: results.size, attempted: maxLookups },
    "LinkedIn personal batch complete"
  );

  return results;
}
|
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { chromium, Browser, BrowserContext } from "playwright";
|
| 2 |
+
import { playwrightLimiter } from "../../shared/utils/rate-limiter";
|
| 3 |
+
import { logger } from "../../shared/utils/logger";
|
| 4 |
+
|
| 5 |
+
/**
 * Company facts gathered from a public LinkedIn company page.
 * Every scalar is null when it could not be read from the page.
 */
export interface LinkedInCompanyData {
  /** Company name. */
  name: string | null;
  /** Company description. */
  description: string | null;
  /** Head count as a single number. */
  employeeCount: number | null;
  /** Head-count bracket formatted as "<low>-<high>". */
  employeeRange: string | null;
  /** Industry label. */
  industry: string | null;
  /** Headquarters location text. */
  headquarters: string | null;
  /** Company website. NOTE(review): scrapeLinkedInCompany in this file never sets it. */
  website: string | null;
  /** Recent post texts. NOTE(review): scrapeLinkedInCompany in this file leaves this empty. */
  recentPosts: string[];
  /** People listed on the company's people page. */
  decisionMakers: LinkedInPerson[];
}
|
| 16 |
+
|
| 17 |
+
/** One person read from a company's public LinkedIn people page. */
export interface LinkedInPerson {
  /** Name as it appears on the page. */
  fullName: string;
  /** Job title line that follows the name on the page. */
  title: string;
  /**
   * URL associated with the person. extractDecisionMakers fills this with
   * the company's people-page URL, not a personal profile URL.
   */
  linkedinUrl: string;
  /** True when the title matches one of DECISION_MAKER_TITLES. */
  isDecisionMaker: boolean;
}
|
| 23 |
+
|
| 24 |
+
/**
 * Lower-case title fragments that mark a person as a decision maker.
 * Compared against lower-cased job titles.
 */
const DECISION_MAKER_TITLES = [
  // Chief executive / founders
  "ceo",
  "chief executive",
  "founder",
  "co-founder",
  "cofounder",
  // Technology and operations chiefs
  "cto",
  "chief technology",
  "coo",
  "chief operating",
  // Senior management
  "vp",
  "vice president",
  "director",
  "head of",
  "managing director",
  "general manager",
  "president",
];
|
| 30 |
+
|
| 31 |
+
/**
 * Scrape a public LinkedIn company page (no login) with headless Chromium.
 *
 * Visits "<company>/about/" for company facts and "<company>/people/" for
 * the people list. Scraping is best-effort: a navigation or parsing failure
 * is logged and whatever was collected so far is returned.
 *
 * Employee figures:
 *  - `employeeRange` comes from text like "51-200 employees".
 *  - `employeeCount` comes from text like "134 employees" found outside
 *    that range, so the range's upper bound is never taken as the exact
 *    count. Follower counts are ignored.
 *
 * @param linkedinUrl  Company page URL (trailing slashes are tolerated).
 * @returns Collected data; fields that could not be read stay null/empty.
 * @throws If the rate limiter rejects, or the browser, context or page
 *         cannot be created.
 */
export async function scrapeLinkedInCompany(
  linkedinUrl: string
): Promise<LinkedInCompanyData> {
  await playwrightLimiter.consume("linkedin");

  const result: LinkedInCompanyData = {
    name: null,
    description: null,
    employeeCount: null,
    employeeRange: null,
    industry: null,
    headquarters: null,
    website: null,
    recentPosts: [],
    decisionMakers: [],
  };

  // Strip trailing slashes once so derived URLs never contain "//".
  const baseUrl = linkedinUrl.replace(/\/+$/, "");

  const browser = await chromium.launch({ headless: true, args: ["--no-sandbox"] });

  try {
    const context = await browser.newContext({
      userAgent:
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " +
        "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
      locale: "en-US",
    });
    const page = await context.newPage();

    try {
      // ── Company About Page ────────────────────────────────────
      await page.goto(`${baseUrl}/about/`, { waitUntil: "domcontentloaded", timeout: 20_000 });

      // Small delay to let client-side JS render
      await page.waitForTimeout(2000);

      const pageText = await page.evaluate(() => document.body.innerText);

      // Range first ("51-200 employees") ...
      const rangeMatch = pageText.match(/(\d+[\d,]*)\s*[-–]\s*(\d+[\d,]*)\s*employees/i);
      if (rangeMatch) {
        result.employeeRange = `${rangeMatch[1]}-${rangeMatch[2]}`;
      }

      // ... then an exact count, searched with the range text removed so
      // "200 employees" inside "51-200 employees" is not read as the count.
      const textWithoutRange = rangeMatch ? pageText.replace(rangeMatch[0], " ") : pageText;
      const empMatch = textWithoutRange.match(/(\d[\d,]*)\s*employees/i);
      if (empMatch) {
        result.employeeCount = parseInt(empMatch[1].replace(/,/g, ""), 10);
      }

      // Company name from og:title
      result.name = await page
        .$eval('meta[property="og:title"]', (el) => el.getAttribute("content"))
        .catch(() => null);

      // Description from og:description
      result.description = await page
        .$eval('meta[property="og:description"]', (el) => el.getAttribute("content"))
        .catch(() => null);

      // Industry + HQ: the value is the line that follows the label
      const industryMatch = pageText.match(/Industry\s*\n([^\n]+)/i);
      if (industryMatch) result.industry = industryMatch[1].trim();

      const hqMatch = pageText.match(/Headquarters\s*\n([^\n]+)/i);
      if (hqMatch) result.headquarters = hqMatch[1].trim();

      logger.info(
        { linkedinUrl, employees: result.employeeCount, industry: result.industry },
        "LinkedIn company scraped"
      );

      // ── People Page (public) ─────────────────────────────────
      await page.goto(`${baseUrl}/people/`, { waitUntil: "domcontentloaded", timeout: 15_000 });
      await page.waitForTimeout(1500);

      const peopleText = await page.evaluate(() => document.body.innerText);
      result.decisionMakers = extractDecisionMakers(peopleText, linkedinUrl);

      logger.info({ linkedinUrl, dmCount: result.decisionMakers.length }, "LinkedIn people scraped");
    } catch (err) {
      logger.warn({ linkedinUrl, err }, "LinkedIn scrape partial failure");
    }
  } finally {
    // Closing the browser also closes its contexts and pages, so this single
    // call releases everything even if setup failed half-way. A close failure
    // must not discard the data that was already collected.
    await browser.close().catch((closeErr: unknown) => {
      logger.warn({ linkedinUrl, err: closeErr }, "LinkedIn browser close failed");
    });
  }

  return result;
}
|
| 125 |
+
|
| 126 |
+
/**
 * Build a Google search URL that looks for a company's LinkedIn page.
 *
 * This only constructs the URL — it performs no request. The query is
 * `site:linkedin.com/company "<companyName>"`, URL-encoded.
 *
 * @param companyName  Company name to search for (quoted in the query).
 * @returns The google.com search URL.
 */
export function buildLinkedInSearchUrl(companyName: string): string {
  const query = `site:linkedin.com/company "${companyName}"`;
  return "https://www.google.com/search?q=" + encodeURIComponent(query);
}
|
| 134 |
+
|
| 135 |
+
/**
 * Pull people out of the plain text of a LinkedIn "people" page.
 *
 * The text is scanned line by line: a line that looks like a name (starts
 * "Xxxx X", at most 4 space-separated words) is paired with the line after
 * it, which is taken to be the job title. At most 10 people are collected.
 *
 * Decision-maker detection:
 *  - Acronym titles of three letters or fewer (ceo, cto, coo, vp) must
 *    appear as whole words, so "director" no longer matches "cto" and
 *    "coordinator" no longer matches "coo". "svp"/"evp"/"avp" still count
 *    as "vp".
 *  - Longer entries of DECISION_MAKER_TITLES match as substrings.
 *
 * Non-decision-makers are kept too when their title line is shorter than
 * 60 characters.
 *
 * @param text        innerText of the people page.
 * @param companyUrl  Company page URL; "<companyUrl>/people/" is stored on
 *                    every person (trailing slashes are normalised).
 * @returns People sorted with decision makers first.
 */
function extractDecisionMakers(text: string, companyUrl: string): LinkedInPerson[] {
  const MAX_PEOPLE = 10;
  const peoplePageUrl = `${companyUrl.replace(/\/+$/, "")}/people/`;

  const escapeForRegex = (s: string): string => s.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const acronyms = DECISION_MAKER_TITLES.filter((t) => t.length <= 3).map(escapeForRegex);
  const phrases = DECISION_MAKER_TITLES.filter((t) => t.length > 3);
  const acronymPattern = new RegExp(`\\b(?:${[...acronyms, "[sea]vp"].join("|")})\\b`, "i");

  const lines = text.split("\n").map((l) => l.trim()).filter((l) => l.length > 2);
  const people: LinkedInPerson[] = [];

  for (let i = 0; i < lines.length - 1; i++) {
    const nameLine = lines[i];
    const titleLine = lines[i + 1] ?? "";

    // Names are typically 2-4 capitalised words; the title follows
    const isName = /^[A-Z][a-z]+ [A-Z]/.test(nameLine) && nameLine.split(" ").length <= 4;
    if (!isName) continue;

    const titleLower = titleLine.toLowerCase();
    const isDecisionMaker =
      acronymPattern.test(titleLower) || phrases.some((t) => titleLower.includes(t));

    if (isDecisionMaker || titleLower.length < 60) {
      people.push({
        fullName: nameLine,
        title: titleLine,
        linkedinUrl: peoplePageUrl, // public people page
        isDecisionMaker,
      });
      i++; // the next line was consumed as this person's title
    }

    if (people.length >= MAX_PEOPLE) break;
  }

  // Decision makers first
  return people.sort((a, b) => Number(b.isDecisionMaker) - Number(a.isDecisionMaker));
}
|
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { InsertCompany } from "../../shared/supabase/schema";
|
| 2 |
+
import { ScrapedCompany } from "./web-scraper";
|
| 3 |
+
import { LinkedInCompanyData } from "./linkedin-scraper";
|
| 4 |
+
import { SerperResult } from "../providers/serper";
|
| 5 |
+
|
| 6 |
+
/**
 * Merge raw data from several sources into one canonical company record.
 *
 * Source priority per field: LinkedIn > website scrape > Serper result.
 *
 * Fallbacks:
 *  - name: blank or missing names are skipped; "Unknown" when none is usable.
 *  - employee_range: estimated from the count when no source provides one.
 *  - country: derived from LinkedIn headquarters, else from `region`.
 *  - description: truncated to 1000 characters.
 *
 * @param serperResult  Search hit that led to this company.
 * @param website       Data scraped from the company website.
 * @param linkedin      Data scraped from LinkedIn, or null when unavailable.
 * @param region        Region code the discovery run targeted.
 * @param source        Label of the discovery source.
 * @returns Record ready for insertion, with status "discovered".
 */
export function normalizeCompany(
  serperResult: SerperResult,
  website: ScrapedCompany,
  linkedin: LinkedInCompanyData | null,
  region: string,
  source: string
): InsertCompany {
  // `??` alone would let "" through, so blank candidates are skipped explicitly.
  const firstNonBlank = (...candidates: (string | null | undefined)[]): string | null => {
    for (const candidate of candidates) {
      const trimmed = candidate?.trim();
      if (trimmed) return trimmed;
    }
    return null;
  };

  const name =
    firstNonBlank(linkedin?.name, website.name) ??
    firstNonBlank(cleanTitle(serperResult.title)) ??
    "Unknown";

  const description =
    linkedin?.description ??
    website.description ??
    serperResult.snippet;

  const employeeCount =
    linkedin?.employeeCount ??
    website.employeeCount ??
    null;

  const employeeRange =
    linkedin?.employeeRange ??
    website.employeeRange ??
    estimateRange(employeeCount);

  const industry =
    linkedin?.industry ??
    website.industry ??
    null;

  // Headquarters may yield nothing usable; fall back to the run's region then.
  const headquartersCountry = linkedin?.headquarters
    ? extractCountry(linkedin.headquarters)
    : null;
  const country = headquartersCountry || regionToCountry(region);

  const linkedinUrl =
    linkedin !== null
      ? extractLinkedInCompanyUrl(serperResult.link) ?? website.linkedinUrl
      : website.linkedinUrl;

  const growthSignals = buildGrowthSignals(website, linkedin);

  return {
    domain: website.domain,
    name,
    industry,
    employee_count: employeeCount,
    employee_range: employeeRange,
    description: description?.slice(0, 1000) ?? null,
    website_url: `https://${website.domain}`,
    linkedin_url: linkedinUrl ?? null,
    country,
    region,
    tech_stack: website.techStack,
    growth_signals: growthSignals,
    raw_data: {
      serper_title: serperResult.title,
      serper_snippet: serperResult.snippet,
      serper_link: serperResult.link,
    },
    source,
    status: "discovered",
  };
}
|
| 76 |
+
|
| 77 |
+
// ─── Helpers ─────────────────────────────────────────────────
|
| 78 |
+
|
| 79 |
+
/**
 * Derive a company name from a page title such as "Acme Corp | Home".
 *
 * The title is split on "|" and on dashes surrounded by whitespace; a
 * hyphen inside a word ("Coca-Cola") does not split. Boilerplate words
 * (home, official, website, welcome to) are removed from each segment,
 * and the first segment that still has text is returned.
 *
 * @param title  Raw page/search-result title.
 * @returns The cleaned name, or "" when nothing usable remains.
 */
function cleanTitle(title: string): string {
  const segments = title.split(/\s*\|\s*|\s+[-–—]\s+/);

  for (const segment of segments) {
    const cleaned = segment
      .replace(/\b(home|official|website|welcome to)\b/gi, "")
      .replace(/\s+/g, " ")
      .trim();
    if (cleaned) return cleaned;
  }

  return "";
}
|
| 85 |
+
|
| 86 |
+
/**
 * Map an employee count to a size bracket label.
 *
 * @param count  Employee count, or null when unknown.
 * @returns "1-9", "10-49", "50-99", "100-199", "200-499", "500-999" or
 *          "1000+"; null when the count is null, zero, negative or not finite.
 */
function estimateRange(count: number | null): string | null {
  if (count === null || !Number.isFinite(count) || count <= 0) return null;
  if (count < 10) return "1-9"; // previously reported as "10-49"
  if (count < 50) return "10-49";
  if (count < 100) return "50-99";
  if (count < 200) return "100-199";
  if (count < 500) return "200-499";
  if (count < 1000) return "500-999";
  return "1000+";
}
|
| 95 |
+
|
| 96 |
+
/**
 * Take the last comma-separated part of a headquarters string as the country.
 *
 * Empty parts (e.g. from a trailing comma) are ignored.
 * NOTE(review): for values like "Austin, TX" this yields the state, not a
 * country — confirm what format the headquarters text actually has.
 *
 * @param headquarters  Location text such as "London, United Kingdom".
 * @returns The last non-empty part, or null when there is none.
 */
function extractCountry(headquarters: string): string | null {
  const parts = headquarters
    .split(",")
    .map((part) => part.trim())
    .filter((part) => part.length > 0);

  return parts.length > 0 ? parts[parts.length - 1] : null;
}
|
| 100 |
+
|
| 101 |
+
/**
 * Translate a region code into a country name.
 *
 * A Map is used for the lookup so arbitrary strings such as "toString" or
 * "constructor" can never resolve to an inherited object member.
 *
 * @param region  Region code (US, UK, AU, UAE, SA, SG).
 * @returns The country name, or `region` itself when the code is unknown.
 */
function regionToCountry(region: string): string {
  const countryByRegion = new Map<string, string>([
    ["US", "United States"],
    ["UK", "United Kingdom"],
    ["AU", "Australia"],
    ["UAE", "United Arab Emirates"],
    ["SA", "Saudi Arabia"],
    ["SG", "Singapore"],
  ]);

  return countryByRegion.get(region) ?? region;
}
|
| 109 |
+
|
| 110 |
+
/**
 * Extract the canonical LinkedIn company URL
 * ("http(s)://[www.]linkedin.com/company/<slug>") from a longer URL,
 * dropping any sub-path, query string or fragment after the slug.
 *
 * @param url  URL to inspect.
 * @returns The company URL, or null when `url` holds none.
 */
function extractLinkedInCompanyUrl(url: string): string | null {
  const companyUrlPattern = /https?:\/\/(www\.)?linkedin\.com\/company\/[^/?#]+/;
  const found = companyUrlPattern.exec(url);

  if (found === null) {
    return null;
  }
  return found[0];
}
|
| 114 |
+
|
| 115 |
+
/**
 * Collect growth signals for a company, capped at 10 entries.
 *
 * Two kinds of signals are produced, in this order:
 *  1. "job_posting"  — website job postings flagged with `hasAiSignal`.
 *  2. "social_post"  — LinkedIn recent posts (content cut to 200 chars);
 *     `ai_related` is true when the post mentions automation, AI, machine
 *     learning or "digital". "AI" must be a standalone word so that words
 *     merely ending in "ai" (Dubai, Mumbai) do not count.
 *
 * @param website   Website scrape providing `jobPostings`.
 * @param linkedin  LinkedIn data providing `recentPosts`, or null.
 * @returns Up to 10 signal objects, each stamped with `detected_at`.
 */
function buildGrowthSignals(
  website: ScrapedCompany,
  linkedin: LinkedInCompanyData | null
): object[] {
  const MAX_SIGNALS = 10;
  const AI_TOPIC = /automat|\bai\b|machine learning|digital/i;

  const signals: object[] = [];

  // AI-related job postings
  for (const job of website.jobPostings) {
    if (!job.hasAiSignal) continue;
    signals.push({
      type: "job_posting",
      content: job.title,
      source_url: job.url,
      ai_related: true,
      detected_at: new Date().toISOString(),
    });
  }

  // Recent LinkedIn posts
  for (const post of linkedin?.recentPosts ?? []) {
    signals.push({
      type: "social_post",
      content: post.slice(0, 200),
      ai_related: AI_TOPIC.test(post),
      detected_at: new Date().toISOString(),
    });
  }

  return signals.slice(0, MAX_SIGNALS);
}
|
|
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Pain Signal Detector
|
| 3 |
+
*
|
| 4 |
+
* Core philosophy: Don't look for AI signals.
|
| 5 |
+
* Look for INEFFICIENCY signals.
|
| 6 |
+
*
|
| 7 |
+
* A phone number on homepage = manual call handling = pain point.
|
| 8 |
+
* A "Book by Phone" button = no online scheduling = pain point.
|
| 9 |
+
* No chatbot = manual customer interaction = pain point.
|
| 10 |
+
*
|
| 11 |
+
* These are UNIVERSAL signals — every industry has them.
|
| 12 |
+
* The LLM then maps these signals to our specific services.
|
| 13 |
+
*/
|
| 14 |
+
|
| 15 |
+
import { callLLM, MODELS } from "../../shared/llm/nvidia-client";
|
| 16 |
+
import { SYSTEM_PROMPTS, buildPainDetectionPrompt } from "../../shared/llm/prompts";
|
| 17 |
+
import { logger } from "../../shared/utils/logger";
|
| 18 |
+
|
| 19 |
+
/** One inefficiency signal detected for a company. */
export interface PainSignal {
  /** Machine-readable signal name, e.g. "no_chatbot_detected". */
  signal: string;
  /** Short explanation of why the signal was raised. */
  evidence: string;
  /** Importance of the signal; results are sorted high → medium → low. */
  severity: "low" | "medium" | "high";
}
|
| 24 |
+
|
| 25 |
+
/** Outcome of pain-signal detection for one company. */
export interface PainDetectionResult {
  /** All detected signals. */
  painSignals: PainSignal[];
  /**
   * Matched service from service_profiles. The LLM path of detectPainSignals
   * yields the string "NONE" when the model names no service; the heuristic
   * fallback yields null.
   */
  serviceMatch: string | null;
  /** Confidence of the service match; the heuristic fallback uses 0.4 or 0.7. */
  matchConfidence: number;
  /** Free-text explanation of the match. */
  reasoning: string;
  /** Which detector(s) produced the result. */
  source: "heuristic" | "llm" | "combined";
}
|
| 32 |
+
|
| 33 |
+
// ─── Heuristic detection (instant, free, no LLM) ────────────
|
| 34 |
+
|
| 35 |
+
/**
 * Text patterns that indicate a manual / inefficient process.
 *
 * Each rule is tested against the website's visible text; a match raises
 * `signal` with the given `severity`. Short keywords that commonly occur
 * inside unrelated words are anchored with word boundaries
 * ("excel" vs "excellent", "dial" vs "cordial", "csv").
 */
const HEURISTIC_RULES: {
  pattern: RegExp;
  signal: string;
  severity: PainSignal["severity"];
}[] = [
  // Phone/call signals → AI Receptionist opportunity
  { pattern: /(?:call us|call now|phone|\bdial\b|ring us)/i, signal: "phone_handling_manual", severity: "high" },
  { pattern: /\+?\d[\d\s\-().]{8,}/, signal: "phone_number_prominent", severity: "medium" },
  { pattern: /(?:book (?:an? )?appointment|schedule (?:a )?visit|make (?:an? )?appointment)/i, signal: "manual_appointment_booking", severity: "high" },
  { pattern: /(?:office hours|opening hours|business hours|we're open)/i, signal: "limited_availability_hours", severity: "medium" },
  { pattern: /(?:receptionist|front desk|reception)/i, signal: "human_receptionist_mentioned", severity: "high" },

  // Support signals → AI Customer Support opportunity
  { pattern: /(?:contact us|get in touch|reach out|enquire|inquire)/i, signal: "manual_contact_process", severity: "medium" },
  { pattern: /(?:submit (?:a )?ticket|raise (?:a )?ticket)/i, signal: "manual_ticket_system", severity: "medium" },
  { pattern: /(?:FAQ|frequently asked|common questions)/i, signal: "faq_exists_no_chatbot", severity: "low" },
  { pattern: /(?:email us|send us an email|write to us)/i, signal: "email_only_support", severity: "medium" },

  // Data/process signals → AI Data Processing opportunity
  { pattern: /(?:spreadsheet|\bexcel\b|\bcsv\b|manual report)/i, signal: "manual_data_processing", severity: "high" },
  { pattern: /(?:legacy|outdated|traditional system)/i, signal: "legacy_system_mentioned", severity: "high" },
  { pattern: /(?:compliance|regulatory|audit)/i, signal: "compliance_reporting_burden", severity: "medium" },

  // Hiring signals → growth/overwork indicator
  { pattern: /(?:we're hiring|join our team|open positions|careers)/i, signal: "actively_hiring", severity: "low" },
  { pattern: /(?:our team|meet the team|staff|employees)/i, signal: "team_page_exists", severity: "low" },
];
|
| 62 |
+
|
| 63 |
+
// Elements on page that indicate ABSENCE of automation
|
| 64 |
+
/**
 * Signals raised by what is MISSING from the page source.
 *
 * Each `check` returns true when none of the listed vendor/tool names
 * occur in the HTML, i.e. when the corresponding tooling was not detected.
 */
const ABSENCE_SIGNALS: {
  check: (html: string) => boolean;
  signal: string;
  severity: PainSignal["severity"];
}[] = [
  {
    // Live-chat / chatbot vendors
    signal: "no_chatbot_detected",
    severity: "medium",
    check: (html) => {
      const chatVendorFound =
        /(intercom|drift|crisp|tidio|zendesk|freshchat|livechat|tawk|hubspot.*chat)/i.test(html);
      return !chatVendorFound;
    },
  },
  {
    // Online scheduling vendors
    signal: "no_online_scheduling_tool",
    severity: "high",
    check: (html) => {
      const schedulerFound =
        /(calendly|acuity|booksy|mindbody|simplybook|square.*appointment)/i.test(html);
      return !schedulerFound;
    },
  },
  {
    // Workflow-automation vendors
    signal: "no_automation_tools",
    severity: "low",
    check: (html) => {
      const automationFound = /(zapier|make\.com|automate|n8n|workato)/i.test(html);
      return !automationFound;
    },
  },
];
|
| 85 |
+
|
| 86 |
+
/**
 * Detect pain signals from a company's website text and HTML.
 *
 * Step 1: heuristic scan (instant, free).
 * Step 2: LLM pass that receives the heuristic signal names plus the first
 *         500 characters of website text and maps them to a service.
 *
 * If the LLM call throws or returns nothing parseable, the heuristic
 * signals alone are returned with a deterministically inferred service.
 *
 * LLM output is treated as untrusted: `pain_signals` is used only when it
 * is an array, and `match_confidence` is coerced to a finite number in
 * [0, 1] (the scale the heuristic fallback uses).
 *
 * @param companyName    Company name (prompt + logging).
 * @param industry       Industry label passed to the prompt and the fallback.
 * @param employeeCount  Employee count, or null when unknown.
 * @param websiteText    Visible text of the website.
 * @param websiteHtml    Raw HTML, used for absence checks.
 * @param traceId        Trace id forwarded to the LLM client.
 * @returns Detection result; `source` is "combined" on the LLM path and
 *          "heuristic" on the fallback path.
 */
export async function detectPainSignals(
  companyName: string,
  industry: string,
  employeeCount: number | null,
  websiteText: string,
  websiteHtml: string,
  traceId: string
): Promise<PainDetectionResult> {
  // ── Step 1: Heuristic scan ─────────────────────────────────
  const heuristicSignals = runHeuristicScan(websiteText, websiteHtml);

  // The LLM is told which signals the heuristics already found
  const pageElements = heuristicSignals.map((s) => s.signal);

  // ── Step 2: LLM deep reasoning ────────────────────────────
  try {
    const llmResult = await callLLM({
      operation: "pain_detect",
      model: MODELS.FAST, // 8B for speed — pain detection is pattern-based
      systemPrompt: SYSTEM_PROMPTS.PAIN_DETECTOR,
      userPrompt: buildPainDetectionPrompt({
        company_name: companyName,
        industry,
        employee_count: employeeCount,
        website_text: websiteText.slice(0, 500),
        page_elements: pageElements,
      }),
      temperature: 0.2,
      maxTokens: 400,
      jsonMode: true,
      traceId,
    });

    if (llmResult.parsed) {
      // Merge heuristic + LLM signals (dedup); ignore a non-array payload
      const rawSignals = llmResult.parsed.pain_signals;
      const llmSignals = Array.isArray(rawSignals) ? (rawSignals as PainSignal[]) : [];
      const merged = mergeSignals(heuristicSignals, llmSignals);

      // Number("high") is NaN and models sometimes exceed the scale
      const rawConfidence = Number(llmResult.parsed.match_confidence ?? 0);
      const matchConfidence = Number.isFinite(rawConfidence)
        ? Math.min(1, Math.max(0, rawConfidence))
        : 0;

      return {
        painSignals: merged,
        serviceMatch: String(llmResult.parsed.service_match ?? "NONE"),
        matchConfidence,
        reasoning: String(llmResult.parsed.reasoning ?? ""),
        source: "combined",
      };
    }
  } catch (err) {
    logger.warn({ companyName, err }, "LLM pain detection failed — using heuristic only");
  }

  // ── Fallback: heuristic-only result ────────────────────────
  return {
    painSignals: heuristicSignals,
    serviceMatch: inferServiceFromSignals(heuristicSignals, industry),
    matchConfidence: heuristicSignals.length >= 3 ? 0.7 : 0.4,
    reasoning: `Heuristic-only: ${heuristicSignals.length} pain signals detected`,
    source: "heuristic",
  };
}
|
| 152 |
+
|
| 153 |
+
/**
 * Run the free, instant detectors over a page.
 *
 * First every HEURISTIC_RULES pattern is tested against the visible text,
 * then every ABSENCE_SIGNALS check against the HTML. Each signal name is
 * reported at most once, in rule order.
 *
 * @param text  Visible website text.
 * @param html  Raw page HTML.
 * @returns Detected signals (pattern-based first, absence-based after).
 */
function runHeuristicScan(text: string, html: string): PainSignal[] {
  const detected: PainSignal[] = [];
  const alreadyReported = (name: string): boolean =>
    detected.some((entry) => entry.signal === name);

  // Presence: something on the page hints at a manual process
  for (const { pattern, signal, severity } of HEURISTIC_RULES) {
    if (alreadyReported(signal) || !pattern.test(text)) continue;
    detected.push({
      signal,
      evidence: `Pattern matched in website text`,
      severity,
    });
  }

  // Absence: tooling that is NOT on the page
  for (const { check, signal, severity } of ABSENCE_SIGNALS) {
    if (alreadyReported(signal) || !check(html)) continue;
    detected.push({
      signal,
      evidence: "Not detected in page source",
      severity,
    });
  }

  return detected;
}
|
| 183 |
+
|
| 184 |
+
/**
 * Merge heuristic and LLM-produced signals into one list, without duplicates.
 *
 * Heuristic signals win on name clashes. LLM entries are untrusted model
 * output, so they are sanitised:
 *  - entries without a non-empty string `signal` are dropped;
 *  - names repeated within the LLM list are kept once;
 *  - an unknown `severity` becomes "low" so the sort stays well-defined;
 *  - a non-string `evidence` is converted to a string.
 *
 * @param heuristic  Signals from the heuristic scan (not modified).
 * @param llm        Signals parsed from the LLM response.
 * @returns A new array sorted high → medium → low.
 */
function mergeSignals(heuristic: PainSignal[], llm: PainSignal[]): PainSignal[] {
  const severityOrder: Record<PainSignal["severity"], number> = { high: 0, medium: 1, low: 2 };
  const isKnownSeverity = (value: unknown): value is PainSignal["severity"] =>
    typeof value === "string" && Object.prototype.hasOwnProperty.call(severityOrder, value);

  const merged = [...heuristic];
  const seen = new Set(heuristic.map((s) => s.signal));

  for (const candidate of llm) {
    if (!candidate || typeof candidate.signal !== "string" || candidate.signal.length === 0) {
      continue;
    }
    if (seen.has(candidate.signal)) continue;
    seen.add(candidate.signal);

    merged.push({
      ...candidate,
      evidence: typeof candidate.evidence === "string" ? candidate.evidence : String(candidate.evidence ?? ""),
      severity: isKnownSeverity(candidate.severity) ? candidate.severity : "low",
    });
  }

  // Sort by severity: high → medium → low
  return merged.sort((a, b) => severityOrder[a.severity] - severityOrder[b.severity]);
}
|
| 198 |
+
|
| 199 |
+
/**
 * Pick a service deterministically from detected signals (used when the
 * LLM is unavailable).
 *
 * Signals are counted per service family; the family with the most hits
 * wins, provided it has at least 2. Ties go to the family listed first
 * (Receptionist, then Customer Support, then Data Processing).
 *
 * @param signals   Detected pain signals.
 * @param industry  Accepted for interface compatibility; not used here.
 * @returns The service name, or null when no family reaches 2 hits.
 */
function inferServiceFromSignals(signals: PainSignal[], industry: string): string | null {
  const families: { service: string; members: string[] }[] = [
    {
      service: "AI Receptionist",
      members: [
        "phone_handling_manual",
        "phone_number_prominent",
        "manual_appointment_booking",
        "human_receptionist_mentioned",
        "limited_availability_hours",
        "no_online_scheduling_tool",
      ],
    },
    {
      service: "AI Customer Support",
      members: [
        "manual_contact_process",
        "manual_ticket_system",
        "faq_exists_no_chatbot",
        "email_only_support",
        "no_chatbot_detected",
      ],
    },
    {
      service: "AI Data Processing",
      members: [
        "manual_data_processing",
        "legacy_system_mentioned",
        "compliance_reporting_burden",
      ],
    },
  ];

  // Hits per family, in the same order as `families`
  const hits = families.map((family) =>
    signals.reduce(
      (count, entry) => (family.members.includes(entry.signal) ? count + 1 : count),
      0
    )
  );

  const best = Math.max(...hits);
  if (best < 2) {
    return null;
  }

  // indexOf returns the first family reaching the maximum → priority order on ties
  const winner = hits.indexOf(best);
  return winner === -1 ? null : families[winner].service;
}
|
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { getSupabaseClient } from "../../shared/supabase/client";
|
| 2 |
+
import { logger } from "../../shared/utils/logger";
|
| 3 |
+
|
| 4 |
+
// Week number → region mapping
|
| 5 |
+
/**
 * Region targeted in each week of the rotation cycle.
 * Keys are 1-based week numbers.
 */
const ROTATION_MAP: Record<number, string> = {
  1: "US",  // week 1
  2: "UK",  // week 2
  3: "AU",  // week 3
  4: "UAE", // week 4, after which the cycle restarts
};
|
| 11 |
+
|
| 12 |
+
/** Snapshot of the rotation the current discovery run belongs to. */
export interface RotationInfo {
  /** Week number within the rotation cycle (key of ROTATION_MAP). */
  weekNumber: number;
  /** Region code for that week. */
  region: string;
  /** Id of the rotation_state row, or "unknown" when no row was found. */
  rotationId: string;
}
|
| 17 |
+
|
| 18 |
+
/**
 * Read the current rotation week and region from the latest
 * `rotation_state` row (ordered by `started_at`, newest first).
 *
 * This function only reads; moving to the next week is done by
 * advanceRotation. Rotation cycle: US → UK → AU → UAE → US → ...
 *
 * When no row can be loaded (empty table or query failure) the result
 * defaults to week 1 with rotationId "unknown", and the reason is logged
 * so an outage is not mistaken for a fresh start. A week number missing
 * from ROTATION_MAP maps to region "US".
 *
 * @returns Week number, region and rotation row id.
 */
export async function getCurrentRotation(): Promise<RotationInfo> {
  const db = getSupabaseClient();

  // Latest rotation record
  const { data: latest, error } = await db
    .from("rotation_state")
    .select("*")
    .order("started_at", { ascending: false })
    .limit(1)
    .single();

  if (error) {
    // .single() also reports "no rows" as an error, so this covers a fresh table too.
    logger.warn({ error }, "Rotation: could not load latest rotation record — defaulting to week 1");
  }

  const currentWeek = latest?.week_number ?? 1;
  const region = ROTATION_MAP[currentWeek] ?? "US";

  logger.info({ currentWeek, region }, "Rotation: current region");

  return {
    weekNumber: currentWeek,
    region,
    rotationId: latest?.id ?? "unknown",
  };
}
|
| 44 |
+
|
| 45 |
+
/**
 * Creates a new rotation record for the next week.
 * Call this at the END of a successful run.
 *
 * @param currentWeek Week number of the run that just finished. Any value
 *   outside 1..(cycle length - 1) — including the last week, 0, negatives and
 *   non-integers — restarts the cycle at week 1.
 *
 * Insert failures are logged, not thrown.
 */
export async function advanceRotation(currentWeek: number): Promise<void> {
  const db = getSupabaseClient();

  // Derive the cycle length from the map instead of hard-coding 4.
  // Assumes ROTATION_MAP keys are contiguous starting at 1.
  const cycleLength = Object.keys(ROTATION_MAP).length;
  const canIncrement =
    Number.isInteger(currentWeek) && currentWeek >= 1 && currentWeek < cycleLength;
  const nextWeek = canIncrement ? currentWeek + 1 : 1;
  // Same fallback getCurrentRotation() uses, so an undefined region is never inserted.
  const nextRegion = ROTATION_MAP[nextWeek] ?? "US";

  const { error } = await db.from("rotation_state").insert({
    week_number: nextWeek,
    region: nextRegion,
  });

  if (error) {
    logger.error({ error }, "Failed to advance rotation");
  } else {
    logger.info({ nextWeek, nextRegion }, "Rotation: advanced to next region");
  }
}
|
| 65 |
+
|
| 66 |
+
/**
 * Marks a rotation run as completed and stores its stats.
 *
 * @param rotationId     Id of the rotation_state row to update.
 * @param companiesFound Value written to companies_found.
 * @param leadsQualified Value written to leads_qualified.
 *
 * Update failures are logged, not thrown.
 */
export async function completeRotation(
  rotationId: string,
  companiesFound: number,
  leadsQualified: number
): Promise<void> {
  const db = getSupabaseClient();

  const { error } = await db
    .from("rotation_state")
    .update({
      completed_at: new Date().toISOString(),
      companies_found: companiesFound,
      leads_qualified: leadsQualified,
    })
    .eq("id", rotationId);

  if (error) {
    // Previously the success message was logged even when the update failed.
    logger.error({ rotationId, error }, "Rotation: failed to mark as completed");
    return;
  }

  logger.info({ rotationId, companiesFound, leadsQualified }, "Rotation: completed");
}
|
| 82 |
+
|
| 83 |
+
/**
 * Converts a region code to ICP geography + search labels.
 *
 * @param region Region code ("US", "UK", "AU", "UAE"). Matching ignores case
 *   and surrounding whitespace.
 * @returns Country code, human-readable search label and target industries
 *   for the region. Unknown codes fall back to the US configuration.
 */
export function getRegionConfig(region: string): {
  countryCode: string;
  searchLabel: string;
  industries: string[];
} {
  // Built per call so callers receive an object they can mutate freely.
  const configs: Record<string, { countryCode: string; searchLabel: string; industries: string[] }> = {
    US: {
      countryCode: "US",
      searchLabel: "United States",
      industries: ["technology", "manufacturing", "logistics", "healthcare", "finance", "retail_tech"],
    },
    UK: {
      countryCode: "GB",
      searchLabel: "United Kingdom",
      industries: ["technology", "finance", "logistics", "professional_services", "manufacturing"],
    },
    AU: {
      countryCode: "AU",
      searchLabel: "Australia",
      industries: ["technology", "mining_tech", "agri_tech", "finance", "healthcare"],
    },
    UAE: {
      countryCode: "AE",
      searchLabel: "Dubai UAE",
      industries: ["technology", "logistics", "real_estate_tech", "finance", "retail"],
    },
  };

  const key = (region ?? "").trim().toUpperCase();
  // Own-property check: a plain `configs[key]` lookup would return inherited
  // members (e.g. "toString") instead of falling back.
  return Object.prototype.hasOwnProperty.call(configs, key) ? configs[key] : configs["US"];
}
|
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Social Profile Finder
|
| 3 |
+
*
|
| 4 |
+
* Finds company + decision-maker social profiles:
|
| 5 |
+
* - Instagram (business account)
|
| 6 |
+
* - Facebook (business page)
|
| 7 |
+
* - Twitter/X
|
| 8 |
+
* - YouTube
|
| 9 |
+
*
|
| 10 |
+
* Two sources:
|
| 11 |
+
* 1. Website footer/header scraping (most reliable)
|
| 12 |
+
* 2. Google search fallback
|
| 13 |
+
*
|
| 14 |
+
* Phase 2 uses these for multi-channel outreach.
|
| 15 |
+
*/
|
| 16 |
+
|
| 17 |
+
import { chromium } from "playwright";
|
| 18 |
+
import { playwrightLimiter } from "../../shared/utils/rate-limiter";
|
| 19 |
+
import { serperLimiter } from "../../shared/utils/rate-limiter";
|
| 20 |
+
import { logger } from "../../shared/utils/logger";
|
| 21 |
+
import axios from "axios";
|
| 22 |
+
import { getEnv } from "../../shared/config/env";
|
| 23 |
+
|
| 24 |
+
/** Social profile URLs discovered for a company; null means "not found". */
export interface SocialProfiles {
  /** Instagram profile URL. */
  instagram: string | null;
  /** Facebook page URL. */
  facebook: string | null;
  /** Twitter/X profile URL. */
  twitter: string | null;
  /** YouTube channel URL. */
  youtube: string | null;
  /** Where the URLs came from: the company website, Google search, or both. */
  source: "website" | "google" | "mixed";
}
|
| 31 |
+
|
| 32 |
+
/**
 * Find all social profiles for a company.
 * Method 1 first (website HTML / website scrape), then Google fills gaps.
 *
 * @param domain      Company domain; scraped when no HTML is supplied.
 * @param companyName Company name used for the Google fallback queries.
 * @param websiteHtml Optional already-fetched HTML; when given, no scrape is done.
 * @returns The discovered profiles. `source` is "website" when only the
 *   website produced results (or nothing was found), "google" when only the
 *   search fallback did, and "mixed" when both contributed.
 */
export async function findSocialProfiles(
  domain: string,
  companyName: string,
  websiteHtml?: string
): Promise<SocialProfiles> {
  const profiles: SocialProfiles = {
    instagram: null,
    facebook: null,
    twitter: null,
    youtube: null,
    source: "website",
  };

  const countFound = (): number =>
    [profiles.instagram, profiles.facebook, profiles.twitter, profiles.youtube]
      .filter(Boolean).length;

  // ── Method 1: Extract from website HTML ────────────────────
  if (websiteHtml) {
    extractFromHtml(websiteHtml, profiles);
  } else {
    // Scrape website specifically for social links
    await scrapeWebsiteForSocials(domain, profiles);
  }
  const foundOnWebsite = countFound();

  // ── Method 2: Google search for missing profiles ───────────
  const missing = getMissing(profiles);
  if (missing.length > 0) {
    await searchGoogleForSocials(companyName, domain, profiles, missing);
  }

  const found = countFound();
  if (found > foundOnWebsite) {
    // Google contributed at least one profile. Previously this case was
    // always labelled "mixed", even when the website yielded nothing.
    profiles.source = foundOnWebsite > 0 ? "mixed" : "google";
  }

  logger.info({ domain, found }, "Social profiles discovered");

  return profiles;
}
|
| 72 |
+
|
| 73 |
+
// ─── Method 1: HTML extraction ──────────────────────────────
|
| 74 |
+
|
| 75 |
+
// Profile-URL patterns per platform. All are global + case-insensitive so
// String.prototype.match() returns every occurrence in a document.
const SOCIAL_PATTERNS = {
  instagram: /https?:\/\/(www\.)?instagram\.com\/[a-zA-Z0-9._]+/gi,
  // Hyphen allowed: auto-generated page slugs look like /Acme-Dental-123456789.
  facebook: /https?:\/\/(www\.)?(facebook|fb)\.com\/[a-zA-Z0-9.-]+/gi,
  twitter: /https?:\/\/(www\.)?(twitter|x)\.com\/[a-zA-Z0-9_]+/gi,
  // Require an explicit /channel/, /c/, /user/ or /@ prefix; the previous
  // `(channel|c|@)` alternation also matched any path starting with "c".
  youtube: /https?:\/\/(www\.)?youtube\.com\/(channel\/|c\/|user\/|@)[a-zA-Z0-9._-]+/gi,
};
|
| 81 |
+
|
| 82 |
+
/**
 * Scans text (HTML or a list of hrefs) for social profile URLs and writes the
 * first usable one per platform into `profiles`.
 *
 * "Usable" means cleanSocialUrl() can parse it and isGenericSocial() does not
 * flag it. Platforms with no usable match are left untouched.
 */
function extractFromHtml(html: string, profiles: SocialProfiles): void {
  const platforms = Object.keys(SOCIAL_PATTERNS) as Array<keyof typeof SOCIAL_PATTERNS>;

  for (const platform of platforms) {
    const candidates = html.match(SOCIAL_PATTERNS[platform]) ?? [];

    // Walk every match: the first link on a page is often a share/login
    // widget, and stopping there would hide the real profile further down.
    for (const candidate of candidates) {
      const url = cleanSocialUrl(candidate, platform);
      if (url && !isGenericSocial(url)) {
        profiles[platform] = url;
        break;
      }
    }
  }
}
|
| 94 |
+
|
| 95 |
+
// ─── Website scrape (if HTML not already available) ──────────
|
| 96 |
+
|
| 97 |
+
/**
 * Loads the company homepage in a headless browser, collects every anchor
 * href and extracts social profile URLs from them into `profiles`.
 *
 * Best-effort: any failure (rate limiter, launch, navigation timeout) is
 * logged at debug level and swallowed so the Google fallback can run.
 */
async function scrapeWebsiteForSocials(domain: string, profiles: SocialProfiles): Promise<void> {
  try {
    await playwrightLimiter.consume("playwright");

    const browser = await chromium.launch({ headless: true, args: ["--no-sandbox"] });
    try {
      const context = await browser.newContext({
        userAgent: "Mozilla/5.0 (compatible; ResearchBot/1.0)",
      });
      const page = await context.newPage();

      await page.goto(`https://${domain}`, { waitUntil: "domcontentloaded", timeout: 12_000 });

      // Get all link hrefs on the page
      const links = await page.$$eval("a[href]", (anchors) =>
        anchors.map((a) => a.getAttribute("href") ?? "")
      );

      extractFromHtml(links.join("\n"), profiles);
    } finally {
      // Always release the Chromium process — previously a navigation timeout
      // skipped the close calls and leaked the browser. Closing the browser
      // also disposes its contexts and pages.
      await browser.close();
    }
  } catch (err) {
    logger.debug({ domain, err }, "Social scrape failed — trying Google");
  }
}
|
| 124 |
+
|
| 125 |
+
// ─── Method 2: Google search ────────────────────────────────
|
| 126 |
+
|
| 127 |
+
/**
 * Fills missing profiles by querying Serper (Google search) once per platform.
 *
 * A result is accepted only when its URL looks like a profile on that platform
 * (matches SOCIAL_PATTERNS at the start and is not a generic page) AND its
 * title/snippet mentions a company-name word (> 3 chars) or the domain stem.
 *
 * @param companyName Company name; double quotes are stripped for the query.
 * @param domain      Company domain, used as a secondary relevance signal.
 * @param profiles    Mutated in place with any profile found.
 * @param missing     Platform keys still lacking a profile.
 *
 * Per-platform failures are logged at debug level and skipped.
 */
async function searchGoogleForSocials(
  companyName: string,
  domain: string,
  profiles: SocialProfiles,
  missing: string[]
): Promise<void> {
  type Platform = keyof typeof SOCIAL_PATTERNS;

  // A quote inside the name would terminate the exact-phrase operator early.
  const safeName = companyName.replace(/"/g, "").trim();

  const searchMap: Record<string, string> = {
    instagram: `"${safeName}" site:instagram.com`,
    facebook: `"${safeName}" site:facebook.com`,
    twitter: `"${safeName}" site:twitter.com OR site:x.com`,
    youtube: `"${safeName}" site:youtube.com`,
  };

  // Relevance signals, computed once. Everything is lower-cased because the
  // result text is lower-cased before comparison.
  const companyWords = companyName.toLowerCase().split(/\s+/).filter((w) => w.length > 3);
  const domainStem = domain.toLowerCase().replace(/^www\./, "").replace(/\.\w+$/, "");
  // An empty or very short stem would match almost any text.
  const stemIsUsable = domainStem.length >= 3;

  for (const platform of missing) {
    const query = searchMap[platform];
    const pattern = SOCIAL_PATTERNS[platform as Platform] as RegExp | undefined;
    if (!query || !pattern) continue; // unknown platform key

    try {
      await serperLimiter.consume("serper");

      const env = getEnv();
      const response = await axios.post(
        "https://google.serper.dev/search",
        { q: query, num: 3 },
        {
          headers: {
            "X-API-KEY": env.SERPER_API_KEY,
            "Content-Type": "application/json",
          },
          timeout: 6_000,
        }
      );

      const organic = response.data?.organic ?? [];
      for (const result of organic) {
        const url = cleanSocialUrl(result.link, platform);
        if (!url || isGenericSocial(url)) continue;

        // search() ignores the regex's global flag/lastIndex, so the shared
        // pattern can be reused safely. Index 0 = URL starts like a profile
        // URL on this platform (rejects e.g. youtube.com/watch).
        if (url.search(pattern) !== 0) continue;

        // Verify it mentions company name or domain in snippet/title
        const snippet = (result.snippet ?? "").toLowerCase();
        const title = (result.title ?? "").toLowerCase();
        const combined = `${snippet} ${title}`;

        const hasCompany = companyWords.some((w) => combined.includes(w));
        const hasDomain = stemIsUsable && combined.includes(domainStem);

        if (hasCompany || hasDomain) {
          profiles[platform as Platform] = url;
          break;
        }
      }
    } catch (err) {
      logger.debug({ platform, err }, "Social Google search failed — skipping");
    }
  }
}
|
| 180 |
+
|
| 181 |
+
// ─── Helpers ─────────────────────────────────────────────────
|
| 182 |
+
|
| 183 |
+
/** Lists the platform keys whose profile URL is still empty (null or ""). */
function getMissing(profiles: SocialProfiles): string[] {
  const platforms = ["instagram", "facebook", "twitter", "youtube"] as const;
  const missing: string[] = [];
  for (const platform of platforms) {
    if (!profiles[platform]) {
      missing.push(platform);
    }
  }
  return missing;
}
|
| 187 |
+
|
| 188 |
+
/**
 * Normalises a URL to `protocol//hostname/path`, dropping query string,
 * fragment, port, credentials and a single trailing slash.
 *
 * @param url      Absolute URL to clean.
 * @param platform Currently unused; kept for call-site symmetry.
 * @returns The cleaned URL, or null when `url` cannot be parsed.
 */
function cleanSocialUrl(url: string, platform: string): string | null {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return null;
  }

  const { protocol, hostname, pathname } = parsed;
  const path = pathname.endsWith("/") ? pathname.slice(0, -1) : pathname;
  return protocol + "//" + hostname + path;
}
|
| 197 |
+
|
| 198 |
+
/**
 * Tells whether a social URL points at a generic platform page (share dialog,
 * login, help centre, ...) rather than an actual company profile.
 *
 * Matching is done on whole path segments and hostname labels, ignoring case
 * and a trailing .php/.htm/.html. The previous substring test also rejected
 * real handles such as /aboutface_studio or /helpinghands.
 */
function isGenericSocial(url: string): boolean {
  // Filter out generic profile links (not actual company pages).
  // "intent" covers twitter.com/intent/tweet share buttons.
  const genericNames = new Set([
    "share", "sharer", "login", "signup", "help", "about", "policies", "intent",
  ]);

  let host = "";
  let path = url;
  try {
    const parsed = new URL(url);
    host = parsed.hostname;
    path = parsed.pathname;
  } catch {
    // Not an absolute URL — treat the whole input as a path.
  }

  // Hostname labels are included so help.instagram.com / about.fb.com stay filtered.
  const parts = [...host.split("."), ...path.split("/")];
  return parts.some((part) =>
    genericNames.has(part.toLowerCase().replace(/\.(php|html?)$/, ""))
  );
}
|
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Territory Manager
|
| 3 |
+
*
|
| 4 |
+
* Controls: which city, which industry, on which day.
|
| 5 |
+
* Prevents: re-searching same city+industry within 30 days.
|
| 6 |
+
* Tracks: daily progression, checkpoint for resume.
|
| 7 |
+
*
|
| 8 |
+
* Daily flow:
|
| 9 |
+
* 1. Load current position (city + industry)
|
| 10 |
+
* 2. Check if already searched recently (30-day window)
|
| 11 |
+
* 3. If fresh → search → advance pointer
|
| 12 |
+
* 4. If stale → skip to next fresh combo
|
| 13 |
+
* 5. Save position for tomorrow
|
| 14 |
+
*/
|
| 15 |
+
|
| 16 |
+
import { getSupabaseClient } from "../../shared/supabase/client";
|
| 17 |
+
import { logger } from "../../shared/utils/logger";
|
| 18 |
+
|
| 19 |
+
/** One city + industry combination selected for searching. */
export interface TerritoryUnit {
  /** Id of the territory_grid row the city came from. */
  territoryId: string;
  /** Country name as stored in territory_grid. */
  country: string;
  /** Country code as stored in territory_grid (country_code column). */
  countryCode: string;
  /** City name. */
  city: string;
  /** Industry taken from INDUSTRY_LIST. */
  industry: string;
  /** City timezone; "UTC" when the grid row has none. */
  timezone: string;
  /** City tier from territory_grid; lower tiers are searched first. */
  tier: number;
}
|
| 28 |
+
|
| 29 |
+
/** Resume pointer persisted in system_config under the "current_territory" key. */
export interface TerritoryPosition {
  /** Country code of the city the pointer currently rests on. */
  countryCode: string;
  /** Index into the ordered list of active cities. */
  cityIndex: number;
  /** Index into INDUSTRY_LIST. */
  industryIndex: number;
}
|
| 34 |
+
|
| 35 |
+
// Industries cycled through for every city, grouped by the offer they fit.
const INDUSTRY_LIST = [
  // service businesses (AI Receptionist)
  "dental",
  "medical",
  "veterinary",
  "legal",
  "salon",
  "spa",
  // support-heavy (AI Support)
  "ecommerce",
  "saas",
  "retail",
  "hospitality",
  // data-heavy (AI Data Processing)
  "manufacturing",
  "logistics",
  "finance",
  "healthcare",
  // sales-heavy (AI Sales Automation)
  "technology",
  "consulting",
  "recruitment",
  "insurance",
];
|
| 42 |
+
|
| 43 |
+
/**
 * Get the next territory units (city + industry) to search today.
 * Respects the cooldown (via isTerritoryFresh) and the daily quota.
 *
 * Starting from the persisted pointer, walks industries within a city, then
 * moves to the next city, collecting fresh combinations until
 * ceil(quota / 2) units are found or every combination was tried once.
 * The advanced pointer is persisted before returning.
 *
 * @param quota Target number of leads for the day.
 * @returns Selected units; empty when no active city exists or the city list
 *   cannot be loaded.
 */
export async function getNextTerritory(quota: number): Promise<TerritoryUnit[]> {
  const db = getSupabaseClient();
  const units: TerritoryUnit[] = [];

  // Load current position from system_config (absent on the very first run).
  const { data: configData, error: positionError } = await db
    .from("system_config")
    .select("value")
    .eq("key", "current_territory")
    .maybeSingle();

  if (positionError) {
    logger.warn({ error: positionError }, "Failed to load territory position — starting from the beginning");
  }

  // Load all cities ordered by tier (major cities first)
  const { data: cities, error: citiesError } = await db
    .from("territory_grid")
    .select("*")
    .eq("is_active", true)
    .order("tier", { ascending: true })
    .order("city", { ascending: true });

  if (citiesError) {
    // Distinguish a failed query from a genuinely empty grid.
    logger.error({ error: citiesError }, "Failed to load territory_grid");
    return [];
  }

  if (!cities?.length) {
    logger.error("No active territories found in territory_grid");
    return [];
  }

  // Start from current position; anything that is not a non-negative integer
  // (missing row, corrupted value) restarts at 0.
  const stored = (configData?.value ?? null) as Partial<TerritoryPosition> | null;
  const toIndex = (value: unknown, size: number): number =>
    typeof value === "number" && Number.isInteger(value) && value >= 0 ? value % size : 0;

  let cityIdx = toIndex(stored?.cityIndex, cities.length);
  let industryIdx = toIndex(stored?.industryIndex, INDUSTRY_LIST.length);

  // Keep finding fresh territory units until quota is met
  // (estimated: each unit produces ~2-3 qualified leads)
  const unitsNeeded = Math.ceil(quota / 2);
  let attempts = 0;
  const maxAttempts = cities.length * INDUSTRY_LIST.length; // prevent infinite loop

  while (units.length < unitsNeeded && attempts < maxAttempts) {
    attempts++;

    const city = cities[cityIdx % cities.length];
    const industry = INDUSTRY_LIST[industryIdx % INDUSTRY_LIST.length];

    // Check cooldown
    const isFresh = await isTerritoryFresh(city.id, industry);

    if (isFresh) {
      units.push({
        territoryId: city.id,
        country: city.country,
        countryCode: city.country_code,
        city: city.city,
        industry,
        timezone: city.timezone ?? "UTC",
        tier: city.tier,
      });
    }

    // Advance: next industry, or wrap to next city
    industryIdx++;
    if (industryIdx >= INDUSTRY_LIST.length) {
      industryIdx = 0;
      cityIdx++;
    }
  }

  // Save new position for tomorrow
  const { error: saveError } = await db.from("system_config").upsert({
    key: "current_territory",
    value: {
      countryCode: cities[cityIdx % cities.length]?.country_code ?? "US",
      cityIndex: cityIdx % cities.length,
      industryIndex: industryIdx % INDUSTRY_LIST.length,
    },
    updated_by: "system",
    updated_at: new Date().toISOString(),
  });

  if (saveError) {
    // Not fatal: the units are still valid, but the next run restarts from the old pointer.
    logger.error({ error: saveError }, "Failed to persist territory position");
  }

  logger.info({
    unitsFound: units.length,
    firstCity: units[0]?.city,
    firstIndustry: units[0]?.industry,
    attempts,
  }, "Territory units selected for today");

  return units;
}
|
| 138 |
+
|
| 139 |
+
/**
 * Check if a city+industry combo is fresh, i.e. its cooldown has elapsed.
 *
 * @returns true when no territory_progress row exists (never searched), when
 *   next_eligible_at has passed, or when the stored date is unparseable;
 *   false while the cooldown is active or when the lookup itself fails.
 */
async function isTerritoryFresh(territoryId: string, industry: string): Promise<boolean> {
  const db = getSupabaseClient();

  const { data, error } = await db
    .from("territory_progress")
    .select("next_eligible_at")
    .eq("territory_id", territoryId)
    .eq("industry", industry)
    .maybeSingle();

  if (error) {
    // Previously a failed lookup looked like "never searched" and bypassed the
    // cooldown. Skip the unit instead; it is retried on the next run.
    logger.warn({ territoryId, industry, error }, "Territory cooldown lookup failed — treating as not fresh");
    return false;
  }

  if (!data) return true; // never searched → fresh

  const eligibleAt = new Date(data.next_eligible_at).getTime();
  // A corrupt date would otherwise compare false forever and block the unit permanently.
  if (Number.isNaN(eligibleAt)) return true;

  return Date.now() >= eligibleAt;
}
|
| 157 |
+
|
| 158 |
+
/**
 * Mark a territory unit as searched (sets a 30-day cooldown).
 *
 * @param territoryId territory_grid row id of the city.
 * @param industry    Industry that was searched.
 * @param leadsFound  Written to total_leads as-is.
 *   NOTE(review): this overwrites rather than accumulates — confirm whether
 *   total_leads is meant to be a running total.
 *
 * Upsert failures are logged, not thrown.
 */
export async function markTerritorySearched(
  territoryId: string,
  industry: string,
  leadsFound: number
): Promise<void> {
  const COOLDOWN_MS = 30 * 24 * 60 * 60 * 1000; // 30 days

  const db = getSupabaseClient();
  const now = new Date();
  const nextEligible = new Date(now.getTime() + COOLDOWN_MS);

  const { error } = await db.from("territory_progress").upsert({
    territory_id: territoryId,
    industry,
    last_run_at: now.toISOString(),
    next_eligible_at: nextEligible.toISOString(),
    total_leads: leadsFound,
  }, { onConflict: "territory_id,industry" });

  if (error) {
    // Without this record the unit stays "fresh" and will be searched again.
    logger.error({ territoryId, industry, error }, "Failed to record territory cooldown");
  }
}
|
| 178 |
+
|
| 179 |
+
/**
 * Get today's lead quota (default or override).
 *
 * A one-time `today_override` takes precedence and is cleared after being
 * read. Falls back to the stored default, or 10 when no config is available.
 */
export async function getDailyQuota(): Promise<number> {
  const db = getSupabaseClient();

  const { data, error } = await db
    .from("system_config")
    .select("value")
    .eq("key", "daily_quota")
    .maybeSingle();

  if (error) {
    logger.warn({ error }, "Failed to read daily_quota config — using fallback");
  }

  const config = (data?.value ?? null) as { default: number; today_override: number | null } | null;
  const override = config?.today_override;

  if (config && override !== null && override !== undefined) {
    // Clear override after reading (one-time use)
    const { error: clearError } = await db.from("system_config").update({
      value: { ...config, today_override: null },
      updated_at: new Date().toISOString(),
    }).eq("key", "daily_quota");

    if (clearError) {
      // The override still applies today, but would be applied again tomorrow.
      logger.warn({ error: clearError }, "Failed to clear one-time quota override");
    }

    return override;
  }

  return config?.default ?? 10;
}
|
| 205 |
+
|
| 206 |
+
/**
 * Set the lead quota (from Slack command).
 *
 * @param quota     New quota; must be a finite number >= 0, otherwise the
 *   call is rejected (logged, nothing written).
 * @param permanent When true, replaces the default and clears any pending
 *   one-time override. When false (default), sets a one-time override for the
 *   next run and keeps the stored default.
 *
 * Database failures are logged, not thrown.
 */
export async function setQuotaOverride(quota: number, permanent = false): Promise<void> {
  if (!Number.isFinite(quota) || quota < 0) {
    logger.error({ quota }, "Quota override rejected: expected a non-negative finite number");
    return;
  }

  const db = getSupabaseClient();
  let value: { default: number; today_override: number | null };

  if (permanent) {
    value = { default: quota, today_override: null };
  } else {
    const { data, error: readError } = await db
      .from("system_config")
      .select("value")
      .eq("key", "daily_quota")
      .maybeSingle();

    if (readError) {
      // Writing now would replace the stored default with the hard-coded
      // fallback, so abort instead.
      logger.error({ error: readError }, "Failed to read daily_quota — override not applied");
      return;
    }

    const current = (data?.value ?? null) as { default: number } | null;
    value = { default: current?.default ?? 10, today_override: quota };
  }

  const { error } = await db.from("system_config").update({
    value,
    updated_by: "slack",
    updated_at: new Date().toISOString(),
  }).eq("key", "daily_quota");

  if (error) {
    logger.error({ error, quota, permanent }, "Failed to update daily_quota");
  }
}
|
| 233 |
+
|
| 234 |
+
/**
 * Reports whether automatic runs are paused.
 * True only when the "auto_mode" entry in system_config has `paused === true`;
 * a missing row or unreadable value counts as not paused.
 */
export async function isSystemPaused(): Promise<boolean> {
  const { data } = await getSupabaseClient()
    .from("system_config")
    .select("value")
    .eq("key", "auto_mode")
    .single();

  const autoMode = data?.value as { paused?: boolean };
  const paused = autoMode?.paused;
  return paused === true;
}
|
| 247 |
+
|
| 248 |
+
/**
 * Build Google search queries for a territory unit.
 * Generates four targeted queries per city+industry.
 *
 * @param unit     City/industry combination to search.
 * @param keywords Optional extra keywords; only the first one is used, as an
 *   exact phrase in the LinkedIn-scoped query. It is omitted when absent or blank.
 */
export function buildTerritoryQueries(unit: TerritoryUnit, keywords: string[]): string[] {
  const keyword = (keywords?.[0] ?? "").trim();
  // Previously a missing keyword produced an empty quoted phrase ("") in the query.
  const keywordTerm = keyword ? ` "${keyword}"` : "";

  return [
    `"${unit.industry}" company "${unit.city}" "${unit.country}" -job -careers`,
    `best ${unit.industry} companies in ${unit.city} ${unit.country}`,
    `"${unit.industry}" business "${unit.city}"${keywordTerm} site:linkedin.com/company`,
    `top ${unit.industry} ${unit.city} companies ${new Date().getFullYear()}`,
  ].filter(q => q.trim().length > 10);
}
|
|
@@ -0,0 +1,225 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { chromium, Browser, BrowserContext } from "playwright";
|
| 2 |
+
import { playwrightLimiter } from "../../shared/utils/rate-limiter";
|
| 3 |
+
import { logger } from "../../shared/utils/logger";
|
| 4 |
+
|
| 5 |
+
// ─── Types ───────────────────────────────────────────────────
|
| 6 |
+
|
| 7 |
+
/** Signals collected from a company's website by scrapeCompanyWebsite(). */
export interface ScrapedCompany {
  /** Domain that was scraped. */
  domain: string;
  /** Company name derived from the homepage <title>. */
  name: string | null;
  /** First few sentences of the /about page. */
  description: string | null;
  /** Not populated by the scraper in this file. */
  employeeRange: string | null;
  /** Head count parsed from the /about page text, when one is mentioned. */
  employeeCount: number | null;
  /** Not populated by the scraper in this file. */
  industry: string | null;
  /** Not populated by the scraper in this file. */
  country: string | null;
  /** href of the first linkedin.com/company link on the homepage. */
  linkedinUrl: string | null;
  /** Known tools detected in the homepage source. */
  techStack: string[];
  /** Job postings extracted from the careers/jobs page. */
  jobPostings: JobPosting[];
  /** Not populated by the scraper in this file. */
  recentNews: string[];
  /** Leading part of the homepage's visible text. */
  websiteText: string;
  /** raw HTML for pain signal detection */
  html: string;
  /** alias for websiteText (used by auto-discovery) */
  text: string;
  /** count of AI-related job postings */
  aiJobCount: number;
}
|
| 24 |
+
|
| 25 |
+
/** A single job posting found on a company's careers page. */
export interface JobPosting {
  /** Posting title. */
  title: string;
  /** URL associated with the posting. */
  url: string;
  /** Whether the posting mentions one of the AI_KEYWORDS. */
  hasAiSignal: boolean;
}
|
| 30 |
+
|
| 31 |
+
// ─── AI signal keywords ──────────────────────────────────────
|
| 32 |
+
|
| 33 |
+
// Lower-case phrases that mark a job posting as AI/automation related.
// NOTE(review): short entries such as "ai" will also hit unrelated words
// (e.g. "maintenance") if matched as plain substrings — confirm at the call site.
const AI_KEYWORDS = [
  "automation",
  "artificial intelligence",
  "machine learning",
  "ai",
  "llm",
  "workflow automation",
  "robotic process",
  "rpa",
  "data pipeline",
  "digital transformation",
  "predictive analytics",
  "nlp",
];
|
| 38 |
+
|
| 39 |
+
// Lower-case product names looked for in the homepage source to infer the tech stack.
const TECH_STACK_SIGNALS = [
  // CRM / ERP / service desks
  "salesforce",
  "hubspot",
  "sap",
  "oracle",
  "dynamics",
  "zendesk",
  "servicenow",
  "workday",
  "netsuite",
  "quickbooks",
  "zoho",
  // collaboration / project tools
  "slack",
  "jira",
  "notion",
  "monday.com",
  "asana",
];
|
| 44 |
+
|
| 45 |
+
// ─── Browser singleton ───────────────────────────────────────
|
| 46 |
+
|
| 47 |
+
// Shared Chromium instance, created on first use.
let _browser: Browser | null = null;

/**
 * Returns the shared browser, launching a new one when none exists or the
 * previous one has disconnected.
 * NOTE(review): callers that overlap while the first launch is still pending
 * would each start a browser — confirm whether scrapes run concurrently.
 */
async function getBrowser(): Promise<Browser> {
  if (_browser?.isConnected()) {
    return _browser;
  }

  const launchArgs = ["--no-sandbox", "--disable-setuid-sandbox", "--disable-dev-shm-usage"];
  _browser = await chromium.launch({ headless: true, args: launchArgs });
  return _browser;
}
|
| 58 |
+
|
| 59 |
+
/** Closes the shared browser, if one was launched, and forgets it. */
export async function closeBrowser(): Promise<void> {
  if (!_browser) {
    return;
  }
  await _browser.close();
  _browser = null;
}
|
| 65 |
+
|
| 66 |
+
// ─── Main scraper ─────────────────────────────────────────────
|
| 67 |
+
|
| 68 |
+
/**
 * Scrapes a company website for ICP-relevant signals.
 *
 * Visits the homepage, then `/about`, then the first of `/careers`, `/jobs`,
 * `/work-with-us` that yields job postings. Each call consumes one
 * playwright rate-limiter token and uses its own browser context.
 *
 * @param domain Bare domain (no scheme); pages are requested over https.
 * @returns Whatever could be collected. A failure while scraping is logged
 *   and the partially filled result is returned; rate-limiter or browser
 *   start-up errors propagate to the caller.
 */
export async function scrapeCompanyWebsite(domain: string): Promise<ScrapedCompany> {
  await playwrightLimiter.consume("playwright");

  const browser = await getBrowser();
  const context = await browser.newContext({
    userAgent:
      "Mozilla/5.0 (compatible; ResearchBot/1.0; +https://youragency.com/bot)",
    extraHTTPHeaders: { "Accept-Language": "en-US,en;q=0.9" },
  });

  const result: ScrapedCompany = {
    domain,
    name: null,
    description: null,
    employeeRange: null,
    employeeCount: null,
    industry: null,
    country: null,
    linkedinUrl: null,
    techStack: [],
    jobPostings: [],
    recentNews: [],
    websiteText: "",
    html: "",
    text: "",
    aiJobCount: 0,
  };

  try {
    // ── Homepage ─────────────────────────────────────────────
    const homePage = await context.newPage();
    await homePage.goto(`https://${domain}`, {
      waitUntil: "domcontentloaded",
      timeout: 15_000,
    });

    const homeText = await homePage.evaluate(() => document.body.innerText);
    result.websiteText = homeText.slice(0, 3000);
    result.text = result.websiteText; // alias

    // Extract company name from title tag: keep the part before the first
    // "|" or whitespace-delimited dash. Splitting on every "-" would cut
    // hyphenated names ("Coca-Cola" → "Coca"). Empty titles become null.
    const rawTitle = await homePage.title();
    result.name = rawTitle.split(/\s*\|\s*|\s+[-–—]\s+/)[0].trim() || null;

    // Find LinkedIn link on homepage
    const linkedinHref = await homePage
      .$eval('a[href*="linkedin.com/company"]', (el) => el.getAttribute("href"))
      .catch(() => null);
    result.linkedinUrl = linkedinHref ?? null;

    // Tech stack detection from script/link tags
    const pageSource = await homePage.content();
    result.techStack = detectTechStack(pageSource);
    result.html = pageSource.slice(0, 10000); // raw HTML for pain detection

    await homePage.close();

    // ── About Page ───────────────────────────────────────────
    const aboutPage = await context.newPage();
    const aboutUrl = `https://${domain}/about`;
    try {
      const aboutResponse = await aboutPage.goto(aboutUrl, { waitUntil: "domcontentloaded", timeout: 10_000 });

      // goto() resolves for HTTP error statuses too; don't mine a 404 page
      // for a description or head count.
      if (!aboutResponse || aboutResponse.ok()) {
        const aboutText = await aboutPage.evaluate(() => document.body.innerText);
        result.description = extractDescription(aboutText);

        const empMatch = aboutText.match(/(\d[\d,]*)\s*(employees?|people|team members?|staff)/i);
        if (empMatch) {
          result.employeeCount = parseInt(empMatch[1].replace(/,/g, ""), 10);
        }
      }
    } catch {
      // About page not found — that's fine
    } finally {
      await aboutPage.close();
    }

    // ── Jobs Page ────────────────────────────────────────────
    const jobsPage = await context.newPage();
    const jobsUrls = [
      `https://${domain}/careers`,
      `https://${domain}/jobs`,
      `https://${domain}/work-with-us`,
    ];

    for (const jobUrl of jobsUrls) {
      try {
        const jobsResponse = await jobsPage.goto(jobUrl, { waitUntil: "domcontentloaded", timeout: 10_000 });
        // Skip error pages so their boilerplate is not read as postings.
        if (jobsResponse && !jobsResponse.ok()) continue;

        const jobsText = await jobsPage.evaluate(() => document.body.innerText);
        result.jobPostings = extractJobPostings(jobsText, jobUrl);
        if (result.jobPostings.length) break;
      } catch {
        // Try next URL
      }
    }
    await jobsPage.close();
    result.aiJobCount = result.jobPostings.filter(j => j.hasAiSignal).length;

    logger.info({ domain, techStack: result.techStack.length, jobs: result.jobPostings.length },
      "Website scraped successfully"
    );
  } catch (err) {
    logger.warn({ domain, err }, "Website scrape partial failure");
  } finally {
    // Closing the context also disposes any page left open by an early failure.
    await context.close();
  }

  return result;
}
|
| 180 |
+
|
| 181 |
+
// ─── Helpers ─────────────────────────────────────────────────
|
| 182 |
+
|
| 183 |
+
/**
 * Reports which known technology markers appear in a page's markup.
 *
 * The HTML is lower-cased once and every entry of TECH_STACK_SIGNALS is
 * looked up as a plain substring. Each matching marker is returned once,
 * in the order the markers are listed.
 *
 * @param html Raw page source.
 * @returns The matching markers, without duplicates.
 */
function detectTechStack(html: string): string[] {
  const haystack = html.toLowerCase();
  const matches = new Set<string>();

  for (const signal of TECH_STACK_SIGNALS) {
    // NOTE(review): markers are compared against lower-cased HTML, so they
    // presumably need to be lower-case themselves — confirm in the constant.
    if (haystack.includes(signal)) {
      matches.add(signal);
    }
  }

  return Array.from(matches);
}
|
| 191 |
+
|
| 192 |
+
/**
 * Builds a short company description from free-form page text.
 *
 * Newlines are flattened to spaces, the text is split after sentence-ending
 * punctuation (. ! ?), and the first three sentences whose length is strictly
 * between 30 and 300 characters are joined with single spaces.
 *
 * @param text Visible text of the page.
 * @returns Up to three sentences, or an empty string when none qualify.
 */
function extractDescription(text: string): string {
  const flattened = text.replace(/\n+/g, " ");
  const picked: string[] = [];

  for (const sentence of flattened.split(/(?<=[.!?])\s+/)) {
    const isMeaningful = sentence.length > 30 && sentence.length < 300;
    if (!isMeaningful) continue;

    picked.push(sentence);
    if (picked.length === 3) break;
  }

  return picked.join(" ");
}
|
| 200 |
+
|
| 201 |
+
/**
 * Guesses job postings from the visible text of a careers page.
 *
 * Only the first 30 lines with more than five non-blank characters are
 * examined. A line is treated as a job title when it has between 2 and 8
 * whitespace-separated words. At most 15 postings are returned, each tagged
 * with whether the line contains any AI_KEYWORDS entry.
 *
 * @param text      Visible text of the page.
 * @param sourceUrl URL the text came from; recorded on every posting.
 * @returns The detected postings in page order.
 */
function extractJobPostings(text: string, sourceUrl: string): JobPosting[] {
  const looksLikeTitle = (raw: string): boolean => {
    // Heuristic: a job title is a short line — 2 to 8 words.
    const words = raw.trim().split(/\s+/).length;
    return words >= 2 && words <= 8;
  };

  return text
    .split("\n")
    .filter((raw) => raw.trim().length > 5)
    .slice(0, 30)
    .filter(looksLikeTitle)
    .slice(0, 15)
    .map((raw) => {
      const lowered = raw.toLowerCase();
      return {
        title: raw.trim(),
        url: sourceUrl,
        hasAiSignal: AI_KEYWORDS.some((kw) => lowered.includes(kw)),
      };
    });
}
|
| 218 |
+
|
| 219 |
+
/**
 * Tells whether a scraped company shows any sign of AI activity.
 *
 * A company qualifies when at least one of its job postings is flagged
 * `hasAiSignal`, or when its website text contains any AI_KEYWORDS entry
 * (case-insensitive on the text side).
 *
 * @param company Scrape result to inspect.
 * @returns True when either signal is present.
 */
export function hasAiSignals(company: ScrapedCompany): boolean {
  const postsAiJob = company.jobPostings.some((posting) => posting.hasAiSignal);

  // Evaluated unconditionally, like the job check above.
  const siteMentionsAi = AI_KEYWORDS.some((kw) => {
    const siteText = company.websiteText.toLowerCase();
    return siteText.includes(kw);
  });

  if (postsAiJob) return true;
  return siteMentionsAi;
}
|
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import axios from "axios";
|
| 2 |
+
import { getEnv } from "../../shared/config/env";
|
| 3 |
+
import { withRetry, isCircuitOpen, recordFailure, recordSuccess } from "../../shared/utils/retry";
|
| 4 |
+
import { hunterLimiter } from "../../shared/utils/rate-limiter";
|
| 5 |
+
import { logger } from "../../shared/utils/logger";
|
| 6 |
+
|
| 7 |
+
/** Key passed to the circuit breaker, retry helper and rate limiter for every Hunter call in this module. */
const PROVIDER = "hunter";

/** One email candidate as returned by the Hunter helpers in this module. */
export interface HunterEmailResult {
  /** The address itself, or null when none is known. */
  email: string | null;
  /** Hunter confidence, 0-100. */
  score: number;
  /** Always "hunter"; identifies where the candidate came from. */
  source: "hunter";
  /** Given name reported by Hunter, if any. */
  firstName: string | null;
  /** Family name reported by Hunter, if any. */
  lastName: string | null;
}
|
| 16 |
+
|
| 17 |
+
/**
 * Looks up one person's work email through Hunter.io.
 *
 * Returns null without calling Hunter when the provider's circuit is open.
 * Otherwise a rate-limiter token is taken and the lookup runs through the
 * shared retry helper. Any failure is recorded against the circuit breaker,
 * logged, and reported as null so the caller can fall back to pattern
 * generation.
 *
 * @param domain    Company domain to search.
 * @param firstName Person's given name.
 * @param lastName  Person's family name.
 * @returns The match, or null when nothing was found or the call failed.
 */
export async function findEmail(
  domain: string,
  firstName: string,
  lastName: string
): Promise<HunterEmailResult | null> {
  if (isCircuitOpen(PROVIDER)) {
    return null;
  }

  await hunterLimiter.consume(PROVIDER);

  try {
    const lookup = () => callHunterEmailFinder(domain, firstName, lastName);
    const match = await withRetry(lookup, { provider: PROVIDER });
    recordSuccess(PROVIDER);
    return match;
  } catch (err) {
    recordFailure(PROVIDER);
    logger.warn({ domain, err }, "Hunter email find failed — will try pattern generation");
    return null;
  }
}
|
| 43 |
+
|
| 44 |
+
/**
 * Lists the email addresses Hunter.io knows for a domain.
 *
 * Returns an empty list without calling Hunter when the provider's circuit
 * is open. Otherwise a rate-limiter token is taken and the search runs
 * through the shared retry helper. Failures are recorded, logged and
 * reported as an empty list.
 *
 * @param domain Company domain to search.
 * @returns The addresses found; empty on failure.
 */
export async function searchDomain(domain: string): Promise<HunterEmailResult[]> {
  if (isCircuitOpen(PROVIDER)) {
    return [];
  }

  await hunterLimiter.consume(PROVIDER);

  try {
    const search = () => callHunterDomainSearch(domain);
    const addresses = await withRetry(search, { provider: PROVIDER });
    recordSuccess(PROVIDER);
    return addresses;
  } catch (err) {
    recordFailure(PROVIDER);
    logger.warn({ domain, err }, "Hunter domain search failed");
    return [];
  }
}
|
| 65 |
+
|
| 66 |
+
/**
 * Performs the raw HTTP request to Hunter's email-finder endpoint.
 *
 * The API key is read from the environment config and sent as a query
 * parameter; the request times out after 8 seconds. Errors from axios are
 * not caught here.
 *
 * @returns The mapped result, or null when the response carries no email.
 */
async function callHunterEmailFinder(
  domain: string,
  firstName: string,
  lastName: string
): Promise<HunterEmailResult | null> {
  const { HUNTER_API_KEY } = getEnv();

  const query = {
    domain,
    first_name: firstName,
    last_name: lastName,
    api_key: HUNTER_API_KEY,
  };
  const response = await axios.get("https://api.hunter.io/v2/email-finder", {
    params: query,
    timeout: 8_000,
  });

  const hit = response.data?.data;
  if (!hit?.email) {
    return null;
  }

  return {
    email: hit.email,
    score: hit.score ?? 0,
    source: "hunter",
    firstName: hit.first_name ?? null,
    lastName: hit.last_name ?? null,
  };
}
|
| 93 |
+
|
| 94 |
+
/**
 * Performs the raw HTTP request to Hunter's domain-search endpoint and keeps
 * only addresses that belong to a named person.
 *
 * Hunter labels each address `type: "personal"` (an individual's mailbox) or
 * `type: "generic"` (role mailboxes such as info@ or sales@). Filtering on
 * any other label matches nothing, so the filter uses "personal".
 *
 * The API key is read from the environment config and sent as a query
 * parameter; at most 10 addresses are requested and the request times out
 * after 8 seconds. Errors from axios are not caught here.
 *
 * @param domain Company domain to search.
 * @returns The personal addresses found, mapped to HunterEmailResult.
 */
async function callHunterDomainSearch(domain: string): Promise<HunterEmailResult[]> {
  const env = getEnv();
  const response = await axios.get("https://api.hunter.io/v2/domain-search", {
    params: { domain, api_key: env.HUNTER_API_KEY, limit: 10 },
    timeout: 8_000,
  });

  type HunterEmailRow = {
    value: string;
    type?: string;
    confidence?: number;
    first_name?: string | null;
    last_name?: string | null;
  };

  const emails: HunterEmailRow[] = response.data?.data?.emails ?? [];
  return emails
    .filter((e) => e.type === "personal")
    .map((e) => ({
      email: e.value,
      // Missing confidence is treated as 0, like the email-finder mapping.
      score: e.confidence ?? 0,
      source: "hunter" as const,
      firstName: e.first_name ?? null,
      lastName: e.last_name ?? null,
    }));
}
|
| 112 |
+
|
| 113 |
+
// ─── Aliases for contact-enricher.ts compatibility ──────────
|
| 114 |
+
|
| 115 |
+
/**
 * A contact row in Hunter's own field naming (snake_case), as consumed by
 * contact-enricher.
 */
export type HunterContact = {
  /** The email address. */
  value: string;
  /** Given name, when Hunter reports one. */
  first_name: string | null;
  /** Family name, when Hunter reports one. */
  last_name: string | null;
  /** Job title / position, when Hunter reports one. */
  position: string | null;
  /** Seniority label, when Hunter reports one. */
  seniority: string | null;
  /** Hunter's confidence value for the address. */
  confidence: number;
};
|
| 123 |
+
|
| 124 |
+
/**
 * Search for contacts at a domain — used by contact-enricher.
 * Maps Hunter's domain-search response to HunterContact format.
 *
 * Returns an empty list without calling Hunter when the provider's circuit
 * is open. The HTTP request goes through the shared retry helper, like the
 * other Hunter calls in this module, so a single transient error is retried
 * before it counts as a circuit-breaker failure. Failures are recorded,
 * logged and reported as an empty list.
 *
 * Unlike the HunterEmailResult helpers, no filtering by address type is
 * applied: every address in the response is returned.
 *
 * @param domain Company domain to search.
 * @returns Contacts in Hunter's field naming; empty on failure.
 */
export async function searchHunterContacts(domain: string): Promise<HunterContact[]> {
  if (isCircuitOpen(PROVIDER)) return [];

  await hunterLimiter.consume(PROVIDER);

  try {
    const env = getEnv();
    const response = await withRetry(
      () =>
        axios.get("https://api.hunter.io/v2/domain-search", {
          params: { domain, api_key: env.HUNTER_API_KEY, limit: 10 },
          timeout: 8_000,
        }),
      { provider: PROVIDER }
    );

    recordSuccess(PROVIDER);
    const emails = response.data?.data?.emails ?? [];
    return emails.map((e: Record<string, unknown>) => ({
      value: (e.value as string) ?? "",
      first_name: (e.first_name as string) ?? null,
      last_name: (e.last_name as string) ?? null,
      position: (e.position as string) ?? null,
      seniority: (e.seniority as string) ?? null,
      confidence: (e.confidence as number) ?? 0,
    }));
  } catch (err) {
    recordFailure(PROVIDER);
    logger.warn({ domain, err }, "Hunter domain search failed");
    return [];
  }
}
|
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import axios from "axios";
|
| 2 |
+
import dns from "dns/promises";
|
| 3 |
+
import { getEnv } from "../../shared/config/env";
|
| 4 |
+
import { withRetry, isCircuitOpen, recordFailure, recordSuccess } from "../../shared/utils/retry";
|
| 5 |
+
import { reoonLimiter } from "../../shared/utils/rate-limiter";
|
| 6 |
+
import { logger } from "../../shared/utils/logger";
|
| 7 |
+
|
| 8 |
+
/** Key passed to the circuit breaker, retry helper and rate limiter for every Reoon call in this module. */
const PROVIDER = "reoon";

/** Overall verdict for an address. */
export type VerifyResult = "valid" | "invalid" | "catch_all" | "unknown";

/** Outcome of verifying a single email address. */
export interface EmailVerification {
  /** The address that was checked. */
  email: string;
  /** Overall verdict. */
  result: VerifyResult;
  /** Whether mail to the address is expected to be delivered. */
  isDeliverable: boolean;
  /** Whether the domain is treated as accepting mail for any local part. */
  isCatchAll: boolean;
  /** Whether an MX record was found for the domain. */
  mxFound: boolean;
}
|
| 19 |
+
|
| 20 |
+
/**
 * Checks whether an email address is deliverable.
 *
 * Order of attempts:
 *   1. Reoon API, unless the provider's circuit is open.
 *   2. Local MX lookup, when Reoon was skipped or failed.
 *
 * An address without a domain part is reported as invalid without any
 * network call. A Reoon failure is recorded against the circuit breaker and
 * logged before falling back.
 *
 * @param email Address to verify.
 * @returns The verification outcome.
 */
export async function verifyEmail(email: string): Promise<EmailVerification> {
  const [, domain] = email.split("@");
  if (!domain) {
    return makeResult(email, "invalid", false, false, false);
  }

  const reoonAvailable = !isCircuitOpen(PROVIDER);
  if (reoonAvailable) {
    await reoonLimiter.consume(PROVIDER);
    try {
      const verdict = await withRetry(() => callReoon(email), { provider: PROVIDER });
      recordSuccess(PROVIDER);
      return verdict;
    } catch (err) {
      recordFailure(PROVIDER);
      logger.warn({ email, err }, "Reoon verify failed — falling back to MX check");
    }
  }

  // Reoon unavailable or failed: settle for a DNS-level answer.
  return mxFallback(email, domain);
}
|
| 44 |
+
|
| 45 |
+
/**
 * Performs the raw HTTP request to Reoon's verify endpoint in "quick" mode.
 *
 * The response `status` is mapped to a VerifyResult: "valid" and "catch_all"
 * pass through, every other status becomes "invalid". The boolean flags come
 * from the response when present; otherwise deliverability defaults to
 * "status was valid", catch-all to false and MX presence to true.
 *
 * The request times out after 10 seconds. Errors from axios are not caught
 * here.
 *
 * @param email Address to verify.
 * @returns The mapped verification outcome.
 */
async function callReoon(email: string): Promise<EmailVerification> {
  const { REOON_API_KEY } = getEnv();
  const { data } = await axios.get("https://emailverifier.reoon.com/api/v1/verify", {
    params: { email, key: REOON_API_KEY, mode: "quick" },
    timeout: 10_000,
  });

  let verdict: VerifyResult;
  switch (data.status) {
    case "valid":
      verdict = "valid";
      break;
    case "catch_all":
      verdict = "catch_all";
      break;
    default:
      // NOTE(review): any other status, including an "unknown" one if Reoon
      // ever sends it, is reported as invalid — confirm that is intended.
      verdict = "invalid";
  }

  const deliverable = data.is_deliverable ?? verdict === "valid";
  const catchAll = data.is_catch_all ?? false;
  const hasMx = data.has_mx_record ?? true;

  return makeResult(email, verdict, deliverable, catchAll, hasMx);
}
|
| 68 |
+
|
| 69 |
+
/**
 * DNS-only fallback used when the Reoon API cannot be consulted.
 *
 * Outcomes:
 *   - usable MX records exist → "catch_all" with all flags true. DNS alone
 *     cannot tell whether the specific mailbox exists, so the address is
 *     treated as possibly deliverable.
 *   - the domain does not exist or has no MX data (ENOTFOUND / ENODATA), or
 *     only lists an empty exchange → "invalid". Node's resolver rejects
 *     rather than returning an empty list in the no-record case, so these
 *     codes must be handled explicitly to ever produce "invalid".
 *   - any other resolver error (timeout, server failure, …) → "unknown",
 *     because nothing was learned about the domain.
 *
 * @param email  Address being verified (echoed into the result).
 * @param domain Domain part of the address.
 * @returns The verification outcome derived from DNS.
 */
async function mxFallback(email: string, domain: string): Promise<EmailVerification> {
  try {
    const records = await dns.resolveMx(domain);
    // An MX whose exchange is empty or "." names no mail host.
    const usable = records.filter((record) => record.exchange !== "" && record.exchange !== ".");
    const mxFound = usable.length > 0;
    return makeResult(email, mxFound ? "catch_all" : "invalid", mxFound, mxFound, mxFound);
  } catch (err) {
    const code = (err as { code?: string } | null)?.code;
    const definitelyNoMail = code === "ENOTFOUND" || code === "ENODATA";
    return makeResult(email, definitelyNoMail ? "invalid" : "unknown", false, false, false);
  }
}
|
| 78 |
+
|
| 79 |
+
/**
 * Assembles an EmailVerification record from its individual fields.
 *
 * @param email         Address that was checked.
 * @param result        Overall verdict.
 * @param isDeliverable Deliverability flag.
 * @param isCatchAll    Catch-all flag.
 * @param mxFound       MX-record flag.
 * @returns A new object holding exactly these five values.
 */
function makeResult(
  email: string,
  result: VerifyResult,
  isDeliverable: boolean,
  isCatchAll: boolean,
  mxFound: boolean
): EmailVerification {
  const verification: EmailVerification = {
    email: email,
    result: result,
    isDeliverable: isDeliverable,
    isCatchAll: isCatchAll,
    mxFound: mxFound,
  };
  return verification;
}
|
| 88 |
+
|
| 89 |
+
/**
 * Generates email pattern candidates for a name + domain.
 * Returns ordered list from most to least common pattern.
 *
 * Name parts are lower-cased and reduced to a-z. Accented Latin letters are
 * transliterated first (José → jose) instead of being dropped. Patterns that
 * need a missing part are skipped, so an empty or non-Latin first name never
 * yields candidates such as "undefinedsmith@…" or ".smith@…". Duplicate
 * candidates (possible with one-letter names) are removed, keeping the first
 * occurrence.
 *
 * @param firstName Person's given name; may be empty.
 * @param lastName  Person's family name; may be empty.
 * @param domain    Mail domain, used as given.
 * @returns Candidate addresses; empty when the domain or both name parts are
 *          unusable.
 */
export function generateEmailPatterns(
  firstName: string,
  lastName: string,
  domain: string
): string[] {
  // Strip diacritics via NFD decomposition, then keep only a-z.
  const toLocalPart = (raw: string): string =>
    (raw ?? "")
      .normalize("NFD")
      .replace(/[\u0300-\u036f]/g, "")
      .toLowerCase()
      .replace(/[^a-z]/g, "");

  const f = toLocalPart(firstName);
  const l = toLocalPart(lastName);
  if (!domain || (!f && !l)) return [];

  const localParts: string[] = [];
  if (f && l) {
    localParts.push(`${f}.${l}`, `${f}${l}`, `${f[0]}${l}`);
  }
  if (f) {
    localParts.push(f);
  }
  if (f && l) {
    localParts.push(`${f[0]}.${l}`, `${l}.${f}`);
  }
  if (!f && l) {
    // Only a usable last name: the bare surname is the sole sensible guess.
    localParts.push(l);
  }

  return Array.from(new Set(localParts)).map((local) => `${local}@${domain}`);
}
|
|
@@ -0,0 +1,108 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import axios from "axios";
|
| 2 |
+
import { getEnv } from "../../shared/config/env";
|
| 3 |
+
import { withRetry, isCircuitOpen, recordFailure, recordSuccess } from "../../shared/utils/retry";
|
| 4 |
+
import { serperLimiter } from "../../shared/utils/rate-limiter";
|
| 5 |
+
import { logger } from "../../shared/utils/logger";
|
| 6 |
+
|
| 7 |
+
/** Key passed to the circuit breaker, retry helper and rate limiter for every Serper call in this module. */
const PROVIDER = "serper";

/** One organic search hit returned by Serper, plus the domain derived from its link. */
export interface SerperResult {
  /** Result title. */
  title: string;
  /** Result URL. */
  link: string;
  /** Text snippet shown for the result. */
  snippet: string;
  /** Hostname of `link` with a leading "www." removed. */
  domain: string;
}
|
| 15 |
+
|
| 16 |
+
/**
 * Finds candidate companies for a region and industry via Serper.dev.
 *
 * Several targeted queries are built and sent one after another through the
 * shared retry helper. A failing query is recorded against the circuit
 * breaker and logged, and the remaining queries still run. The combined hits
 * are de-duplicated by domain, keeping the first hit seen for each.
 *
 * Returns an empty list without calling Serper when the provider's circuit
 * is open.
 *
 * NOTE(review): the rate limiter is consumed once per call although one
 * request is issued per query — confirm this accounting is intended.
 *
 * @param region   Region code or free-form region name.
 * @param industry Industry to search for.
 * @param keywords Extra keywords used by one of the queries.
 * @param page     Result page to request (defaults to 1).
 * @returns Unique-by-domain search hits in discovery order.
 */
export async function searchCompanies(
  region: string,
  industry: string,
  keywords: string[],
  page = 1
): Promise<SerperResult[]> {
  if (isCircuitOpen(PROVIDER)) {
    logger.warn({ provider: PROVIDER }, "Circuit open — skipping Serper call");
    return [];
  }

  await serperLimiter.consume(PROVIDER);

  const collected: SerperResult[] = [];
  for (const query of buildQueries(region, industry, keywords)) {
    try {
      const hits = await withRetry(() => callSerper(query, page), { provider: PROVIDER });
      collected.push(...hits);
      recordSuccess(PROVIDER);
    } catch (err) {
      recordFailure(PROVIDER);
      logger.error({ query, err }, "Serper search failed");
    }
  }

  // First hit per domain wins; Map keeps insertion order.
  const firstByDomain = new Map<string, SerperResult>();
  for (const hit of collected) {
    if (!firstByDomain.has(hit.domain)) {
      firstByDomain.set(hit.domain, hit);
    }
  }
  return Array.from(firstByDomain.values());
}
|
| 58 |
+
|
| 59 |
+
/**
 * Performs the raw HTTP request to Serper's search endpoint.
 *
 * Requests 10 results for the given page, authenticating with the API key
 * from the environment config in the X-API-KEY header. The request times out
 * after 10 seconds. Errors from axios are not caught here.
 *
 * @param query Search query string.
 * @param page  Result page to request.
 * @returns The organic hits, each with its derived domain; empty when the
 *          response has no organic section.
 */
async function callSerper(query: string, page: number): Promise<SerperResult[]> {
  const { SERPER_API_KEY } = getEnv();

  const requestBody = { q: query, num: 10, page };
  const requestOptions = {
    headers: {
      "X-API-KEY": SERPER_API_KEY,
      "Content-Type": "application/json",
    },
    timeout: 10_000,
  };
  const response = await axios.post("https://google.serper.dev/search", requestBody, requestOptions);

  const organicHits: Array<{ title: string; link: string; snippet: string }> =
    response.data?.organic ?? [];

  return organicHits.map(({ title, link, snippet }) => ({
    title,
    link,
    snippet,
    domain: extractDomain(link),
  }));
}
|
| 81 |
+
|
| 82 |
+
/**
 * Builds the precision search queries for one region/industry pair.
 *
 * The region is translated through REGION_LABELS when a label exists and is
 * used verbatim otherwise. Three fixed query shapes are always produced. A
 * fourth, keyword-driven query is added only when at least one of the first
 * two keywords is non-blank, so an empty keyword list can no longer produce
 * a query that starts with the literal text "undefined".
 *
 * @param region   Region code or free-form region name.
 * @param industry Industry to search for.
 * @param keywords Extra keywords; only the first two are used.
 * @returns Three or four query strings.
 */
function buildQueries(region: string, industry: string, keywords: string[]): string[] {
  // Precision queries — each targets a specific pain+industry+region combo
  const regionLabel = REGION_LABELS[region] ?? region;

  const queries = [
    `"${industry}" company "${regionLabel}" "50 employees" OR "100 employees" OR "200 employees" automation`,
    `${industry} business ${regionLabel} site:linkedin.com/company`,
    `"${industry}" "${regionLabel}" "digital transformation" OR "AI" OR "automation" company`,
  ];

  const terms = (keywords ?? [])
    .slice(0, 2)
    .map((keyword) => (keyword ?? "").trim())
    .filter((keyword) => keyword.length > 0);
  if (terms.length > 0) {
    queries.push(`${terms.join(" ")} company ${regionLabel} -job -careers`);
  }

  return queries;
}
|
| 92 |
+
|
| 93 |
+
/**
 * Reduces a URL to its hostname, without a leading "www.".
 *
 * @param url Absolute URL to parse.
 * @returns The hostname, or the input unchanged when it cannot be parsed as
 *          a URL.
 */
function extractDomain(url: string): string {
  let host: string;
  try {
    host = new URL(url).hostname;
  } catch {
    return url;
  }
  return host.replace(/^www\./, "");
}
|
| 100 |
+
|
| 101 |
+
/**
 * Human-readable place names substituted for region codes in search queries.
 * Codes without an entry are used as-is by the query builder.
 */
const REGION_LABELS: Record<string, string> = {
  US: "United States",
  UK: "United Kingdom",
  AU: "Australia",
  // Note: the UAE code maps to a city name rather than the country name.
  UAE: "Dubai",
  SA: "Saudi Arabia",
  SG: "Singapore",
};
|
|
@@ -0,0 +1,517 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Trigger.dev Task Definitions — Phase 1 Pipeline
|
| 3 |
+
*
|
| 4 |
+
* 5 chained tasks instead of 1 monolithic function:
|
| 5 |
+
*
|
| 6 |
+
* Task 1: daily-scheduler → CRON 4 AM UTC → picks territory → triggers process-company
|
| 7 |
+
* Task 2: process-company → scrape + pain detect + gate 2 → triggers enrich-contacts
|
| 8 |
+
* Task 3: enrich-contacts → emails + classify + verify + LinkedIn + social → triggers ai-profile
|
| 9 |
+
* Task 4: ai-profile-score → Python service → save → triggers hot-alert if needed
|
| 10 |
+
* Task 5: daily-digest → CRON 6:30 AM UTC → collects today's results → Slack digest
|
| 11 |
+
*
|
| 12 |
+
* Benefits:
|
| 13 |
+
* - Company #3 fails → only #3 retries, rest continue
|
| 14 |
+
* - 3 companies process in parallel (concurrency limit)
|
| 15 |
+
* - Each task has its own retry policy
|
| 16 |
+
* - Dashboard shows exact failure point
|
| 17 |
+
*/
|
| 18 |
+
|
| 19 |
+
import { task, schedules, queue } from "@trigger.dev/sdk/v3";
|
| 20 |
+
import { getSupabaseClient } from "../../shared/supabase/client";
|
| 21 |
+
import { startTrace, recordOperation, endTrace } from "../../shared/observability/tracer";
|
| 22 |
+
import { saveCheckpoint, isAlreadyProcessed } from "../../shared/pipeline/checkpoint";
|
| 23 |
+
import { getNextTerritory, getDailyQuota, markTerritorySearched, isSystemPaused, buildTerritoryQueries } from "../lib/territory-manager";
|
| 24 |
+
import { scrapeCompanyWebsite } from "../lib/web-scraper";
|
| 25 |
+
import { detectPainSignals } from "../lib/pain-signal-detector";
|
| 26 |
+
import { enrichContacts } from "../lib/contact-enricher";
|
| 27 |
+
import { sendRunStarted, sendRunProgress, sendDailyDigest, sendHotLeadAlert } from "../../slack/slack-service";
|
| 28 |
+
import { logger } from "../../shared/utils/logger";
|
| 29 |
+
import { randomUUID } from "crypto";
|
| 30 |
+
import axios from "axios";
|
| 31 |
+
import { getEnv } from "../../shared/config/env";
|
| 32 |
+
|
| 33 |
+
// ─── Queue: max 3 companies processing simultaneously ────────
|
| 34 |
+
/** Shared queue for per-company processing; its concurrency limit is 3. */
const companyQueue = queue({
  concurrencyLimit: 3,
  name: "company-processing",
});
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
// ═══════════════════════════════════════════════════════════════
|
| 41 |
+
// TASK 1: Daily Scheduler (CRON — runs every day at 4 AM UTC)
|
| 42 |
+
// ═══════════════════════════════════════════════════════════════
|
| 43 |
+
|
| 44 |
+
/** Hosts that are social networks, directories or review sites rather than a company's own website. */
const NON_COMPANY_HOSTS = [
  "facebook.com", "linkedin.com", "twitter.com", "instagram.com",
  "youtube.com", "yelp.com", "yellowpages.com", "bbb.org", "wikipedia.org",
  "reddit.com", "crunchbase.com", "glassdoor.com",
];

/**
 * True when `hostname` is one of NON_COMPANY_HOSTS or a subdomain of one.
 * Matching is on whole domain labels, so an unrelated host that merely
 * contains one of the names (e.g. "notreddit.com") is not excluded.
 */
function isNonCompanyHost(hostname: string): boolean {
  return NON_COMPANY_HOSTS.some(
    (blocked) => hostname === blocked || hostname.endsWith(`.${blocked}`)
  );
}

/**
 * Task 1 — daily scheduler.
 *
 * Steps:
 *   1. Stop early when the system is paused or no territory is available.
 *   2. Create a `discovery_runs` row and announce the run on Slack.
 *   3. Run every territory query against Serper and collect unique company
 *      hostnames, excluding social networks / directories.
 *   4. Trigger `process-company` for each hostname not processed in the last
 *      30 days (second argument of isAlreadyProcessed).
 *   5. Mark the territory searched, mark the run completed, end the trace.
 *
 * `succeeded` counts trigger calls that resolved, i.e. runs that were
 * accepted for execution; this task does not wait for those runs to finish.
 *
 * Supabase write errors are logged but do not abort the run.
 *
 * NOTE(review): the run record, the Slack message and every triggered
 * company use the first territory's metadata even when several territories
 * were searched — confirm this is intended.
 *
 * @returns A status object; for completed runs it also carries
 *          `domainsFound`, `tasksTriggered` and `succeeded`.
 */
export const dailyScheduler = schedules.task({
  id: "daily-lead-discovery",
  // Cron configured in Trigger.dev dashboard: 0 4 * * * (4 AM UTC = 9 AM PKT)
  maxDuration: 300, // 5 minutes for setup
  run: async () => {
    // Pre-flight
    if (await isSystemPaused()) {
      logger.info("⏸️ System paused — skipping today");
      return { status: "paused" };
    }

    const quota = await getDailyQuota();
    const territories = await getNextTerritory(quota);

    if (territories.length === 0) {
      logger.warn("No fresh territory — all cooling down");
      return { status: "no_territory" };
    }

    // Create run record
    const db = getSupabaseClient();
    const runId = randomUUID();
    const traceId = startTrace(runId);
    const unit = territories[0];

    // Supabase reports failures through `error` instead of throwing.
    const { error: insertError } = await db.from("discovery_runs").insert({
      id: runId,
      run_type: "auto",
      territory_id: unit.territoryId,
      country_code: unit.countryCode,
      city: unit.city,
      industry: unit.industry,
      quota_target: quota,
      status: "running",
      triggered_by: "system",
    });
    if (insertError) {
      logger.warn({ runId, err: insertError }, "Could not create discovery run record — continuing");
    }

    // Slack: run started
    await sendRunStarted(`${unit.city}, ${unit.country}`, unit.industry, quota);

    // Search Google for companies
    const env = getEnv();
    const uniqueDomains = new Set<string>();

    for (const territory of territories) {
      const queries = buildTerritoryQueries(territory, []);

      for (const query of queries) {
        try {
          const response = await axios.post(
            "https://google.serper.dev/search",
            { q: query, num: 10 },
            {
              headers: { "X-API-KEY": env.SERPER_API_KEY, "Content-Type": "application/json" },
              timeout: 8_000,
            }
          );

          const organic = response.data?.organic ?? [];
          for (const result of organic) {
            try {
              const hostname = new URL(result.link).hostname.replace(/^www\./, "");
              if (!isNonCompanyHost(hostname)) {
                uniqueDomains.add(hostname);
              }
            } catch { /* invalid URL */ }
          }
        } catch (err) {
          // One failed query must not abort the remaining searches.
          logger.warn({ query, err }, "Serper search failed — continuing");
        }
      }
    }

    // Set keeps first-seen order, matching the order results were returned in.
    const allDomains = Array.from(uniqueDomains);

    logger.info({ domains: allDomains.length, territory: unit.city }, "Domains found — triggering company tasks");

    // Trigger Task 2 for each domain (queued, max 3 concurrent)
    const companyTasks: Promise<unknown>[] = [];
    for (const domain of allDomains) {
      // Skip already processed
      if (await isAlreadyProcessed(domain, 30)) continue;

      companyTasks.push(
        processCompany.trigger({
          domain,
          runId,
          traceId,
          industry: unit.industry,
          city: unit.city,
          country: unit.country,
          countryCode: unit.countryCode,
          territoryId: unit.territoryId,
          quota,
          linkedInUrl: null,
        })
      );
    }

    // Wait until every trigger call has settled (not until the runs finish).
    const results = await Promise.allSettled(companyTasks);
    const succeeded = results.filter(r => r.status === "fulfilled").length;
    if (succeeded < results.length) {
      logger.warn({ runId, failed: results.length - succeeded }, "Some company tasks could not be triggered");
    }

    // Mark territory searched
    await markTerritorySearched(unit.territoryId, unit.industry, succeeded);

    // Update run
    const { error: updateError } = await db.from("discovery_runs").update({
      status: "completed",
      companies_found: allDomains.length,
      completed_at: new Date().toISOString(),
      search_queries: buildTerritoryQueries(unit, []),
    }).eq("id", runId);
    if (updateError) {
      logger.warn({ runId, err: updateError }, "Could not update discovery run record");
    }

    await endTrace(traceId);

    return {
      status: "completed",
      domainsFound: allDomains.length,
      tasksTriggered: companyTasks.length,
      succeeded,
    };
  },
});
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
// ═══════════════════════════════════════════════════════════════
|
| 172 |
+
// TASK 2: Process Company (per company, queued)
|
| 173 |
+
// ═══════════════════════════════════════════════════════════════
|
| 174 |
+
|
| 175 |
+
/**
 * Task 2 — process one company (runs on the shared company queue).
 *
 * Stages:
 *   1. Scrape the company's website; skip the company when no text came back.
 *   2. Detect pain signals and apply Gate 2: the company continues only with
 *      at least two pain signals or a service match.
 *   3. Trigger the enrich-and-profile task with a trimmed copy of the scraped
 *      data. The trigger call is awaited, but this task does not wait for
 *      that run's result.
 *
 * A checkpoint is saved after each stage and whenever the company is
 * skipped.
 *
 * The LinkedIn URL passed on is the one found on the website, falling back
 * to the URL supplied in the payload when the scrape found none.
 *
 * Retries: up to 2 attempts with a 5–30 s backoff (factor 2); each attempt
 * may run for at most 120 s.
 *
 * @returns `{ status: "skipped", reason }` or
 *          `{ status: "passed_to_enrichment", domain }`.
 */
export const processCompany = task({
  id: "process-company",
  queue: companyQueue,
  retry: {
    maxAttempts: 2,
    minTimeoutInMs: 5_000,
    maxTimeoutInMs: 30_000,
    factor: 2,
  },
  maxDuration: 120, // 2 minutes per company
  run: async (payload: {
    domain: string;
    runId: string;
    traceId: string;
    industry: string;
    city: string;
    country: string;
    countryCode: string;
    territoryId: string;
    quota: number;
    linkedInUrl: string | null;
  }) => {
    const { domain, runId, traceId, industry, city, country } = payload;

    logger.info({ domain }, "Processing company");

    // ── Stage 1: Scrape website ────────────────────────────────
    const websiteData = await scrapeCompanyWebsite(domain);
    if (!websiteData?.text) {
      await saveCheckpoint(runId, domain, "completed", { reason: "no_website" });
      return { status: "skipped", reason: "no_website_data" };
    }

    await saveCheckpoint(runId, domain, "scraped");

    // ── Stage 2: Pain signal detection + Gate 2 ────────────────
    const painResult = await detectPainSignals(
      websiteData.name ?? domain,
      industry,
      websiteData.employeeCount ?? null,
      websiteData.text ?? "",
      websiteData.html ?? "",
      traceId
    );

    // Gate 2: minimum 2 pain signals OR service match
    if (painResult.painSignals.length < 2 && !painResult.serviceMatch) {
      await saveCheckpoint(runId, domain, "completed", { reason: "gate2_failed" });
      return { status: "skipped", reason: "gate2_failed" };
    }

    await saveCheckpoint(runId, domain, "filtered");

    // ── Trigger Task 3: Enrich contacts ────────────────────────
    await enrichAndProfile.trigger({
      domain,
      runId,
      traceId,
      industry,
      city,
      country,
      companyName: websiteData.name ?? domain,
      employeeCount: websiteData.employeeCount ?? null,
      description: websiteData.description ?? "",
      // Text and HTML are truncated to keep the task payload small.
      websiteText: (websiteData.text ?? "").slice(0, 800),
      websiteHtml: (websiteData.html ?? "").slice(0, 5000),
      techStack: websiteData.techStack ?? [],
      aiJobCount: websiteData.aiJobCount ?? 0,
      // Prefer the link scraped from the site; otherwise keep the caller's.
      linkedInUrl: websiteData.linkedinUrl ?? payload.linkedInUrl ?? null,
      painSignals: painResult.painSignals.map(p => p.signal),
      serviceMatch: painResult.serviceMatch,
      matchConfidence: painResult.matchConfidence,
    });

    return { status: "passed_to_enrichment", domain };
  },
});
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
// ═══════════════════════════════════════════════════════════════
|
| 255 |
+
// TASK 3: Enrich Contacts + AI Profile + Score (combined)
|
| 256 |
+
// ═══════════════════════════════════════════════════════════════
|
| 257 |
+
|
| 258 |
+
/**
 * Task 3 — enrich contacts, persist the company, profile/score it through the
 * Python AI service, and raise a Slack alert for hot leads.
 *
 * Flow:
 *   1. `enrichContacts` → stop early when there are no contacts or none with
 *      `authorityConfirmed`.
 *   2. Upsert the company row (keyed on `domain`) and attach contacts to it.
 *   3. POST to `${PYTHON_AI_SERVICE_URL}/profile`, then store the returned
 *      profile and score.
 *   4. Increment run stats for scores >= 70; send a hot-lead alert for >= 85.
 *
 * Returns `{ status: "skipped", reason }` on an early exit, otherwise a
 * `{ status: "completed", ... }` summary.
 */
export const enrichAndProfile = task({
  id: "enrich-and-profile",
  retry: {
    maxAttempts: 2,
    minTimeoutInMs: 3_000,
    maxTimeoutInMs: 20_000,
    factor: 2,
  },
  maxDuration: 180, // 3 minutes (email verification can be slow)
  run: async (payload: {
    domain: string;
    runId: string;
    traceId: string;
    industry: string;
    city: string;
    country: string;
    companyName: string;
    employeeCount: number | null;
    description: string;
    websiteText: string;
    websiteHtml: string;
    techStack: string[];
    aiJobCount: number;
    linkedInUrl: string | null;
    painSignals: string[];
    serviceMatch: string | null;
    matchConfidence: number;
  }) => {
    const db = getSupabaseClient();
    const env = getEnv();

    // ── Step 1: Enrich contacts ──────────────────────────────
    // The company id is not known yet, so an empty string is passed and the
    // contacts are linked to the company in Step 2.
    const contacts = await enrichContacts(
      "",
      payload.domain,
      payload.companyName,
      payload.employeeCount,
      payload.industry,
      payload.websiteText.slice(0, 300),
      payload.websiteHtml,
      payload.linkedInUrl,
      payload.traceId
    );

    if (contacts.length === 0) {
      await saveCheckpoint(payload.runId, payload.domain, "completed", { reason: "no_contacts" });
      return { status: "skipped", reason: "no_contacts" };
    }

    // Must have authority-confirmed contact
    const authorityContacts = contacts.filter(c => c.authorityConfirmed);
    if (authorityContacts.length === 0) {
      await saveCheckpoint(payload.runId, payload.domain, "completed", { reason: "no_authority" });
      return { status: "skipped", reason: "no_authority_contacts" };
    }

    await saveCheckpoint(payload.runId, payload.domain, "emails_verified");

    // ── Step 2: Save company ─────────────────────────────────
    // Reuse the id of an existing row for this domain. The upsert resolves
    // conflicts on `domain`, so sending a brand-new UUID for a known domain
    // would attempt to rewrite that row's primary key.
    const { data: existingCompany, error: lookupError } = await db
      .from("companies")
      .select("id")
      .eq("domain", payload.domain)
      .maybeSingle();
    if (lookupError) {
      logger.error({ err: lookupError, domain: payload.domain }, "Company lookup failed");
    }
    const companyId = existingCompany?.id ?? randomUUID();

    const { error: companyError } = await db.from("companies").upsert({
      id: companyId,
      domain: payload.domain,
      name: payload.companyName,
      industry: payload.industry,
      employee_count: payload.employeeCount,
      description: payload.description,
      website_status: "active",
      linkedin_url: payload.linkedInUrl,
      tech_stack: payload.techStack,
      country: payload.country,
      city: payload.city,
      service_match: payload.serviceMatch,
      service_match_score: Math.round(payload.matchConfidence * 100),
      pain_signals: payload.painSignals,
      trace_id: payload.traceId,
    }, { onConflict: "domain" });
    if (companyError) {
      logger.error({ err: companyError, domain: payload.domain }, "Company upsert failed");
    }

    // Update contacts with company_id — one statement instead of one per contact.
    const contactIds = contacts.map(c => c.id).filter(Boolean);
    if (contactIds.length > 0) {
      const { error: linkError } = await db
        .from("contacts")
        .update({ company_id: companyId })
        .in("id", contactIds);
      if (linkError) {
        logger.error({ err: linkError, domain: payload.domain }, "Linking contacts to company failed");
      }
    }

    // ── Step 3: AI Profile + Score (Python service) ──────────
    const profileResponse = await axios.post(
      `${env.PYTHON_AI_SERVICE_URL}/profile`,
      {
        company: {
          id: companyId,
          name: payload.companyName,
          industry: payload.industry,
          employee_count: payload.employeeCount,
          description: payload.description,
          website_text: payload.websiteText,
          linkedin_description: "",
          tech_stack: payload.techStack,
          ai_job_count: payload.aiJobCount,
          pain_signals: payload.painSignals,
          service_match: payload.serviceMatch,
        },
        contacts: contacts.map(c => ({
          full_name: c.fullName,
          email: c.email,
          email_verified: c.emailVerification?.status === "verified_deliverable",
          linkedin_personal_url: c.linkedinPersonalUrl,
          social_profiles: c.socialProfiles ?? {},
        })),
        trace_id: payload.traceId,
      },
      {
        headers: { Authorization: `Bearer ${env.PYTHON_AI_SERVICE_SECRET}` },
        timeout: 45_000,
      }
    );

    const profile = profileResponse.data?.profile;
    const score = profileResponse.data?.score;
    const totalScore = score?.total_score ?? 0;
    const tier = score?.tier ?? "archive";

    // Save profile and score. `company_id` is written last so a stray
    // `company_id` key in the service response cannot replace it.
    const { error: profileError } = await db.from("lead_profiles").upsert({
      ...profile,
      company_id: companyId,
    }, { onConflict: "company_id" });
    if (profileError) {
      logger.error({ err: profileError, domain: payload.domain }, "Saving lead profile failed");
    }

    const { error: scoreError } = await db.from("lead_scores").upsert({
      ...score,
      company_id: companyId,
    }, { onConflict: "company_id" });
    if (scoreError) {
      logger.error({ err: scoreError, domain: payload.domain }, "Saving lead score failed");
    }

    // Update run stats
    if (totalScore >= 70) {
      const { error: statsError } = await db.rpc("increment_run_leads", { run_id: payload.runId });
      if (statsError) {
        logger.error({ err: statsError, runId: payload.runId }, "Incrementing run leads failed");
      }
    }

    await saveCheckpoint(payload.runId, payload.domain, "completed");

    // ── Step 4: Hot lead alert (85+) ─────────────────────────
    if (totalScore >= 85) {
      const best = authorityContacts[0];
      try {
        await sendHotLeadAlert({
          companyName: payload.companyName,
          domain: payload.domain,
          industry: payload.industry,
          employeeCount: payload.employeeCount,
          city: payload.city,
          score: totalScore,
          tier,
          contactName: best.fullName,
          contactTitle: best.title ?? "",
          email: best.email,
          emailVerified: best.emailVerification?.status === "verified_deliverable",
          linkedinPersonal: best.linkedinPersonalUrl,
          linkedinCompany: payload.linkedInUrl,
          serviceMatch: payload.serviceMatch,
          outreachAngle: profile?.outreach_angle ?? "",
          painPoints: (profile?.pain_points ?? []).slice(0, 3),
          socialProfiles: best.socialProfiles ?? {},
        });
      } catch (err) {
        // The lead is already stored and checkpointed as completed; a failed
        // notification must not fail the task and re-run enrichment on retry.
        logger.error({ err, domain: payload.domain }, "Hot lead alert failed");
      }
    }

    // The /profile response carries token usage under `meta`; `score` is kept
    // as a fallback in case the service reports it there.
    const tokensUsed = profileResponse.data?.meta?.tokens_used ?? score?.tokens_used ?? 0;
    recordOperation(payload.traceId, "enrich_and_profile", tokensUsed, 0, true);

    return {
      status: "completed",
      domain: payload.domain,
      score: totalScore,
      tier,
      contactsFound: contacts.length,
      authorityConfirmed: authorityContacts.length,
      hasLinkedIn: contacts.some(c => c.linkedinPersonalUrl),
    };
  },
});
|
| 433 |
+
|
| 434 |
+
|
| 435 |
+
// ═══════════════════════════════════════════════════════════════
|
| 436 |
+
// TASK 4: Daily Digest (CRON — runs at 6:30 AM UTC = 11:30 AM PKT)
|
| 437 |
+
// ═══════════════════════════════════════════════════════════════
|
| 438 |
+
|
| 439 |
+
/**
 * Task 4 — daily digest.
 *
 * Reads today's most recent discovery run, counts today's lead scores per
 * tier, sums today's LLM token usage, and posts the summary through
 * `sendDailyDigest`. Returns nothing when no run happened today, otherwise
 * `{ sent: true, leads }`.
 */
export const dailyDigestTask = schedules.task({
  id: "daily-digest",
  // Cron configured in Trigger.dev dashboard: 30 6 * * * (6:30 AM UTC)
  maxDuration: 60,
  run: async () => {
    const db = getSupabaseClient();
    // NOTE(review): setHours uses the runtime's local timezone while the cron
    // is documented in UTC — confirm which day boundary is intended.
    const today = new Date();
    today.setHours(0, 0, 0, 0);
    const sinceIso = today.toISOString();

    // Get today's run stats. Order newest-first so index 0 really is the
    // latest run; without an ORDER BY the row order is unspecified.
    const { data: runs, error: runsError } = await db
      .from("discovery_runs")
      .select("*")
      .gte("ran_at", sinceIso)
      .order("ran_at", { ascending: false })
      .limit(1);
    if (runsError) {
      logger.error({ err: runsError }, "Loading today's discovery runs failed");
    }

    const latestRun = runs?.[0];
    if (!latestRun) {
      logger.info("No runs today — skipping digest");
      return;
    }

    // Count today's leads by tier
    const { data: scores, error: scoresError } = await db
      .from("lead_scores")
      .select("total_score, tier")
      .gte("created_at", sinceIso);
    if (scoresError) {
      logger.error({ err: scoresError }, "Loading today's lead scores failed");
    }

    const hotLeads = scores?.filter(s => s.tier === "hot").length ?? 0;
    const warmLeads = scores?.filter(s => s.tier === "warm").length ?? 0;
    const nurtureLeads = scores?.filter(s => s.tier === "nurture").length ?? 0;

    // Get token usage
    const { data: traces, error: tracesError } = await db
      .from("llm_traces")
      .select("total_tokens")
      .gte("created_at", sinceIso);
    if (tracesError) {
      logger.error({ err: tracesError }, "Loading today's LLM traces failed");
    }

    const totalTokens = traces?.reduce((sum, t) => sum + (t.total_tokens ?? 0), 0) ?? 0;

    await sendDailyDigest({
      territory: `${latestRun.city}, ${latestRun.country_code}`,
      industry: latestRun.industry,
      companiesSearched: latestRun.companies_found ?? 0,
      leadsQualified: (scores?.length ?? 0),
      hotLeads,
      warmLeads,
      nurtureLeads,
      tokensUsed: totalTokens,
      // Duration is only known once the run has a completion timestamp.
      durationMinutes: latestRun.completed_at
        ? Math.round((new Date(latestRun.completed_at).getTime() - new Date(latestRun.ran_at).getTime()) / 60_000)
        : 0,
    });

    return { sent: true, leads: scores?.length ?? 0 };
  },
});
|
| 495 |
+
|
| 496 |
+
|
| 497 |
+
// ═══════════════════════════════════════════════════════════════
|
| 498 |
+
// TASK 5: Manual Discovery (triggered from Slack)
|
| 499 |
+
// ═══════════════════════════════════════════════════════════════
|
| 500 |
+
|
| 501 |
+
/**
 * Task 5 — manual discovery placeholder.
 *
 * Currently a stub: it logs the request and echoes the payload back with
 * `status: "manual_run_started"`. No search, scrape or enrichment is started.
 *
 * NOTE(review): the task id "manual-discovery" is also used by the task in
 * manual-discovery.ts — confirm only one of the two is registered.
 */
export const manualDiscoveryTask = task({
  id: "manual-discovery",
  maxDuration: 300,
  run: async (payload: {
    region: string;
    industry: string;
    maxCompanies: number;
    triggeredBy: string;
  }) => {
    // Intended to reuse the daily scheduler logic with a custom territory.
    logger.info({ payload }, "Manual discovery triggered from Slack");

    // TODO: Build custom territory from region param
    // Nothing is triggered yet; the payload is only acknowledged.
    return { status: "manual_run_started", ...payload };
  },
});
|
|
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { task } from "@trigger.dev/sdk/v3";
|
| 2 |
+
import { z } from "zod";
|
| 3 |
+
import { getSupabaseClient } from "../../shared/supabase/client";
|
| 4 |
+
import { logger } from "../../shared/utils/logger";
|
| 5 |
+
import { loadIcpConfig, applyHardFilters, applySignalFilters } from "../lib/icp-filter";
|
| 6 |
+
import { isDuplicate, isSuppressed } from "../lib/deduplicator";
|
| 7 |
+
import { scrapeCompanyWebsite } from "../lib/web-scraper";
|
| 8 |
+
import { scrapeLinkedInCompany } from "../lib/linkedin-scraper";
|
| 9 |
+
import { normalizeCompany } from "../lib/normalizer";
|
| 10 |
+
import { enrichContacts } from "../lib/contact-enricher";
|
| 11 |
+
import { searchCompanies } from "../providers/serper";
|
| 12 |
+
import { getRegionConfig } from "../lib/rotation";
|
| 13 |
+
|
| 14 |
+
// ─── Input schema ─────────────────────────────────────────────
|
| 15 |
+
|
| 16 |
+
// Validated payload for a manual discovery run.
const ManualDiscoveryInput = z.object({
  // Target region code; only these six values are accepted.
  region: z.enum(["US", "UK", "AU", "UAE", "SA", "SG"]),
  // Optional single industry; when omitted the task picks industries from the region config.
  industry: z.string().optional(),
  // Optional search keywords; when omitted or empty the ICP keywords are used.
  customKeywords: z.array(z.string()).optional(),
  // Search results kept per industry (1–50, default 20).
  maxCompanies: z.number().min(1).max(50).default(20),
  triggeredBy: z.string().default("manual"), // slack username or "api"
});

// Parsed (post-default) shape of the schema above.
export type ManualDiscoveryInput = z.infer<typeof ManualDiscoveryInput>;
|
| 25 |
+
|
| 26 |
+
// ─── Manual Discovery Task ────────────────────────────────────
|
| 27 |
+
|
| 28 |
+
/**
 * Manually triggered discovery run.
 *
 * Validates the payload, searches each selected industry in the requested
 * region, pushes every capped search result through `processManualCompany`,
 * writes one audit-log row, and returns the discovered/qualified totals.
 */
export const manualDiscoveryTask = task({
  id: "manual-discovery",
  maxDuration: 1800, // 30 min max

  run: async (payload: ManualDiscoveryInput) => {
    const input = ManualDiscoveryInput.parse(payload);
    logger.info({ input }, "🎯 Manual discovery started");

    const { region, industry: requestedIndustry, customKeywords, maxCompanies, triggeredBy } = input;

    const icp = await loadIcpConfig();
    const regionConfig = getRegionConfig(region);

    // Caller-supplied keywords win only when the list is non-empty.
    let keywords = icp.keywords;
    if (customKeywords?.length) {
      keywords = customKeywords;
    }

    // One explicit industry, or the region's first three for manual runs.
    let industries = regionConfig.industries.slice(0, 3);
    if (requestedIndustry) {
      industries = [requestedIndustry];
    }

    const totals = { discovered: 0, qualified: 0 };

    for (const currentIndustry of industries) {
      const hits = await searchCompanies(region, currentIndustry, keywords);

      for (const hit of hits.slice(0, maxCompanies)) {
        const outcome = await processManualCompany(hit, region, icp, currentIndustry);
        switch (outcome) {
          case "skip":
            break;
          case "qualified":
            totals.discovered += 1;
            totals.qualified += 1;
            break;
          default:
            totals.discovered += 1;
        }
      }
    }

    const totalDiscovered = totals.discovered;
    const totalQualified = totals.qualified;

    // ── Audit log ─────────────────────────────────────────────
    const auditEntry = {
      action: "manual_discovery_completed",
      entity_type: "discovery_run",
      entity_id: null,
      actor: triggeredBy,
      details: {
        region,
        industry: requestedIndustry ?? "all",
        totalDiscovered,
        totalQualified,
      },
    };
    await getSupabaseClient().from("audit_log").insert(auditEntry);

    logger.info({ totalDiscovered, totalQualified }, "✅ Manual discovery completed");
    return { region, totalDiscovered, totalQualified };
  },
});
|
| 80 |
+
|
| 81 |
+
// ─── Processing pipeline (same logic as auto, extracted) ─────
|
| 82 |
+
|
| 83 |
+
/**
 * Runs one search result through the manual discovery pipeline.
 *
 * @param result   Search hit (domain, title, link, snippet).
 * @param region   Region code the run was started for.
 * @param icp      Loaded ICP configuration used by both filter gates.
 * @param industry Industry label stored on the company row.
 * @returns "skip" when the domain is suppressed, a duplicate, has no scrape
 *          result, fails the hard filters, or cannot be saved;
 *          "new" when the company was saved but did not pass the signal
 *          filters or produced no contacts;
 *          "qualified" when contacts were saved and profiling was triggered.
 */
async function processManualCompany(
  result: { domain: string; title: string; link: string; snippet: string },
  region: string,
  icp: Awaited<ReturnType<typeof loadIcpConfig>>,
  industry: string
): Promise<"skip" | "new" | "qualified"> {
  const { domain } = result;
  const db = getSupabaseClient();

  if (await isSuppressed(domain)) return "skip";
  const { isDupe } = await isDuplicate(domain, result.title);
  if (isDupe) return "skip";

  const website = await scrapeCompanyWebsite(domain);
  // Defensive: presumably the scraper can come back empty for unreachable
  // sites — verify against web-scraper. Everything below dereferences it.
  if (!website) return "skip";

  const gate1 = applyHardFilters(website, icp, region);
  if (!gate1.passed) return "skip";

  const gate2 = applySignalFilters(website, icp);

  // LinkedIn scraping is best-effort: a failure falls back to null.
  let linkedin = null;
  if (website.linkedinUrl) {
    linkedin = await scrapeLinkedInCompany(website.linkedinUrl).catch(() => null);
  }

  const normalized = normalizeCompany(result as any, website, linkedin, region, "manual");
  const { data: saved, error } = await db
    .from("companies")
    .insert({ ...normalized, industry })
    .select("id")
    .single();

  if (error || !saved) {
    // Surface the reason instead of hiding a database failure behind "skip".
    logger.warn({ err: error, domain }, "Manual discovery: saving company failed");
    return "skip";
  }

  if (!gate2.passed) {
    await db.from("companies").update({ status: "nurture" }).eq("id", saved.id);
    return "new";
  }

  const decisionMakers = linkedin?.decisionMakers ?? [];
  // NOTE(review): the auto-discovery task calls enrichContacts with nine
  // arguments and treats the result as an array — confirm which signature is
  // current. The count below accepts either a number or an array.
  const enrichResult: unknown = await enrichContacts(saved.id, domain, decisionMakers);
  const contactsSaved = Array.isArray(enrichResult)
    ? enrichResult.length
    : Number(enrichResult) || 0;

  await db.from("companies").update({ status: "profiled" }).eq("id", saved.id);

  if (contactsSaved > 0) {
    // Imported lazily at the point of use, as in the original.
    const { profilingTask } = await import("../../profiling/trigger-tasks/profiling-router");
    await profilingTask.trigger({
      company_id: saved.id,
      domain,
      name: normalized.name,
      region,
      source: "manual",
    });
    return "qualified";
  }

  return "new";
}
|
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from dotenv import load_dotenv
|
| 3 |
+
from pydantic_settings import BaseSettings
|
| 4 |
+
|
| 5 |
+
load_dotenv()
|
| 6 |
+
|
| 7 |
+
class Settings(BaseSettings):
    """Environment-driven configuration for the Python profiling service.

    Fields without a default are required; instantiation fails when they are
    missing from both the process environment and the `.env` file.
    """

    # Supabase
    SUPABASE_URL: str
    SUPABASE_SERVICE_ROLE_KEY: str

    # LLM (All on NVIDIA NIM — FREE)
    NVIDIA_API_KEY: str
    NVIDIA_NIM_BASE_URL: str = "https://integrate.api.nvidia.com/v1"

    # Service auth
    PYTHON_AI_SERVICE_SECRET: str

    # Config
    LOG_LEVEL: str = "INFO"

    class Config:
        # Resolve the .env three directories above this module, so the lookup
        # no longer depends on the directory the service is started from.
        env_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "..", "..", "..", ".env"
        )
        # The .env holds keys this class does not declare; ignore them rather
        # than failing validation on them.
        extra = "ignore"
|
| 24 |
+
|
| 25 |
+
settings = Settings()
|
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Hallucination Guard v2 — Grounded Verification
|
| 3 |
+
|
| 4 |
+
Old approach: "Ask LLM for confidence" → LLM grades own exam → useless
|
| 5 |
+
New approach: Cross-reference every claim against evidence → real verification
|
| 6 |
+
|
| 7 |
+
Every LLM output field is checked:
|
| 8 |
+
- Employee count → matches scraped data?
|
| 9 |
+
- Industry → matches detected industry?
|
| 10 |
+
- AI readiness "high" → do we actually have AI job postings?
|
| 11 |
+
- PII in output → flagged as an unverified claim (not stripped here)
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import re
|
| 15 |
+
import logging
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def validate_profile_grounded(profile: dict, evidence: dict) -> dict:
    """
    Cross-check an LLM-generated profile against scraped evidence.

    Args:
        profile: LLM output. Keys read: "profile_summary", "ai_readiness",
            "evidence_used".
        evidence: Known facts. Keys read: "employee_count", "ai_job_count",
            "tech_stack", "name", "website_text", "pain_signals",
            "description". Missing or None values are treated as empty.

    Returns:
        dict with "is_grounded" (grounding_score >= 0.6), "grounding_score"
        (verified / (verified + unverified), rounded to 2 places; 0.5 when
        nothing was checked), "verified_claims", "unverified_claims" and
        "corrections" (field -> {"claimed", "actual"}).
    """
    verified = []
    unverified = []
    corrections = {}

    # ── Employee count ────────────────────────────────────────
    summary = str(profile.get("profile_summary", ""))
    known_emp = evidence.get("employee_count")

    # `[\d,]*` (not `+`) so single-digit head counts such as "5 employees" match.
    emp_match = re.search(r'(\d[\d,]*)\s*(employees?|people|staff)', summary, re.I)
    if emp_match and known_emp:
        claimed = int(emp_match.group(1).replace(",", ""))
        # More than 30% away from the known figure counts as a wrong claim.
        if abs(claimed - known_emp) > known_emp * 0.3:
            corrections["employee_count"] = {"claimed": claimed, "actual": known_emp}
            # A corrected claim is recorded under "verified": it was checked
            # against evidence and the correction is reported separately.
            verified.append("employee_count_corrected")
        else:
            verified.append("employee_count_accurate")

    # ── AI readiness vs actual signals ────────────────────────
    claimed_readiness = profile.get("ai_readiness", "")
    # `or` fallbacks: a key that is present but None must not break len()/>=.
    ai_jobs = evidence.get("ai_job_count") or 0
    tech_stack = evidence.get("tech_stack") or []
    pain_signals = evidence.get("pain_signals") or []

    if claimed_readiness == "high" and ai_jobs == 0 and len(tech_stack) == 0:
        corrections["ai_readiness"] = {"claimed": "high", "actual": "low"}
        verified.append("ai_readiness_corrected")
    elif claimed_readiness == "low" and ai_jobs >= 3:
        corrections["ai_readiness"] = {"claimed": "low", "actual": "high"}
        verified.append("ai_readiness_corrected")
    else:
        verified.append("ai_readiness_plausible")

    # ── Company name in summary ───────────────────────────────
    known_name = evidence.get("name", "")
    if known_name and len(known_name) > 3:
        name_words = known_name.lower().split()
        summary_lower = summary.lower()
        # One name word longer than two characters appearing is enough.
        if any(w in summary_lower for w in name_words if len(w) > 2):
            verified.append("company_name_present")
        else:
            unverified.append("company_name_may_differ")

    # ── Evidence claims ───────────────────────────────────────
    evidence_used = profile.get("evidence_used", [])
    if isinstance(evidence_used, list):
        all_evidence_text = " ".join([
            str(evidence.get("website_text", "")),
            " ".join(str(item) for item in tech_stack),
            " ".join(str(item) for item in pain_signals),
            str(evidence.get("description", "")),
        ]).lower()

        for claim in evidence_used:
            # Only the first four words of a claim are compared, and only
            # words longer than three characters count.
            claim_words = str(claim).lower().split()[:4]
            if any(w in all_evidence_text for w in claim_words if len(w) > 3):
                verified.append(f"evidence_grounded: {str(claim)[:30]}")
            else:
                unverified.append(f"evidence_unverifiable: {str(claim)[:30]}")

    # ── PII check ─────────────────────────────────────────────
    output_str = str(profile)
    # Case-insensitive so addresses such as "JOHN@ACME.COM" are caught too.
    email_found = re.search(r'[\w.+-]+@[\w-]+\.[a-z]{2,}', output_str, re.I)
    phone_found = re.search(r'\+?\d[\d\s\-().]{8,}', output_str)

    if email_found:
        unverified.append("pii_email_in_output")
    if phone_found:
        unverified.append("pii_phone_in_output")

    # ── Grounding score ───────────────────────────────────────
    total = len(verified) + len(unverified)
    grounding_score = len(verified) / total if total > 0 else 0.5

    result = {
        "is_grounded": grounding_score >= 0.6,
        "grounding_score": round(grounding_score, 2),
        "verified_claims": verified,
        "unverified_claims": unverified,
        "corrections": corrections,
    }

    if not result["is_grounded"]:
        logger.warning(f"Profile failed grounding: score={grounding_score:.2f}, corrections={len(corrections)}")

    return result
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
def validate_score_grounded(score: dict, profile: dict) -> dict:
    """Validate scoring output for consistency.

    Checks that "total_score" is a number in [0, 100], that "tier" is one of
    hot/warm/nurture/archive, and that the tier matches the score bands
    (>=85 hot, >=70 warm, >=50 nurture, otherwise archive).

    Side effect: when the tier does not match the band, `score["tier"]` is
    overwritten in place with the expected tier.

    Args:
        score: Scoring dict; mutated on tier mismatch.
        profile: Accepted for interface compatibility; not read here.

    Returns:
        {"is_valid": bool, "issues": list[str]}.
    """
    issues = []

    raw_total = score.get("total_score", -1)
    # A missing, None or non-numeric score must be reported, not raise a
    # TypeError in the range comparison below.
    is_numeric = isinstance(raw_total, (int, float))
    if not is_numeric or not (0 <= raw_total <= 100):
        issues.append(f"invalid_total_score:{raw_total}")

    tier = score.get("tier")
    if tier not in ("hot", "warm", "nurture", "archive"):
        issues.append(f"invalid_tier:{tier}")

    # Cross-check tier vs score. A non-numeric score is banded as -1 (archive).
    total = raw_total if is_numeric else -1
    expected_tier = (
        "hot" if total >= 85 else
        "warm" if total >= 70 else
        "nurture" if total >= 50 else
        "archive"
    )
    if tier != expected_tier:
        issues.append(f"tier_score_mismatch: score={raw_total} tier={tier} expected={expected_tier}")
        score["tier"] = expected_tier  # auto-correct

    return {
        "is_valid": len(issues) == 0,
        "issues": issues,
    }
|
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI Profiling Service v2 — NVIDIA NIM powered.
|
| 3 |
+
|
| 4 |
+
Endpoints:
|
| 5 |
+
POST /profile → Profile company + compute score (single pipeline)
|
| 6 |
+
GET /health → Service health check
|
| 7 |
+
|
| 8 |
+
Security:
|
| 9 |
+
Bearer token authentication (shared secret with Node.js orchestration layer)
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import logging
|
| 13 |
+
from contextlib import asynccontextmanager
|
| 14 |
+
from fastapi import FastAPI, HTTPException, Depends
|
| 15 |
+
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
|
| 16 |
+
from pydantic import BaseModel
|
| 17 |
+
from typing import Optional
|
| 18 |
+
from config import settings
|
| 19 |
+
from profiler import generate_profile
|
| 20 |
+
from scorer import compute_score
|
| 21 |
+
from hallucination_guard import validate_score_grounded
|
| 22 |
+
|
| 23 |
+
logging.basicConfig(level=getattr(logging, settings.LOG_LEVEL.upper(), logging.INFO))
|
| 24 |
+
logger = logging.getLogger(__name__)
|
| 25 |
+
|
| 26 |
+
# ─── Auth ─────────────────────────────────────────────────────
|
| 27 |
+
|
| 28 |
+
security = HTTPBearer()
|
| 29 |
+
|
| 30 |
+
def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)):
    """FastAPI dependency that checks the bearer token against the shared secret.

    Returns True when the token equals `settings.PYTHON_AI_SERVICE_SECRET`.

    Raises:
        HTTPException: 401 when the token does not match.
    """
    import hmac  # function-scope import keeps this block self-contained

    # Constant-time comparison so response timing does not reveal how much of
    # the secret a guess got right. Encoded to bytes because compare_digest
    # rejects non-ASCII str arguments.
    supplied = credentials.credentials.encode("utf-8")
    expected = settings.PYTHON_AI_SERVICE_SECRET.encode("utf-8")
    if not hmac.compare_digest(supplied, expected):
        raise HTTPException(status_code=401, detail="Invalid authentication")
    return True
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# ─── Models ───────────────────────────────────────────────────
|
| 37 |
+
|
| 38 |
+
class CompanyInput(BaseModel):
    """Company facts sent by the caller of POST /profile; only `name` is required."""

    id: Optional[str] = None
    name: str
    industry: str = ""
    employee_count: Optional[int] = None
    description: str = ""
    website_text: str = ""
    linkedin_description: str = ""
    tech_stack: list[str] = []
    ai_job_count: int = 0
    pain_signals: list[str] = []
    service_match: Optional[str] = None
|
| 50 |
+
|
| 51 |
+
class ContactInput(BaseModel):
    """One contact attached to a profile request; every field is optional."""

    full_name: str = ""
    email: Optional[str] = None
    email_verified: bool = False
    linkedin_personal_url: Optional[str] = None
    social_profiles: dict = {}
|
| 57 |
+
|
| 58 |
+
class ProfileRequest(BaseModel):
    """Request body of POST /profile: one company, its contacts, and a trace id."""

    company: CompanyInput
    contacts: list[ContactInput] = []
    # Passed through to the profiler and scorer and echoed back under "meta".
    trace_id: str = ""
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# ─── App ──────────────────────────────────────────────────────
|
| 65 |
+
|
| 66 |
+
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan hook: logs a startup banner, yields, then logs shutdown."""
    startup_banner = (
        "🚀 AI Profiling Service v2 starting...",
        f" NVIDIA NIM: {settings.NVIDIA_NIM_BASE_URL}",
        " Models: GPT OSS → Gemma 3 → LLaMA 70B → LLaMA 8B → Deterministic",
    )
    for banner_line in startup_banner:
        logger.info(banner_line)
    yield
    logger.info("AI Profiling Service shutting down")
|
| 73 |
+
|
| 74 |
+
# ASGI application object; `lifespan` supplies the startup/shutdown log lines.
app = FastAPI(
    title="AI Lead Profiling Service",
    version="2.0.0",
    lifespan=lifespan,
)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
# ─── Endpoints ────────────────────────────────────────────────
|
| 82 |
+
|
| 83 |
+
@app.get("/health")
async def health():
    """Unauthenticated health probe returning a static status, version and model list."""
    model_lineup = dict(
        primary="nvidia/llama-3.1-nemotron-ultra-253b-v1",
        secondary="google/gemma-3-27b-it",
        tertiary="meta/llama-3.3-70b-instruct",
        fast="meta/llama-3.1-8b-instruct",
    )
    return {"status": "healthy", "version": "2.0.0", "models": model_lineup}
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
@app.post("/profile")
async def profile_company(request: ProfileRequest, _auth: bool = Depends(verify_token)):
    """
    Full profiling pipeline:
    1. `generate_profile` builds the company profile.
    2. `compute_score` produces the score from company, profile and contacts.
    3. `validate_score_grounded` checks score/tier consistency (and corrects
       the tier in place on mismatch).

    The grounding fields reported under "validation" are read from the profile
    dict; presumably they are set inside `generate_profile` — verify there.

    Returns a dict with "profile", "score", "validation" and "meta".

    Raises:
        HTTPException: 500 with the error text when any pipeline step fails.
    """
    company_data = request.company.model_dump()
    contacts_data = [c.model_dump() for c in request.contacts]
    trace_id = request.trace_id

    try:
        # Step 1: Generate profile (LLM with grounding)
        profile = await generate_profile(company_data, trace_id)

        # Step 2: Compute score (LLM extracts signals → code computes)
        score = await compute_score(company_data, profile, contacts_data, trace_id)

        # Step 3: Validate score consistency
        score_validation = validate_score_grounded(score, profile)
        if not score_validation["is_valid"]:
            logger.warning(f"Score validation issues: {score_validation['issues']}")

        return {
            "profile": profile,
            "score": score,
            "validation": {
                "profile_grounded": profile.get("grounding_score", 0),
                "profile_consistent": profile.get("is_consistent", True),
                "score_valid": score_validation["is_valid"],
                "score_issues": score_validation.get("issues", []),
            },
            "meta": {
                "model_used": profile.get("llm_model", "unknown"),
                "is_fallback": profile.get("is_fallback", False),
                "tokens_used": profile.get("tokens_used", 0),
                "trace_id": trace_id,
            },
        }

    except HTTPException:
        # Let deliberate HTTP errors from lower layers through unchanged
        # instead of rewrapping them as a 500.
        raise
    except Exception as e:
        # logger.exception records the traceback alongside the same message.
        logger.exception(f"Profiling failed for {company_data.get('name')}: {e}")
        raise HTTPException(status_code=500, detail=str(e)) from e
|
| 142 |
+
|
| 143 |
+
|
| 144 |
+
# ─── Run ──────────────────────────────────────────────────────
|
| 145 |
+
|
| 146 |
+
# Direct-execution entry point: serves `main:app` with uvicorn on port 8000.
if __name__ == "__main__":
    import uvicorn
    # NOTE(review): binds every interface with auto-reload on — looks like a
    # development setup; confirm how the service is launched in production.
    uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)
|
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Multi-Model LLM Client (Python) — All FREE on NVIDIA NIM
|
| 3 |
+
|
| 4 |
+
3 models, 1 provider, 1 API key, $0 cost:
|
| 5 |
+
1. MiniMax M2.7 → Best reasoning, 4M context, built-in CoT
|
| 6 |
+
2. LLaMA 3.3 70B → Reliable fallback
|
| 7 |
+
3. LLaMA 3.1 8B → Fast, simple tasks
|
| 8 |
+
4. Deterministic → Zero LLM fallback
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import time
|
| 12 |
+
import json
|
| 13 |
+
import hashlib
|
| 14 |
+
import logging
|
| 15 |
+
from typing import Optional
|
| 16 |
+
from openai import AsyncOpenAI
|
| 17 |
+
from config import settings
|
| 18 |
+
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
# ─── Model configs (ALL on NVIDIA NIM) ───────────────────────
|
| 22 |
+
|
| 23 |
+
# Ordered by fallback priority: call_llm starts at `model_index` and moves
# forward through this list whenever a model fails.
MODEL_CONFIGS = [
    {
        "name": "MiniMax M2.7",
        "model": "minimaxai/minimax-m2.7",
        "max_context": 4_000_000,
        "best_for": "profiling, scoring, complex reasoning",
    },
    {
        "name": "LLaMA 3.3 70B",
        "model": "meta/llama-3.3-70b-instruct",
        "max_context": 128_000,
        "best_for": "general tasks, reliable fallback",
    },
    {
        "name": "LLaMA 3.1 8B",
        "model": "meta/llama-3.1-8b-instruct",
        "max_context": 128_000,
        "best_for": "email classification, simple checks",
    },
]

# Role name → index into MODEL_CONFIGS (usable as call_llm's `model_index`).
# The profiler and scorer modules import `MODELS` from this module, so the
# name has to exist for them to be importable at all.
MODELS = {
    "PRIMARY": 0,
    "FALLBACK": 1,
    "FAST": 2,
}
|
| 43 |
+
|
| 44 |
+
# ─── Shared client (single provider) ─────────────────────────
|
| 45 |
+
|
| 46 |
+
# Lazily created client shared by every call in this module.
_client: Optional[AsyncOpenAI] = None


def get_client() -> AsyncOpenAI:
    """Return the shared AsyncOpenAI client, building it on first use.

    The client is configured from ``settings.NVIDIA_NIM_BASE_URL`` and
    ``settings.NVIDIA_API_KEY`` and cached in the module-level ``_client``.
    """
    global _client
    if _client is not None:
        return _client

    _client = AsyncOpenAI(
        api_key=settings.NVIDIA_API_KEY,
        base_url=settings.NVIDIA_NIM_BASE_URL,
    )
    return _client
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# ─── Main LLM call ───────────────────────────────────────────
|
| 59 |
+
|
| 60 |
+
async def call_llm(
    operation: str,
    system_prompt: str,
    user_prompt: str,
    model_index: int = 0,
    temperature: float = 0.2,
    max_tokens: int = 1024,
    json_mode: bool = True,
    trace_id: str = "",
    company_id: Optional[str] = None,
    max_rate_limit_retries: int = 3,
) -> dict:
    """Call LLM with fallback: MiniMax → LLaMA 70B → LLaMA 8B → Deterministic

    Starts at ``MODEL_CONFIGS[model_index]`` and moves to the next entry
    whenever a model raises or (in JSON mode) returns unparseable output.
    When no entry is left, the empty ``_deterministic_fallback()`` result is
    returned, so this function does not raise for model failures.

    Args:
        operation: Label for logs and trace rows (e.g. "profile", "score").
        system_prompt: Content of the system message.
        user_prompt: Content of the user message.
        model_index: Index in MODEL_CONFIGS of the first model to try.
        temperature: Sampling temperature forwarded to the API.
        max_tokens: Completion token limit forwarded to the API.
        json_mode: Request a JSON object and parse it into ``parsed``.
        trace_id: Correlation id stored with the trace row.
        company_id: Optional company id stored with the trace row.
        max_rate_limit_retries: How many times a rate-limited (HTTP 429)
            model is retried before moving on to the next model. The
            previous implementation retried without any limit.

    Returns:
        dict with keys ``content``, ``reasoning``, ``parsed``, ``model``,
        ``provider``, ``tokens``, ``latency_ms`` and ``fallback_used``.
    """
    index = model_index
    # Only build the client when there is at least one model to call.
    client = get_client() if index < len(MODEL_CONFIGS) else None
    rate_limit_hits = 0

    while index < len(MODEL_CONFIGS):
        config = MODEL_CONFIGS[index]
        start = time.time()

        try:
            kwargs = {
                "model": config["model"],
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt},
                ],
                "temperature": temperature,
                "max_tokens": max_tokens,
                "top_p": 0.9,
            }
            if json_mode:
                kwargs["response_format"] = {"type": "json_object"}

            response = await client.chat.completions.create(**kwargs)

            message = response.choices[0].message
            content = message.content or ""
            # Not every model returns a reasoning trace, hence the getattr.
            reasoning = getattr(message, "reasoning_content", None)
            usage = response.usage
            latency_ms = int((time.time() - start) * 1000)

            parsed = _safe_parse_json(content) if json_mode else None

            if json_mode and parsed is None:
                logger.warning(f"JSON parse failed on {config['name']} — next model")
                index += 1
                rate_limit_hits = 0
                continue

            result = {
                "content": content,
                "reasoning": reasoning,
                "parsed": parsed,
                "model": config["name"],
                "provider": "nvidia",
                "tokens": {
                    "prompt": usage.prompt_tokens if usage else 0,
                    "completion": usage.completion_tokens if usage else 0,
                    "total": usage.total_tokens if usage else 0,
                },
                "latency_ms": latency_ms,
                "fallback_used": False,
            }

            if reasoning:
                logger.debug(f"MiniMax reasoning: {reasoning[:150]}...")

            await _log_trace(trace_id, operation, config["name"], result, True, company_id)
            return result

        except Exception as e:
            error_msg = str(e)
            # Prefer the structured status code when the exception carries
            # one; keep the substring check for errors that do not.
            rate_limited = getattr(e, "status_code", None) == 429 or "429" in error_msg

            if rate_limited and rate_limit_hits < max_rate_limit_retries:
                rate_limit_hits += 1
                logger.warning(f"Rate limited on {config['name']} — waiting 10s")
                await _async_sleep(10)
                continue  # retry the same model

            logger.warning(f"{config['name']} failed ({error_msg[:80]}) — next model")
            index += 1
            rate_limit_hits = 0

    logger.error(f"ALL models failed for {operation} — deterministic fallback")
    return _deterministic_fallback()
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
def _deterministic_fallback() -> dict:
|
| 148 |
+
return {
|
| 149 |
+
"content": "",
|
| 150 |
+
"reasoning": None,
|
| 151 |
+
"parsed": None,
|
| 152 |
+
"model": "deterministic_fallback",
|
| 153 |
+
"provider": "none",
|
| 154 |
+
"tokens": {"prompt": 0, "completion": 0, "total": 0},
|
| 155 |
+
"latency_ms": 0,
|
| 156 |
+
"fallback_used": True,
|
| 157 |
+
}
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
# ─── Self-consistency check ──────────────────────────────────
|
| 161 |
+
|
| 162 |
+
async def call_with_consistency(
    operation: str,
    system_prompt: str,
    user_prompt: str,
    trace_id: str = "",
    company_id: str = None,
) -> dict:
    """Run ``call_llm`` and attach a self-consistency verdict to its result.

    The primary call uses temperature 0.1. A second call at temperature 0.4
    is made only for the "profile" and "score" operations, and only when the
    primary result is neither the deterministic fallback nor a MiniMax answer
    that carries a reasoning trace. The two parsed outputs are then compared
    with ``_compare_outputs``.

    Returns:
        The primary result dict extended with ``is_consistent`` (bool) and
        ``consistency_score`` (float).
    """
    first = await call_llm(
        operation,
        system_prompt,
        user_prompt,
        temperature=0.1,
        trace_id=trace_id,
        company_id=company_id,
    )

    def _with_verdict(consistent, score):
        return {**first, "is_consistent": consistent, "consistency_score": score}

    # Only profiling and scoring are double-checked.
    if operation not in ("profile", "score"):
        return _with_verdict(True, 1.0)

    # Nothing to compare against when every model failed.
    if first.get("fallback_used"):
        return _with_verdict(True, 0.5)

    # MiniMax with reasoning = inherently more consistent
    if first.get("model") == "MiniMax M2.7" and first.get("reasoning"):
        return _with_verdict(True, 0.95)

    second = await call_llm(
        operation,
        system_prompt,
        user_prompt,
        temperature=0.4,
        trace_id=trace_id,
        company_id=company_id,
    )

    agreement = _compare_outputs(first.get("parsed"), second.get("parsed"))
    return _with_verdict(agreement >= 0.75, agreement)
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
def _compare_outputs(a: dict, b: dict) -> float:
|
| 190 |
+
if not a or not b:
|
| 191 |
+
return 0.5
|
| 192 |
+
matches = 0
|
| 193 |
+
total = 0
|
| 194 |
+
for key in ["ai_readiness", "tier", "service_match"]:
|
| 195 |
+
if key in a and key in b:
|
| 196 |
+
total += 1
|
| 197 |
+
if a[key] == b[key]:
|
| 198 |
+
matches += 1
|
| 199 |
+
for key in ["total_score", "company_fit"]:
|
| 200 |
+
av = a.get(key)
|
| 201 |
+
bv = b.get(key)
|
| 202 |
+
if isinstance(av, (int, float)) and isinstance(bv, (int, float)):
|
| 203 |
+
total += 1
|
| 204 |
+
if abs(av - bv) <= 10:
|
| 205 |
+
matches += 1
|
| 206 |
+
return matches / total if total > 0 else 1.0
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
# ─── Helpers ─────────────────────────────────────────────────
|
| 210 |
+
|
| 211 |
+
def _safe_parse_json(text: str) -> Optional[dict]:
|
| 212 |
+
content = text.strip()
|
| 213 |
+
if "```json" in content:
|
| 214 |
+
content = content.split("```json")[1].split("```")[0].strip()
|
| 215 |
+
elif "```" in content:
|
| 216 |
+
content = content.split("```")[1].split("```")[0].strip()
|
| 217 |
+
try:
|
| 218 |
+
return json.loads(content)
|
| 219 |
+
except json.JSONDecodeError:
|
| 220 |
+
import re
|
| 221 |
+
match = re.search(r'\{[\s\S]*\}', content)
|
| 222 |
+
if match:
|
| 223 |
+
try:
|
| 224 |
+
return json.loads(match.group())
|
| 225 |
+
except json.JSONDecodeError:
|
| 226 |
+
return None
|
| 227 |
+
return None
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
async def _log_trace(trace_id, operation, model, result, success, company_id):
    """Best-effort insert of one LLM call record into the ``llm_traces`` table.

    The Supabase client used here is synchronous, so the insert runs in the
    event loop's default executor instead of blocking the loop for the
    duration of the HTTP request. Any failure (missing dependency, bad
    credentials, network error, malformed ``result``) is logged at debug
    level and swallowed: tracing must never break the LLM call itself.

    Args:
        trace_id: Correlation id for the pipeline run.
        operation: Operation label passed to ``call_llm``.
        model: Display name of the model that answered.
        result: ``call_llm`` result dict, or a falsy value when unavailable.
        success: Whether the call succeeded.
        company_id: Optional company id the call relates to.
    """
    import asyncio

    try:
        row = {
            "trace_id": trace_id,
            "operation": operation,
            "model": model,
            "provider": "nvidia",
            "prompt_tokens": result["tokens"]["prompt"] if result else 0,
            "completion_tokens": result["tokens"]["completion"] if result else 0,
            "total_tokens": result["tokens"]["total"] if result else 0,
            "latency_ms": result.get("latency_ms", 0) if result else 0,
            "success": success,
            "fallback_used": result.get("fallback_used", True) if result else True,
            "company_id": company_id,
        }

        def _insert():
            # Imported lazily so the module loads without supabase installed.
            from supabase import create_client

            sb = create_client(settings.SUPABASE_URL, settings.SUPABASE_SERVICE_ROLE_KEY)
            sb.table("llm_traces").insert(row).execute()

        await asyncio.get_running_loop().run_in_executor(None, _insert)
    except Exception as e:
        logger.debug(f"Trace log failed (non-critical): {e}")
|
| 250 |
+
|
| 251 |
+
|
| 252 |
+
async def _async_sleep(seconds: int):
|
| 253 |
+
import asyncio
|
| 254 |
+
await asyncio.sleep(seconds)
|
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Profiler — Production-grade company profiling using NVIDIA NIM.
|
| 3 |
+
|
| 4 |
+
Key differences from v1:
|
| 5 |
+
1. Chain-of-thought reasoning forced (Step 1-5 before JSON)
|
| 6 |
+
2. Few-shot examples (2 real-world examples in prompt)
|
| 7 |
+
3. Grounding instruction ("UNKNOWN" for missing data)
|
| 8 |
+
4. Evidence tracking (what data supported each claim)
|
| 9 |
+
5. Deterministic fallback (zero hallucination when LLM fails)
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import logging
|
| 13 |
+
from nvidia_client import call_with_consistency, MODELS
|
| 14 |
+
from hallucination_guard import validate_profile_grounded
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# ─── System prompt ────────────────────────────────────────────
|
| 20 |
+
|
| 21 |
+
SYSTEM_PROMPT = """You are a business analyst for an AI automation agency.
|
| 22 |
+
Your job: analyze a company and identify WHERE our AI services can help them.
|
| 23 |
+
|
| 24 |
+
CRITICAL RULES:
|
| 25 |
+
- Only state facts supported by the provided evidence
|
| 26 |
+
- Write "UNKNOWN" for anything not in the data — NEVER guess
|
| 27 |
+
- Your analysis determines whether a real salesperson contacts this company
|
| 28 |
+
- Wrong analysis = wasted human time = unacceptable
|
| 29 |
+
- Think step by step before concluding"""
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# ─── User prompt builder ─────────────────────────────────────
|
| 33 |
+
|
| 34 |
+
def build_profile_prompt(data: dict) -> str:
    """Render the user prompt for company profiling.

    The prompt lists the known company facts, asks for a five-step analysis,
    specifies the JSON output schema and ends with two worked examples.

    Missing data is handled uniformly: a key that is absent *or* present
    with value None is rendered as its placeholder ("UNKNOWN", "NONE", ...)
    instead of the literal "None", and None list fields no longer raise.

    Args:
        data: Company record. Keys read: name, industry, employee_count,
            description, website_text, linkedin_description, tech_stack,
            ai_job_count, pain_signals, service_match.

    Returns:
        The complete prompt text.
    """
    def scalar(key, placeholder):
        value = data.get(key)
        return placeholder if value is None else value

    def joined(key, placeholder):
        # `or []` covers both a missing key and an explicit None value.
        return ', '.join(str(item) for item in (data.get(key) or [])) or placeholder

    name = scalar('name', 'UNKNOWN')
    industry = scalar('industry', 'UNKNOWN')
    employees = scalar('employee_count', 'UNKNOWN')
    description = (data.get('description') or 'NONE PROVIDED')[:400]
    website_excerpt = (data.get('website_text') or '')[:600]
    linkedin = data.get('linkedin_description') or 'NONE'
    tech_stack = joined('tech_stack', 'NONE DETECTED')
    ai_jobs = scalar('ai_job_count', 0)
    pain_signals = joined('pain_signals', 'NONE')
    service_match = data.get('service_match') or 'NONE'

    return f"""ANALYZE THIS COMPANY:

Name: {name}
Industry: {industry}
Employees: {employees}
Description: {description}

Website excerpt:
{website_excerpt}

LinkedIn description:
{linkedin}

Tech stack detected: {tech_stack}
Job postings mentioning AI/automation: {ai_jobs}
Pain signals detected: {pain_signals}
Service match suggestion: {service_match}

STEP-BY-STEP ANALYSIS:

Step 1: What does this company actually DO? (2 sentences, facts only)
Step 2: What are their likely daily operational challenges? (based on industry + size)
Step 3: What specific AI automation would save them time/money? (be specific)
Step 4: Who in this organization would approve buying this service?
Step 5: What outreach angle would resonate with this specific person?

After reasoning through steps 1-5, output this JSON:
{{
"profile_summary": "2-3 factual sentences about what this company does",
"pain_points": ["specific pain 1", "specific pain 2"],
"ai_use_case": "The single most compelling AI use case for them",
"ai_readiness": "low|medium|high",
"decision_maker_reasoning": "Who likely makes purchasing decisions and why",
"outreach_angle": "One specific sentence — the hook for first contact",
"confidence": 0.0,
"evidence_used": ["list which data points you relied on"],
"evidence_missing": ["list what data you wished you had"]
}}

EXAMPLE 1 (dental clinic, 6 employees):
{{
"profile_summary": "ABC Dental is a 6-person dental practice in Houston offering general and cosmetic dentistry. They display their phone number prominently and use a basic contact form for appointments.",
"pain_points": ["Manual phone-based appointment scheduling during business hours only", "No after-hours patient communication capability"],
"ai_use_case": "AI receptionist to handle appointment booking, reminders, and after-hours calls",
"ai_readiness": "low",
"decision_maker_reasoning": "Practice owner (Dr. Smith, DDS) makes all purchasing decisions. Small practice = owner controls budget directly.",
"outreach_angle": "Stop losing patients to voicemail — our AI receptionist books appointments 24/7",
"confidence": 0.82,
"evidence_used": ["phone number on homepage", "contact form only", "6 staff listed", "no chatbot detected"],
"evidence_missing": ["annual revenue", "number of daily calls"]
}}

EXAMPLE 2 (manufacturing company, 150 employees):
{{
"profile_summary": "XYZ Manufacturing is a UK-based manufacturer of industrial valves with 150 employees. They use SAP for ERP and are hiring a Data Analyst.",
"pain_points": ["Manual data extraction from legacy SAP system", "Production reporting requires manual spreadsheet compilation"],
"ai_use_case": "Automated reporting pipeline that extracts SAP data and generates dashboards",
"ai_readiness": "medium",
"decision_maker_reasoning": "Operations Director manages the data team and would champion this internally. CTO signs off on tech purchases.",
"outreach_angle": "Your Data Analyst job posting tells us you're drowning in manual SAP reports — we automate that entirely",
"confidence": 0.88,
"evidence_used": ["SAP detected in tech stack", "Data Analyst job posting", "150 employees"],
"evidence_missing": ["specific SAP modules used", "current reporting frequency"]
}}"""
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# ─── Main profiling function ─────────────────────────────────
|
| 102 |
+
|
| 103 |
+
async def generate_profile(company_data: dict, trace_id: str = "") -> dict:
    """
    Generate LLM profile with consistency checking and grounding.
    Returns cleaned, grounded profile or deterministic fallback.

    Steps:
      1. Build the prompt and call the LLM via ``call_with_consistency``.
      2. Fall back to ``_deterministic_fallback`` when every model failed or
         the parsed payload is not a non-empty JSON object.
      3. Annotate the profile with model, consistency and token metadata.
      4. Validate grounding against ``company_data`` and overwrite any
         corrected fields with their ``actual`` value.

    Args:
        company_data: Company record; ``id`` and ``name`` are read here, the
            rest is consumed by the prompt builder and the grounding check.
        trace_id: Correlation id forwarded to the LLM call.

    Returns:
        The profile dict (LLM-generated or deterministic fallback).
    """
    prompt = build_profile_prompt(company_data)

    # Call with consistency check (2 temperatures, compare)
    result = await call_with_consistency(
        operation="profile",
        system_prompt=SYSTEM_PROMPT,
        user_prompt=prompt,
        trace_id=trace_id,
        company_id=company_data.get("id"),
    )

    parsed = result.get("parsed")

    # All models failed → deterministic fallback.
    # The isinstance check also rejects valid-but-wrong JSON such as a list,
    # which would otherwise crash on the key assignments below.
    if result.get("fallback_used") or not parsed or not isinstance(parsed, dict):
        logger.warning(f"All LLM models failed for {company_data.get('name')} — using fallback")
        return _deterministic_fallback(company_data)

    profile = parsed
    profile["llm_model"] = result["model"]
    profile["is_fallback"] = False
    profile["is_consistent"] = result.get("is_consistent", True)
    profile["consistency_score"] = result.get("consistency_score", 1.0)
    profile["tokens_used"] = result["tokens"]["total"]

    # Grounding validation
    grounding_result = validate_profile_grounded(profile, company_data)
    profile["grounding_score"] = grounding_result["grounding_score"]
    profile["corrections"] = grounding_result.get("corrections", {})

    # Apply corrections: only fields the profile already has are overwritten.
    if grounding_result.get("corrections"):
        for key, correction in grounding_result["corrections"].items():
            if key in profile:
                profile[key] = correction["actual"]

    return profile
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
# ─── Deterministic fallback ──────────────────────────────────
|
| 146 |
+
|
| 147 |
+
def _deterministic_fallback(data: dict) -> dict:
    """Zero-hallucination fallback. Only uses available facts.

    Builds a profile from the raw company record without any LLM call. This
    is the last-resort path, so it tolerates fields that are absent or
    present with value None: they are replaced by neutral defaults instead
    of raising or leaking the literal "None" into the text.

    Args:
        data: Company record. Keys read: industry, employee_count, name,
            pain_signals, service_match, ai_job_count, tech_stack.

    Returns:
        A profile dict with the same keys ``generate_profile`` produces,
        flagged with ``is_fallback=True`` and
        ``llm_model="deterministic_fallback"``.
    """
    industry = data.get("industry") or "business"
    size = data.get("employee_count")
    if size is None:
        size = "unknown"
    name = data.get("name") or "this company"
    pain_signals = list(data.get("pain_signals") or [])
    service_match = data.get("service_match")

    # Map service to pain points
    pain_points = _get_pain_points(service_match, industry, pain_signals)

    # AI readiness from evidence: job postings are the strongest signal,
    # any detected tech stack is the weaker one.
    ai_jobs = data.get("ai_job_count") or 0
    tech_stack = data.get("tech_stack") or []
    if ai_jobs >= 2:
        ai_readiness = "high"
    elif tech_stack or ai_jobs >= 1:
        ai_readiness = "medium"
    else:
        ai_readiness = "low"

    return {
        "profile_summary": f"{name} is a {industry} company with approximately {size} employees.",
        "pain_points": pain_points,
        "ai_use_case": _get_use_case(service_match, industry),
        "ai_readiness": ai_readiness,
        "decision_maker_reasoning": f"At a {size}-employee {industry} company, purchasing decisions are likely made by the owner or managing director.",
        "outreach_angle": _get_outreach_angle(service_match, name),
        "confidence": 0.5,
        "evidence_used": [f"employee_count: {size}", f"industry: {industry}"] + pain_signals[:3],
        "evidence_missing": ["revenue", "growth rate", "current tools"],
        "llm_model": "deterministic_fallback",
        "is_fallback": True,
        "is_consistent": True,
        "consistency_score": 1.0,
        "grounding_score": 1.0,
        "tokens_used": 0,
        "corrections": {},
    }
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
def _get_pain_points(service, industry, detected_signals):
|
| 186 |
+
if detected_signals and len(detected_signals) >= 2:
|
| 187 |
+
return detected_signals[:2]
|
| 188 |
+
|
| 189 |
+
service_pains = {
|
| 190 |
+
"AI Receptionist": ["Manual phone handling during business hours only", "Missed calls and appointments outside working hours"],
|
| 191 |
+
"AI Customer Support": ["Manual ticket handling and slow response times", "No automated FAQ or chatbot for common questions"],
|
| 192 |
+
"AI Data Processing": ["Manual data entry and reporting overhead", "Legacy system inefficiencies"],
|
| 193 |
+
"AI Sales Automation": ["Manual outbound sales process", "Unqualified leads consuming sales team time"],
|
| 194 |
+
"AI Workflow Automation": ["Manual approval workflows", "Multiple disconnected tools and platforms"],
|
| 195 |
+
}
|
| 196 |
+
return service_pains.get(service, ["Manual operational processes", "Unoptimized workflow efficiency"])
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
def _get_use_case(service, industry):
|
| 200 |
+
if service:
|
| 201 |
+
return f"{service} for {industry} operations"
|
| 202 |
+
return f"AI workflow automation for {industry} processes"
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
def _get_outreach_angle(service, name):
|
| 206 |
+
angles = {
|
| 207 |
+
"AI Receptionist": f"Stop losing customers to voicemail — our AI handles calls 24/7 for {name}",
|
| 208 |
+
"AI Customer Support": f"Reduce support costs by 60% with AI-powered customer service for {name}",
|
| 209 |
+
"AI Data Processing": f"Eliminate manual reporting — our AI automates your data pipeline",
|
| 210 |
+
"AI Sales Automation": f"Double your sales pipeline efficiency with AI-powered outreach",
|
| 211 |
+
}
|
| 212 |
+
return angles.get(service, f"Reduce operational overhead with targeted AI automation for {name}")
|
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
fastapi==0.111.0
|
| 2 |
+
uvicorn[standard]==0.30.0
|
| 3 |
+
httpx==0.27.0
|
| 4 |
+
pydantic==2.7.0
|
| 5 |
+
pydantic-settings==2.2.0
|
| 6 |
+
python-dotenv==1.0.1
|
| 7 |
+
openai==1.30.0
|
| 8 |
+
supabase==2.4.0
|
|
@@ -0,0 +1,260 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Scorer v2 — Signal Extraction + Deterministic Scoring
|
| 3 |
+
|
| 4 |
+
KEY DESIGN CHANGE:
|
| 5 |
+
Old: LLM computes score directly → hallucination risk
|
| 6 |
+
New: LLM extracts SIGNALS → Code computes score → zero hallucination
|
| 7 |
+
|
| 8 |
+
LLM is good at: "Does this company have legacy SAP?" (yes/no)
|
| 9 |
+
LLM is bad at: "Give this company 73 out of 100" (arbitrary)
|
| 10 |
+
|
| 11 |
+
So: LLM extracts signals, code does math.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
import logging
|
| 15 |
+
from nvidia_client import call_llm, MODELS
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# ─── Signal extraction prompt ────────────────────────────────
|
| 21 |
+
|
| 22 |
+
SYSTEM_PROMPT = """You are a lead qualification engine.
|
| 23 |
+
Your job: extract SIGNALS from company data. You do NOT compute the final score.
|
| 24 |
+
The system computes scores deterministically from your signal extraction.
|
| 25 |
+
|
| 26 |
+
CRITICAL RULES:
|
| 27 |
+
- Extract only what the evidence supports
|
| 28 |
+
- For each signal, cite which piece of evidence supports it
|
| 29 |
+
- If evidence is weak or missing, say so honestly
|
| 30 |
+
- Output ONLY the structured JSON requested"""
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def build_signal_prompt(data: dict, profile: dict, contacts: list) -> str:
    """Render the user prompt for signal extraction.

    The prompt lists the company facts plus three contact-quality flags
    computed here, then specifies the JSON structure the model must fill
    in. The ``email_verified`` and ``linkedin_found`` template values are
    pre-filled from the contacts as lowercase JSON booleans.

    Fields that are absent or present with value None are rendered with
    their placeholder ("UNKNOWN", "NONE", 0) instead of raising or printing
    "None".

    Args:
        data: Company record. Keys read: name, industry, employee_count,
            tech_stack, ai_job_count, pain_signals, service_match,
            growth_signals.
        profile: Profile dict; only ``ai_readiness`` is read.
        contacts: Contact dicts; ``email_verified``,
            ``linkedin_personal_url`` and ``social_profiles`` are read.

    Returns:
        The complete prompt text.
    """
    contacts = contacts or []
    has_verified_email = any(c.get("email_verified") for c in contacts)
    has_linkedin = any(c.get("linkedin_personal_url") for c in contacts)
    has_social = any(c.get("social_profiles") for c in contacts)

    def scalar(source, key, placeholder):
        value = source.get(key)
        return placeholder if value is None else value

    def joined(key, placeholder):
        # `or []` covers both a missing key and an explicit None value.
        return ', '.join(str(item) for item in (data.get(key) or [])) or placeholder

    name = scalar(data, 'name', 'UNKNOWN')
    industry = scalar(data, 'industry', 'UNKNOWN')
    employees = scalar(data, 'employee_count', 'UNKNOWN')
    tech_stack = joined('tech_stack', 'NONE')
    ai_jobs = scalar(data, 'ai_job_count', 0)
    pain_signals = joined('pain_signals', 'NONE')
    service_match = data.get('service_match') or 'NONE'
    ai_readiness = scalar(profile, 'ai_readiness', 'UNKNOWN')
    growth_count = len(data.get('growth_signals') or [])

    return f"""EXTRACT SIGNALS for lead scoring. Do not compute a score.

Company: {name}
Industry: {industry}
Employees: {employees}
Tech stack: {tech_stack}
AI job postings: {ai_jobs}
Pain signals: {pain_signals}
Service match: {service_match}
AI readiness (from profile): {ai_readiness}
Has verified email: {has_verified_email}
Has personal LinkedIn: {has_linkedin}
Has social profiles: {has_social}
Growth signals count: {growth_count}

Output JSON:
{{
"company_fit_signals": {{
"industry_match": true,
"size_appropriate": true,
"evidence": "why"
}},
"ai_readiness_signals": {{
"level": "none|low|medium|high",
"tech_stack_relevant": false,
"ai_jobs_present": false,
"evidence": "why"
}},
"service_match_signals": {{
"matched": true,
"service_name": "which service",
"pain_count": 0,
"evidence": "which pain signals"
}},
"contact_quality_signals": {{
"email_verified": {str(has_verified_email).lower()},
"linkedin_found": {str(has_linkedin).lower()},
"decision_maker_identified": true
}},
"timing_signals": {{
"actively_growing": false,
"recently_active": true,
"evidence": "what suggests timing"
}},
"confidence": 0.0
}}"""
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# ─── Main scoring function ───────────────────────────────────
|
| 87 |
+
|
| 88 |
+
async def compute_score(
    company_data: dict,
    profile: dict,
    contacts: list,
    trace_id: str = ""
) -> dict:
    """
    Two-phase lead scoring.

    Phase 1 asks the LLM for qualitative signals (``_extract_signals``);
    phase 2 turns those signals into the score with plain code
    (``_compute_deterministic_score``), so the result is reproducible.
    """
    extracted_signals = await _extract_signals(company_data, profile, contacts, trace_id)
    return _compute_deterministic_score(extracted_signals, company_data, profile, contacts)
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
async def _extract_signals(data, profile, contacts, trace_id) -> dict:
    """Ask LLM to identify signals — NOT to score.

    Calls the small, fast model directly. ``call_llm`` selects its starting
    model through the ``model_index`` parameter; it has no ``model``
    parameter, so passing one raised a TypeError that the broad ``except``
    below swallowed, and the LLM path never ran.

    Any failure (exception, fallback result, or a parsed payload that is not
    a non-empty JSON object) falls back to rule-based extraction.

    Args:
        data: Company record.
        profile: Profile dict produced by the profiler.
        contacts: List of contact dicts.
        trace_id: Correlation id forwarded to the LLM call.

    Returns:
        The signals dict, from the LLM when available, otherwise from
        ``_extract_signals_deterministic``.
    """
    # Position of the 8B model in nvidia_client.MODEL_CONFIGS — signal
    # extraction is simple enough for the smallest model.
    fast_model_index = 2

    try:
        prompt = build_signal_prompt(data, profile, contacts)
        result = await call_llm(
            operation="score",
            system_prompt=SYSTEM_PROMPT,
            user_prompt=prompt,
            model_index=fast_model_index,
            temperature=0.1,
            max_tokens=400,
            json_mode=True,
            trace_id=trace_id,
            company_id=data.get("id"),
        )

        parsed = result.get("parsed")
        if parsed and isinstance(parsed, dict):
            return parsed
    except Exception as e:
        logger.warning(f"Signal extraction failed: {e}")

    # Fallback: extract signals from raw data
    return _extract_signals_deterministic(data, profile, contacts)
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
def _extract_signals_deterministic(data, profile, contacts) -> dict:
|
| 134 |
+
"""Rule-based signal extraction when LLM fails."""
|
| 135 |
+
has_email = any(c.get("email_verified") for c in contacts)
|
| 136 |
+
has_linkedin = any(c.get("linkedin_personal_url") for c in contacts)
|
| 137 |
+
|
| 138 |
+
return {
|
| 139 |
+
"company_fit_signals": {
|
| 140 |
+
"industry_match": bool(data.get("industry")),
|
| 141 |
+
"size_appropriate": (data.get("employee_count") or 0) >= 3,
|
| 142 |
+
"evidence": "deterministic",
|
| 143 |
+
},
|
| 144 |
+
"ai_readiness_signals": {
|
| 145 |
+
"level": profile.get("ai_readiness", "low"),
|
| 146 |
+
"tech_stack_relevant": len(data.get("tech_stack", [])) > 0,
|
| 147 |
+
"ai_jobs_present": data.get("ai_job_count", 0) > 0,
|
| 148 |
+
"evidence": "deterministic",
|
| 149 |
+
},
|
| 150 |
+
"service_match_signals": {
|
| 151 |
+
"matched": bool(data.get("service_match")),
|
| 152 |
+
"service_name": data.get("service_match", "NONE"),
|
| 153 |
+
"pain_count": len(data.get("pain_signals", [])),
|
| 154 |
+
"evidence": "deterministic",
|
| 155 |
+
},
|
| 156 |
+
"contact_quality_signals": {
|
| 157 |
+
"email_verified": has_email,
|
| 158 |
+
"linkedin_found": has_linkedin,
|
| 159 |
+
"decision_maker_identified": len(contacts) > 0,
|
| 160 |
+
},
|
| 161 |
+
"timing_signals": {
|
| 162 |
+
"actively_growing": data.get("ai_job_count", 0) > 0,
|
| 163 |
+
"recently_active": True,
|
| 164 |
+
"evidence": "deterministic",
|
| 165 |
+
},
|
| 166 |
+
"confidence": 0.5,
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
# ─── Deterministic score computation ─────────────────────────
|
| 171 |
+
# This is where the ACTUAL score is calculated.
|
| 172 |
+
# No LLM involved — pure math from signals.
|
| 173 |
+
|
| 174 |
+
def _compute_deterministic_score(signals: dict, data: dict, profile: dict, contacts: list) -> dict:
    """
    Convert extracted signals into a 0-100 lead score with fixed arithmetic.

    No LLM is involved here. Because ``signals`` may be raw LLM JSON, the
    values read from it are coerced defensively: a signal group that is
    missing or not a dict counts as empty, counts that are null / non-numeric
    / negative count as 0, and the readiness level is matched
    case-insensitively.

    Weights:
        company_fit:    25 pts
        ai_readiness:   20 pts
        service_match:  20 pts  (NEW — replaces old AI readiness weight)
        decision_maker: 20 pts
        timing:         15 pts

    Args:
        signals: signal groups keyed by their ``*_signals`` name.
        data: company record; reads ``employee_count`` and ``growth_signals``.
        profile: accepted for interface compatibility; not read here.
        contacts: contact dicts; only the ``email`` key is read.

    Returns:
        dict with the per-category scores, ``total_score``, ``tier`` and a
        human-readable ``score_breakdown``.
    """
    signals = signals if isinstance(signals, dict) else {}
    data = data if isinstance(data, dict) else {}
    contacts = [c for c in (contacts or []) if isinstance(c, dict)]

    def _group(name: str) -> dict:
        # An LLM may emit `"timing_signals": null` or a string for a group.
        value = signals.get(name)
        return value if isinstance(value, dict) else {}

    def _count(value) -> int:
        # Coerce to a non-negative int; anything unparseable counts as 0.
        try:
            return max(0, int(value))
        except (TypeError, ValueError, OverflowError):
            return 0

    # ── Company Fit (25 pts) ──────────────────────────────────
    fit = _group("company_fit_signals")
    company_fit = 0
    if fit.get("industry_match"):
        company_fit += 10
    if fit.get("size_appropriate"):
        company_fit += 10
    emp = _count(data.get("employee_count"))
    if emp >= 200:
        company_fit += 5
    elif emp >= 50:
        company_fit += 3
    elif emp >= 10:
        company_fit += 1

    # ── AI Readiness (20 pts) ─────────────────────────────────
    ai_sig = _group("ai_readiness_signals")
    level = str(ai_sig.get("level") or "low").strip().lower()
    ai_readiness = {"high": 12, "medium": 8, "low": 3}.get(level, 0)
    if ai_sig.get("tech_stack_relevant"):
        ai_readiness += 4
    if ai_sig.get("ai_jobs_present"):
        ai_readiness += 4
    ai_readiness = min(20, ai_readiness)

    # ── Service Match (20 pts) — KEY DIFFERENTIATOR ───────────
    svc = _group("service_match_signals")
    service_match = 0
    if svc.get("matched"):
        service_match += 10
    # 3 pts per pain signal, capped at 10.
    service_match += min(10, _count(svc.get("pain_count")) * 3)
    service_match = min(20, service_match)

    # ── Decision Maker Access (20 pts) ────────────────────────
    contact = _group("contact_quality_signals")
    dm = 0
    if contact.get("email_verified"):
        dm += 12
    elif any(c.get("email") for c in contacts):
        # An unverified address still earns half credit.
        dm += 6
    if contact.get("linkedin_found"):
        dm += 5
    if contact.get("decision_maker_identified"):
        dm += 3
    dm = min(20, dm)

    # ── Timing (15 pts) ───────────────────────────────────────
    timing = _group("timing_signals")
    timing_score = 5  # base: company exists and has website
    if timing.get("actively_growing"):
        timing_score += 5
    if timing.get("recently_active"):
        timing_score += 3
    if len(data.get("growth_signals") or []) >= 2:
        timing_score += 2
    timing_score = min(15, timing_score)

    # ── Total ─────────────────────────────────────────────────
    total = company_fit + ai_readiness + service_match + dm + timing_score
    tier = _score_to_tier(total)

    return {
        "company_fit": company_fit,
        "ai_readiness_score": ai_readiness,
        "service_match_score": service_match,
        "decision_maker_access": dm,
        "timing_score": timing_score,
        "total_score": total,
        "tier": tier,
        "score_breakdown": {
            "company_fit": f"{company_fit}/25",
            "ai_readiness": f"{ai_readiness}/20",
            "service_match": f"{service_match}/20",
            "decision_maker": f"{dm}/20",
            "timing": f"{timing_score}/15",
        },
        "score_reasoning": f"Deterministic score from {len(signals)} signal groups",
        "llm_model": "deterministic_scorer",
        "is_fallback": False,
    }
|
| 254 |
+
|
| 255 |
+
|
| 256 |
+
def _score_to_tier(score: int) -> str:
|
| 257 |
+
if score >= 85: return "hot"
|
| 258 |
+
if score >= 70: return "warm"
|
| 259 |
+
if score >= 50: return "nurture"
|
| 260 |
+
return "archive"
|
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { task } from "@trigger.dev/sdk/v3";
|
| 2 |
+
import axios from "axios";
|
| 3 |
+
import { getEnv } from "../../shared/config/env";
|
| 4 |
+
import { getSupabaseClient } from "../../shared/supabase/client";
|
| 5 |
+
import { logger, auditLog } from "../../shared/utils/logger";
|
| 6 |
+
import { CompanyDiscoveredPayload } from "../../shared/supabase/schema";
|
| 7 |
+
|
| 8 |
+
/**
 * Profiling Router — Trigger.dev task that:
 * 1. Receives company.discovered event
 * 2. Calls Python AI service for LLM profiling + scoring
 * 3. Routes result: qualified → outreach queue, low score → nurture/archive
 *
 * Failure handling:
 * - No HTTP response from the Python service (network error / timeout):
 *   the company is inserted into `human_review_queue` and the run returns
 *   `{ success: false, reason: "python_service_unavailable" }`.
 * - If that queue insert itself fails, the original error is rethrown so the
 *   run is marked failed instead of silently dropping the company.
 * - Any other error (including HTTP error responses) is rethrown.
 */
export const profilingTask = task({
  id: "profiling-router",
  maxDuration: 300, // 5 min per company

  run: async (payload: CompanyDiscoveredPayload) => {
    const { company_id, domain, name, region, source } = payload;
    const env = getEnv();
    const db = getSupabaseClient();

    logger.info({ company_id, domain }, "🧠 Profiling started");

    try {
      // ── Call Python AI Service ──────────────────────────────
      const response = await axios.post(
        `${env.PYTHON_AI_SERVICE_URL}/profile`,
        { company_id, domain, name, region, source },
        {
          headers: {
            "Content-Type": "application/json",
            "x-service-secret": env.PYTHON_AI_SERVICE_SECRET,
          },
          timeout: 120_000, // 2 min timeout for LLM
        }
      );

      const result = response.data;
      logger.info(
        { company_id, score: result.total_score, tier: result.tier },
        "✅ Profiling complete"
      );

      // ── Route based on score tier ───────────────────────────
      await routeByTier(company_id, result, db, env);

      // ── Audit log ───────────────────────────────────────────
      auditLog("lead_profiled", "company", {
        company_id,
        domain,
        score: result.total_score,
        tier: result.tier,
        is_fallback: result.is_fallback,
      });

      return result;
    } catch (err: unknown) {
      // ── Python service unavailable → fallback ───────────────
      // `!err.response` means the request never got an HTTP response.
      if (axios.isAxiosError(err) && !err.response) {
        logger.error(
          { company_id, domain, code: err.code, message: err.message },
          "Python service unreachable — queuing for review"
        );

        // The Supabase client reports failures via `error` rather than throwing.
        const { error: queueError } = await db.from("human_review_queue").insert({
          type: "score_anomaly",
          company_id,
          payload: { reason: "python_service_unavailable", domain },
        });

        if (queueError) {
          logger.error(
            { company_id, domain, queueError },
            "Failed to queue company for human review"
          );
          // Neither profiling nor the fallback succeeded — surface the failure.
          throw err;
        }

        return { success: false, reason: "python_service_unavailable" };
      }
      throw err;
    }
  },
});
|
| 73 |
+
|
| 74 |
+
// ─── Score-based routing ──────────────────────────────────────
|
| 75 |
+
|
| 76 |
+
/**
 * Routes a profiled company according to its score tier.
 *
 * - "hot" / "warm": either queued for human approval (when the result asks
 *   for review and HUMAN_REVIEW_ENABLED is on) or announced as qualified.
 * - "nurture": logged only (re-scoring is not implemented yet).
 * - anything else: logged as archived.
 *
 * A failed review-queue insert is logged at error level; the Slack review
 * notification is still attempted so a human sees the lead.
 */
async function routeByTier(
  companyId: string,
  result: { total_score: number; tier: string; needs_human_review: boolean },
  db: ReturnType<typeof getSupabaseClient>,
  env: ReturnType<typeof getEnv>
) {
  const { tier, total_score, needs_human_review } = result;

  if (tier === "hot" || tier === "warm") {
    if (needs_human_review && env.HUMAN_REVIEW_ENABLED) {
      // Queue for human approval before outreach
      logger.info({ companyId, tier }, "Routing to human review queue");

      // The Supabase client reports failures via `error` rather than throwing.
      const { error: queueError } = await db.from("human_review_queue").insert({
        type: "outreach_approval",
        company_id: companyId,
        payload: { score: total_score, tier, reason: "human_review_required" },
      });
      if (queueError) {
        logger.error(
          { companyId, tier, queueError },
          "Failed to insert outreach approval into human review queue"
        );
      }

      await notifySlack(companyId, total_score, tier, env, "review");
    } else {
      // Qualified — trigger outreach (Step 3, to be built)
      logger.info({ companyId, tier, score: total_score }, "🚀 Routing to outreach queue");
      await notifySlack(companyId, total_score, tier, env, "qualified");

      // Future: trigger outreach task
      // await outreachTask.trigger({ company_id: companyId, tier });
    }
  } else if (tier === "nurture") {
    logger.info({ companyId }, "Routing to nurture — re-score in 30 days");
    // Future: schedule re-scoring task
  } else {
    logger.info({ companyId }, "Archived — score too low");
  }
}
|
| 109 |
+
|
| 110 |
+
/**
 * Posts a lead notification to Slack. Best-effort: every failure is logged
 * as a warning and never propagates to the caller.
 *
 * @param companyId Row id looked up in the `companies` table.
 * @param score     Total score shown as `<score>/100`.
 * @param tier      Tier label; "hot" gets the fire emoji.
 * @param env       Validated environment (bot token and channel ids).
 * @param type      "review" posts to the review channel, "qualified" to the alert channel.
 */
async function notifySlack(
  companyId: string,
  score: number,
  tier: string,
  env: ReturnType<typeof getEnv>,
  type: "qualified" | "review"
) {
  // Slack asks for only these three characters to be escaped in message text.
  const escapeMrkdwn = (value: unknown): string =>
    String(value).replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");

  try {
    const db = getSupabaseClient();
    const { data: company, error: lookupError } = await db
      .from("companies")
      .select("name, domain, industry, employee_count")
      .eq("id", companyId)
      .single();

    if (lookupError || !company) {
      logger.warn({ companyId, lookupError }, "Slack notification skipped — company lookup failed");
      return;
    }

    const emoji = tier === "hot" ? "🔥" : "✅";
    const action = type === "review" ? "⏳ Needs Review" : "📤 Ready for Outreach";

    // Company fields are escaped so they cannot inject mrkdwn control sequences.
    const message = {
      text: `${emoji} New Qualified Lead — ${action}`,
      blocks: [
        {
          type: "section",
          text: {
            type: "mrkdwn",
            text: `*${emoji} ${escapeMrkdwn(company.name)}*\n${action}\n\n` +
              `• *Score:* ${score}/100 — ${tier.toUpperCase()}\n` +
              `• *Industry:* ${escapeMrkdwn(company.industry ?? "Unknown")}\n` +
              `• *Employees:* ${escapeMrkdwn(company.employee_count ?? "Unknown")}\n` +
              `• *Domain:* ${escapeMrkdwn(company.domain)}`,
          },
        },
      ],
    };

    const channelId = type === "review" ? env.SLACK_REVIEW_CHANNEL_ID : env.SLACK_ALERT_CHANNEL_ID;

    const response = await axios.post(
      "https://slack.com/api/chat.postMessage",
      {
        channel: channelId,
        ...message,
      },
      {
        headers: { Authorization: `Bearer ${env.SLACK_BOT_TOKEN}` },
        timeout: 10_000, // keep a stalled Slack call from holding up the task
      }
    );

    // Slack reports API-level failures with HTTP 200 and `ok: false`.
    if (!response.data?.ok) {
      logger.warn(
        { companyId, channelId, slackError: response.data?.error },
        "Slack API rejected notification — non-critical"
      );
    }
  } catch (err) {
    logger.warn({ err }, "Slack notification failed — non-critical");
  }
}
|
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { z } from "zod";
|
| 2 |
+
import * as dotenv from "dotenv";
|
| 3 |
+
|
| 4 |
+
dotenv.config();
|
| 5 |
+
|
| 6 |
+
/**
 * Schema for every environment variable the system reads.
 *
 * Values arrive as strings from `process.env`; numeric settings are coerced
 * and settings with a `.default(...)` may be omitted.
 */
const envSchema = z.object({
  // ─── LLM (All on NVIDIA NIM — FREE) ────────────────────────
  NVIDIA_API_KEY: z.string().min(5),
  NVIDIA_NIM_BASE_URL: z.string().url().default("https://integrate.api.nvidia.com/v1"),

  // ─── Supabase ──────────────────────────────────────────────
  SUPABASE_URL: z.string().url(),
  SUPABASE_SERVICE_ROLE_KEY: z.string().min(10),

  // ─── Trigger.dev ───────────────────────────────────────────
  TRIGGER_DEV_API_KEY: z.string().min(5),
  TRIGGER_DEV_PROJECT_ID: z.string().min(3),

  // ─── Web Research ──────────────────────────────────────────
  SERPER_API_KEY: z.string().min(5),

  // ─── Email Finding ─────────────────────────────────────────
  HUNTER_API_KEY: z.string().min(5),

  // ─── Email Verification ────────────────────────────────────
  REOON_API_KEY: z.string().min(5),

  // ─── Slack ─────────────────────────────────────────────────
  SLACK_BOT_TOKEN: z.string().startsWith("xoxb-"),
  SLACK_SIGNING_SECRET: z.string().min(5),
  SLACK_ALERT_CHANNEL_ID: z.string(),
  SLACK_REVIEW_CHANNEL_ID: z.string(),

  // ─── Python AI Service ─────────────────────────────────────
  PYTHON_AI_SERVICE_URL: z.string().url().default("http://localhost:8000"),
  PYTHON_AI_SERVICE_SECRET: z.string().min(10),

  // ─── System Config ─────────────────────────────────────────
  NODE_ENV: z.enum(["development", "staging", "production"]).default("development"),
  LOG_LEVEL: z.enum(["debug", "info", "warn", "error"]).default("info"),
  DAILY_LEAD_QUOTA: z.coerce.number().default(10),
  QUALITY_SCORE_THRESHOLD: z.coerce.number().default(70),
  // Parsed case-insensitively and validated: a value such as "True" or "yes"
  // must not silently turn human review OFF.
  HUMAN_REVIEW_ENABLED: z
    .string()
    .transform((v) => v.trim().toLowerCase())
    .refine((v) => v === "true" || v === "false", {
      message: 'must be "true" or "false"',
    })
    .transform((v) => v === "true")
    .default("true"),
  DAILY_EMAIL_LIMIT: z.coerce.number().default(50),
  DAILY_LINKEDIN_LIMIT: z.coerce.number().default(25),
  // Hour of day in UTC, so it must be a whole number from 0 to 23.
  SCHEDULE_START_HOUR_UTC: z.coerce.number().int().min(0).max(23).default(4),
});
|
| 48 |
+
|
| 49 |
+
type Env = z.infer<typeof envSchema>;

// Parsed configuration, filled in on the first getEnv() call.
let _env: Env;

/**
 * Returns the validated environment, parsing `process.env` on first use and
 * reusing the parsed result afterwards.
 *
 * On invalid configuration it prints one line per issue to stderr and
 * terminates the process with exit code 1.
 */
export function getEnv(): Env {
  if (_env) {
    return _env;
  }

  const parsed = envSchema.safeParse(process.env);
  if (!parsed.success) {
    console.error("❌ Invalid environment configuration:");
    for (const issue of parsed.error.errors) {
      console.error(` ${issue.path.join(".")}: ${issue.message}`);
    }
    process.exit(1);
  }

  _env = parsed.data;
  return _env;
}
|
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Grounded Hallucination Detection
|
| 3 |
+
*
|
| 4 |
+
* Google DeepMind approach: Every LLM claim must be traceable
|
| 5 |
+
* to a piece of evidence. Claims without evidence are stripped.
|
| 6 |
+
*
|
| 7 |
+
* This is NOT "ask LLM for confidence" — that's like asking
|
| 8 |
+
* a cheater to grade their own exam.
|
| 9 |
+
*
|
| 10 |
+
* This IS: cross-reference every output field against source data.
|
| 11 |
+
*/
|
| 12 |
+
|
| 13 |
+
import { logger } from "../utils/logger";
|
| 14 |
+
|
| 15 |
+
/** Outcome of checking an LLM output against collected evidence. */
export interface GroundingResult {
  /** Overall pass/fail verdict derived from the score (and stripped claims). */
  isGrounded: boolean;
  /** Share of checks that passed, 0.0-1.0. */
  groundingScore: number;
  /** Claims that match evidence. */
  verifiedClaims: string[];
  /** Claims with no evidence. */
  unverifiedClaims: string[];
  /** Claims removed from output. */
  strippedClaims: string[];
  /** Field name → the value the LLM claimed vs. the value evidence supports. */
  corrections: Record<string, { claimed: unknown; actual: unknown }>;
}
|
| 23 |
+
|
| 24 |
+
/**
 * Factual data collected from providers/scrapers, used as the reference
 * that LLM claims are checked against.
 */
export interface EvidenceSet {
  // Identity
  company_name: string;
  domain: string;

  // Firmographics (null when unknown)
  employee_count: number | null;
  industry: string | null;
  country: string | null;
  city: string | null;

  // Descriptive text
  description: string | null;
  linkedin_description: string | null;
  website_text: string;

  // Technology and hiring
  tech_stack: string[];
  job_postings: string[];
  ai_job_count: number;

  // Pain signals found during discovery
  pain_signals_detected: string[];
}
|
| 40 |
+
|
| 41 |
+
/**
 * Validates LLM profile output against collected evidence.
 *
 * Checks performed:
 * - summary mentions the company name (flag only, summary is kept),
 * - an employee count stated in the summary is within 30% of evidence
 *   (otherwise the number in the summary is rewritten),
 * - the summary mentions at least one word of the evidence industry,
 * - each `evidence_used` entry is supported by tech stack, website text,
 *   job postings or detected pain signals,
 * - `ai_readiness` is not contradicted by AI job count / tech stack,
 * - e-mail addresses and phone-like numbers are redacted from every string
 *   in the output, including nested arrays and objects.
 *
 * @param profile  Raw LLM profile object; it is not mutated.
 * @param evidence Facts collected by providers/scrapers.
 * @returns The cleaned profile plus a grounding report.
 */
export function groundProfile(
  profile: Record<string, unknown>,
  evidence: EvidenceSet
): { cleaned: Record<string, unknown>; grounding: GroundingResult } {
  const verified: string[] = [];
  const unverified: string[] = [];
  const stripped: string[] = [];
  const corrections: Record<string, { claimed: unknown; actual: unknown }> = {};

  let cleaned: Record<string, unknown> = { ...profile };

  // ── Check profile_summary ──────────────────────────────────
  const summary = String(profile.profile_summary ?? "");

  // Does summary mention the right company?
  if (summary.length > 20 && !containsName(summary, evidence.company_name)) {
    // The summary text is kept (the LLM may paraphrase the name), but the
    // flag is recorded under `stripped`, which makes `isGrounded` false.
    stripped.push("summary_wrong_company");
  }

  // Does summary claim employee count?
  // `[\d,]*` also accepts single-digit counts ("5 employees"); the lookbehind
  // stops a match from starting inside a larger token such as "24/7 team".
  const claimedEmpMatch = summary.match(
    /(?<![\d.,/-])(\d[\d,]*)\s*(employees?|people|staff|team)/i
  );
  if (claimedEmpMatch && evidence.employee_count) {
    const claimed = parseInt(claimedEmpMatch[1].replace(/,/g, ""), 10);
    if (Math.abs(claimed - evidence.employee_count) > evidence.employee_count * 0.3) {
      corrections["employee_count"] = { claimed, actual: evidence.employee_count };
      // Fix the claim in the summary
      cleaned.profile_summary = summary.replace(
        claimedEmpMatch[0],
        `${evidence.employee_count} employees`
      );
      verified.push("employee_count_corrected");
    } else {
      verified.push("employee_count_accurate");
    }
  }

  // ── Check industry claim ───────────────────────────────────
  if (evidence.industry) {
    const summaryLower = summary.toLowerCase();
    // Drop empty fragments: "".includes-style matches are always true.
    const industryWords = evidence.industry
      .toLowerCase()
      .split(/[\s_]+/)
      .filter((w) => w.length > 0);
    const hasIndustryMention = industryWords.some((w) => summaryLower.includes(w));
    if (hasIndustryMention) {
      verified.push("industry_match");
    } else {
      unverified.push("industry_may_differ");
    }
  }

  // ── Check evidence_used claims ─────────────────────────────
  if (Array.isArray(profile.evidence_used)) {
    const websiteLower = evidence.website_text.toLowerCase();

    for (const rawClaim of profile.evidence_used as unknown[]) {
      // Non-string or blank entries cannot be matched against anything.
      if (typeof rawClaim !== "string" || rawClaim.trim() === "") {
        unverified.push(`unverifiable: ${String(rawClaim).slice(0, 40)}`);
        continue;
      }

      const claimLower = rawClaim.toLowerCase();
      // True when the claim contains the first `length` characters of `source`.
      const mentionsPrefix = (source: string, length: number): boolean => {
        const prefix = source.toLowerCase().slice(0, length);
        return prefix.length > 0 && claimLower.includes(prefix);
      };

      const isSupported =
        evidence.tech_stack.some((t) => t.length > 0 && claimLower.includes(t.toLowerCase())) ||
        websiteLower.includes(claimLower.slice(0, 20)) ||
        evidence.job_postings.some((j) => mentionsPrefix(j, 15)) ||
        evidence.pain_signals_detected.some((p) => mentionsPrefix(p, 15));

      if (isSupported) {
        verified.push(`evidence: ${rawClaim.slice(0, 40)}`);
      } else {
        unverified.push(`unverifiable: ${rawClaim.slice(0, 40)}`);
      }
    }
  }

  // ── Check ai_readiness ─────────────────────────────────────
  const claimedReadiness = String(profile.ai_readiness ?? "");
  if (claimedReadiness === "high" && evidence.ai_job_count === 0 && evidence.tech_stack.length === 0) {
    corrections["ai_readiness"] = { claimed: "high", actual: "low" };
    cleaned.ai_readiness = "low";
    verified.push("ai_readiness_corrected");
  } else if (claimedReadiness === "low" && evidence.ai_job_count >= 3) {
    corrections["ai_readiness"] = { claimed: "low", actual: "high" };
    cleaned.ai_readiness = "high";
    verified.push("ai_readiness_corrected");
  } else {
    verified.push("ai_readiness_plausible");
  }

  // ── Check for PII leakage ──────────────────────────────────
  const outputStr = JSON.stringify(cleaned);
  const emailPattern = /[\w.+-]+@[\w-]+\.[a-z]{2,}/gi;
  const phonePattern = /\+?\d[\d\s\-().]{8,}/g;

  // `search` ignores `lastIndex`, so detection is not affected by the /g flag.
  if (outputStr.search(emailPattern) !== -1) {
    stripped.push("pii_email_in_output");
    cleaned = redactStringsDeep(cleaned, emailPattern, "[EMAIL_REDACTED]") as Record<string, unknown>;
  }

  if (outputStr.search(phonePattern) !== -1) {
    stripped.push("pii_phone_in_output");
    cleaned = redactStringsDeep(cleaned, phonePattern, "[PHONE_REDACTED]") as Record<string, unknown>;
  }

  // ── Compute grounding score ────────────────────────────────
  const totalChecks = verified.length + unverified.length + stripped.length;
  const groundingScore = totalChecks === 0 ? 0.5 : verified.length / totalChecks;

  const result: GroundingResult = {
    isGrounded: groundingScore >= 0.6 && stripped.length === 0,
    groundingScore,
    verifiedClaims: verified,
    unverifiedClaims: unverified,
    strippedClaims: stripped,
    corrections,
  };

  if (!result.isGrounded) {
    logger.warn(
      { groundingScore: groundingScore.toFixed(2), corrections: Object.keys(corrections).length },
      "Profile failed grounding — corrections applied"
    );
  }

  return { cleaned, grounding: result };
}

/**
 * Returns a copy of `value` in which every string, at any depth of nested
 * arrays and objects, has matches of `pattern` replaced by `token`.
 * Non-string leaves are returned unchanged; the input is not mutated.
 * Assumes JSON-like data (plain objects, arrays, primitives).
 */
function redactStringsDeep(value: unknown, pattern: RegExp, token: string): unknown {
  if (typeof value === "string") {
    return value.replace(pattern, token);
  }
  if (Array.isArray(value)) {
    return value.map((item) => redactStringsDeep(item, pattern, token));
  }
  if (value !== null && typeof value === "object") {
    const copy: Record<string, unknown> = {};
    for (const [key, inner] of Object.entries(value as Record<string, unknown>)) {
      copy[key] = redactStringsDeep(inner, pattern, token);
    }
    return copy;
  }
  return value;
}
|
| 172 |
+
|
| 173 |
+
/**
 * Validates scoring signals against evidence.
 * Scores are computed DETERMINISTICALLY from signals —
 * LLM only extracts signals, code computes score.
 *
 * Signals that evidence contradicts are recorded in `corrections` AND
 * overwritten in the returned `cleaned` object, so a score computed from
 * `cleaned` reflects the corrected values. The input is not mutated: each
 * checked signal group is copied before being changed.
 *
 * @param signals  Signal groups as produced by signal extraction.
 * @param evidence Facts collected by providers/scrapers.
 * @returns Corrected signals plus a grounding report.
 */
export function groundSignals(
  signals: Record<string, unknown>,
  evidence: EvidenceSet
): { cleaned: Record<string, unknown>; grounding: GroundingResult } {
  const verified: string[] = [];
  const unverified: string[] = [];
  const corrections: Record<string, { claimed: unknown; actual: unknown }> = {};
  const cleaned: Record<string, unknown> = { ...signals };

  // Returns a shallow copy of a signal group, or undefined when the value is
  // not a plain object (e.g. the LLM returned null, a string or an array).
  const copyGroup = (value: unknown): Record<string, unknown> | undefined =>
    value !== null && typeof value === "object" && !Array.isArray(value)
      ? { ...(value as Record<string, unknown>) }
      : undefined;

  // Verify company_fit_signals
  const fitSignals = copyGroup(signals.company_fit_signals);
  if (fitSignals) {
    if (fitSignals.size_appropriate === true && evidence.employee_count !== null && evidence.employee_count < 3) {
      corrections["size_appropriate"] = { claimed: true, actual: false };
      fitSignals.size_appropriate = false;
      verified.push("size_corrected");
    } else {
      verified.push("size_plausible");
    }
    cleaned.company_fit_signals = fitSignals;
  }

  // Verify ai_readiness_signals
  const aiSignals = copyGroup(signals.ai_readiness_signals);
  if (aiSignals) {
    if (aiSignals.ai_jobs_present === true && evidence.ai_job_count === 0) {
      corrections["ai_jobs_present"] = { claimed: true, actual: false };
      aiSignals.ai_jobs_present = false;
      verified.push("ai_jobs_corrected");
    } else {
      verified.push("ai_jobs_accurate");
    }

    if (aiSignals.tech_stack_relevant === true && evidence.tech_stack.length === 0) {
      corrections["tech_stack_relevant"] = { claimed: true, actual: false };
      aiSignals.tech_stack_relevant = false;
      verified.push("tech_stack_corrected");
    } else {
      verified.push("tech_stack_accurate");
    }
    cleaned.ai_readiness_signals = aiSignals;
  }

  const totalChecks = verified.length + unverified.length;
  const groundingScore = totalChecks === 0 ? 0.5 : verified.length / totalChecks;

  return {
    cleaned,
    grounding: {
      isGrounded: groundingScore >= 0.6,
      groundingScore,
      verifiedClaims: verified,
      unverifiedClaims: unverified,
      strippedClaims: [],
      corrections,
    },
  };
}
|
| 231 |
+
|
| 232 |
+
// ─── Helpers ─────────────────────────────────────────────────
|
| 233 |
+
|
| 234 |
+
/**
 * Reports whether `text` mentions at least one significant word of `name`.
 *
 * The name is split on anything that is not a letter or digit, so
 * punctuation attached to a word ("Acme," / "Inc.") no longer prevents a
 * match. Words of two characters or fewer, legal-form suffixes and the
 * filler words "the" / "and" are ignored because they would match almost
 * any text. Matching is a case-insensitive substring test.
 */
function containsName(text: string, name: string): boolean {
  const ignoredWords = new Set(["inc", "llc", "ltd", "corp", "gmbh", "plc", "pty", "the", "and"]);
  const textLower = text.toLowerCase();
  const words = name.toLowerCase().split(/[^\p{L}\p{N}]+/u);
  // At least one significant word from company name should be present
  return words.some((w) => w.length > 2 && !ignoredWords.has(w) && textLower.includes(w));
}
|
|
@@ -0,0 +1,307 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Multi-Model LLM Client — All FREE on NVIDIA NIM
|
| 3 |
+
*
|
| 4 |
+
* 3 models, 1 provider, 1 API key, $0 cost:
|
| 5 |
+
*
|
| 6 |
+
* Priority 1: MiniMax M2.7 → Best reasoning, 4M context, built-in CoT
|
| 7 |
+
* Priority 2: LLaMA 3.3 70B → Reliable, proven, 128K context
|
| 8 |
+
* Priority 3: LLaMA 3.1 8B → Fast, cheap, for simple tasks
|
| 9 |
+
* Priority 4: Deterministic → Zero LLM, zero hallucination
|
| 10 |
+
*
|
| 11 |
+
* All on: https://integrate.api.nvidia.com/v1
|
| 12 |
+
* All use: same NVIDIA_API_KEY
|
| 13 |
+
*
|
| 14 |
+
* MiniMax M2.7 special feature:
|
| 15 |
+
* Response includes `reasoning_content` field — chain-of-thought
|
| 16 |
+
* reasoning happens AUTOMATICALLY inside the model.
|
| 17 |
+
* We don't need to prompt "think step by step" — it does it natively.
|
| 18 |
+
*/
|
| 19 |
+
|
| 20 |
+
import axios, { AxiosError } from "axios";
|
| 21 |
+
import { createHash } from "crypto";
|
| 22 |
+
import { getEnv } from "../config/env";
|
| 23 |
+
import { getSupabaseClient } from "../supabase/client";
|
| 24 |
+
import { logger } from "../utils/logger";
|
| 25 |
+
|
| 26 |
+
// ─── Types ───────────────────────────────────────────────────
|
| 27 |
+
|
| 28 |
+
/** Input for a single LLM call. */
export interface LLMRequest {
  /** Label for the kind of work being done. */
  operation: string;
  /** Index into the model list: 0=MiniMax, 1=LLaMA70B, 2=LLaMA8B. */
  modelIndex?: number;
  /** System prompt text. */
  systemPrompt: string;
  /** User prompt text. */
  userPrompt: string;
  /** Sampling temperature. */
  temperature?: number;
  /** Completion token limit. */
  maxTokens?: number;
  /** Whether a JSON response is requested. */
  jsonMode?: boolean;
  /** Correlation id for this call. */
  traceId: string;
  /** Company the call relates to, when there is one. */
  companyId?: string;
}
|
| 39 |
+
|
| 40 |
+
/** Result returned by `callLLM`, whether it came from a model or the deterministic fallback. */
export interface LLMResponse {
  /** Display name of the model that answered, or "deterministic_fallback". */
  model: string;
  /** "nvidia" for model answers, "none" for the deterministic fallback. */
  provider: string;
  /** Raw message content returned by the model ("" for the fallback). */
  content: string;
  /** MiniMax's built-in chain-of-thought (`reasoning_content`), or null when absent. */
  reasoning: string | null;
  /** Parsed JSON object when `jsonMode` was requested; otherwise null. */
  parsed: Record<string, unknown> | null;
  /** Token usage reported by the API (zeros when not reported). */
  tokens: { prompt: number; completion: number; total: number };
  /** Wall-clock time of the HTTP call in milliseconds. */
  latencyMs: number;
  /** True for model answers, false for the deterministic fallback. */
  grounded: boolean;
  /** True only when every model failed and the deterministic fallback was returned. */
  fallbackUsed: boolean;
}
|
| 51 |
+
|
| 52 |
+
// ─── Model configs (ALL on NVIDIA NIM, ALL FREE) ─────────────
|
| 53 |
+
|
| 54 |
+
/** Static description of one model hosted on NVIDIA NIM. */
interface ModelConfig {
  /** Human-readable name used in logs and in `LLMResponse.model`. */
  name: string;
  /** Model identifier sent as `model` in the chat-completions request body. */
  model: string;
  /** Context window size in tokens (informational in this file). */
  maxContext: number;
  /** Free-text note on the tasks this model is intended for. */
  bestFor: string;
}
|
| 60 |
+
|
| 61 |
+
/**
 * Models in fallback order. `callLLM` starts at `request.modelIndex` and moves
 * to the next entry whenever the current one fails.
 */
const MODEL_CONFIGS: ModelConfig[] = [
  // 0 — primary
  {
    name: "MiniMax M2.7",
    model: "minimaxai/minimax-m2.7",
    maxContext: 4_000_000, // 4M tokens
    bestFor: "profiling, scoring, complex reasoning",
  },
  // 1 — fallback
  {
    name: "LLaMA 3.3 70B",
    model: "meta/llama-3.3-70b-instruct",
    maxContext: 128_000,
    bestFor: "general tasks, reliable fallback",
  },
  // 2 — fast model for simple tasks
  {
    name: "LLaMA 3.1 8B",
    model: "meta/llama-3.1-8b-instruct",
    maxContext: 128_000,
    bestFor: "email classification, simple checks",
  },
];
|
| 81 |
+
|
| 82 |
+
/** Named indices into MODEL_CONFIGS, for use as `LLMRequest.modelIndex`. */
export const MODELS = {
  /** Primary — best reasoning. */
  MINIMAX: 0,
  /** Fallback — reliable. */
  LLAMA_70B: 1,
  /** Fast — simple tasks. */
  LLAMA_8B: 2,
  /** Alias of LLAMA_8B. */
  FAST: 2,
} as const;
|
| 88 |
+
|
| 89 |
+
// ─── Main LLM call ──────────────────────────────────────────
|
| 90 |
+
|
| 91 |
+
/** How many times one model is retried after HTTP 429 before moving to the next model. */
const MAX_RATE_LIMIT_RETRIES = 3;
/** Wait used when `retry-after` is missing or is not a plain number of seconds. */
const DEFAULT_RETRY_AFTER_SECONDS = 5;
/** Ceiling for a single rate-limit wait, so one header value cannot stall the pipeline. */
const MAX_RETRY_AFTER_SECONDS = 60;

/**
 * Send a chat-completion request to NVIDIA NIM, falling through the model list on failure.
 *
 * Starting at `request.modelIndex` (default 0), each model in MODEL_CONFIGS is tried in order:
 *  - HTTP 429: wait for `retry-after` seconds (sanitised and capped) and retry the same
 *    model, at most MAX_RATE_LIMIT_RETRIES times;
 *  - any other error, or unparseable JSON when `jsonMode` is set: move to the next model.
 * When no model produces a usable answer, the deterministic fallback is returned
 * (`fallbackUsed: true`, empty content).
 *
 * @param request Prompts, sampling options and trace metadata for the call.
 * @returns The first usable model response, or the deterministic fallback. Failures are
 *          handled by moving to the next model rather than by rejecting.
 */
export async function callLLM(request: LLMRequest): Promise<LLMResponse> {
  const env = getEnv();

  // A negative or NaN index would make MODEL_CONFIGS[index] undefined; start from the
  // primary model instead. An index past the end still goes straight to the fallback.
  const requestedIndex = request.modelIndex ?? 0;
  const firstIndex = requestedIndex >= 0 ? Math.floor(requestedIndex) : 0;

  for (let modelIndex = firstIndex; modelIndex < MODEL_CONFIGS.length; modelIndex++) {
    const config = MODEL_CONFIGS[modelIndex];

    const body: Record<string, unknown> = {
      model: config.model,
      messages: [
        { role: "system", content: request.systemPrompt },
        { role: "user", content: request.userPrompt },
      ],
      temperature: request.temperature ?? 0.2,
      max_tokens: request.maxTokens ?? 1024,
      top_p: 0.9,
    };

    if (request.jsonMode) {
      body.response_format = { type: "json_object" };
    }

    let rateLimitRetries = 0;

    // Attempts against the current model; `break` means "give up on this model".
    while (true) {
      const startTime = Date.now();

      try {
        const response = await axios.post(
          `${env.NVIDIA_NIM_BASE_URL}/chat/completions`,
          body,
          {
            headers: {
              Authorization: `Bearer ${env.NVIDIA_API_KEY}`,
              "Content-Type": "application/json",
            },
            timeout: 90_000, // MiniMax can take longer for reasoning
          }
        );

        const data = response.data;
        const message = data.choices?.[0]?.message;
        const content = message?.content ?? "";
        const reasoning = message?.reasoning_content ?? null; // MiniMax CoT
        const usage = data.usage ?? { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 };
        const latencyMs = Date.now() - startTime;

        let parsed: Record<string, unknown> | null = null;
        if (request.jsonMode) {
          parsed = safeParseJSON(content);
          if (!parsed) {
            logger.warn({ operation: request.operation, model: config.name }, "JSON parse failed → next model");
            break;
          }
        }

        const result: LLMResponse = {
          content,
          reasoning,
          parsed,
          model: config.name,
          provider: "nvidia",
          tokens: {
            prompt: usage.prompt_tokens,
            completion: usage.completion_tokens,
            total: usage.total_tokens,
          },
          latencyMs,
          grounded: true,
          fallbackUsed: false,
        };

        // Log MiniMax reasoning if present
        if (reasoning) {
          logger.debug({ operation: request.operation, reasoning: reasoning.slice(0, 200) },
            "MiniMax reasoning captured");
        }

        await logLLMTrace(request, result, true, config);
        return result;
      } catch (err) {
        if (
          err instanceof AxiosError &&
          err.response?.status === 429 &&
          rateLimitRetries < MAX_RATE_LIMIT_RETRIES
        ) {
          rateLimitRetries++;

          // `retry-after` may be absent or an HTTP-date; parseInt then yields NaN, and
          // sleeping for NaN ms returns immediately. Fall back to the default instead.
          const headerSeconds = parseInt(String(err.response.headers?.["retry-after"] ?? ""), 10);
          const retryAfter =
            Number.isFinite(headerSeconds) && headerSeconds >= 0
              ? Math.min(headerSeconds, MAX_RETRY_AFTER_SECONDS)
              : DEFAULT_RETRY_AFTER_SECONDS;

          logger.warn({ model: config.name, retryAfter }, "Rate limited → waiting");
          await sleep(retryAfter * 1000);
          continue;
        }

        const status = err instanceof AxiosError ? err.response?.status : undefined;
        if (status === 503 || status === 500) {
          logger.warn({ model: config.name, status }, `${config.name} unavailable → next`);
        } else {
          logger.error({ model: config.name, err: String(err).slice(0, 200) }, "LLM call failed → next");
        }
        break;
      }
    }
  }

  return deterministicFallback(request);
}
|
| 190 |
+
|
| 191 |
+
/**
 * Build the empty response returned once every model has failed.
 *
 * Logs an error for the operation and returns a response with no content,
 * zero token usage, `grounded: false` and `fallbackUsed: true`.
 */
function deterministicFallback(request: LLMRequest): LLMResponse {
  const { operation } = request;
  logger.error({ operation }, "ALL models failed → deterministic fallback");

  const emptyResponse: LLMResponse = {
    model: "deterministic_fallback",
    provider: "none",
    content: "",
    reasoning: null,
    parsed: null,
    tokens: { prompt: 0, completion: 0, total: 0 },
    latencyMs: 0,
    grounded: false,
    fallbackUsed: true,
  };
  return emptyResponse;
}
|
| 205 |
+
|
| 206 |
+
// ─── Self-consistency check ──────────────────────────────────
|
| 207 |
+
// NOTE: MiniMax has built-in reasoning → consistency is higher
|
| 208 |
+
// We still do dual-temperature check for critical operations
|
| 209 |
+
|
| 210 |
+
/**
 * Call the LLM at temperature 0.1 and, for critical operations, sample a second
 * time at temperature 0.4 to see whether the two answers agree.
 *
 * Short-circuits (no second call):
 *  - operation is neither "profile" nor "score" → consistent, score 1.0;
 *  - the primary answer is the deterministic fallback → consistent, score 0.5;
 *  - MiniMax M2.7 answered and returned reasoning → consistent, score 0.95.
 * Otherwise the two parsed outputs are compared and the answer counts as
 * consistent when the score is at least 0.75.
 *
 * @returns The primary response plus the consistency verdict and score.
 */
export async function callLLMWithConsistencyCheck(
  request: LLMRequest
): Promise<{ primary: LLMResponse; isConsistent: boolean; consistencyScore: number }> {
  const primary = await callLLM({ ...request, temperature: 0.1 });

  const verdict = (isConsistent: boolean, consistencyScore: number) => ({
    primary,
    isConsistent,
    consistencyScore,
  });

  const isCriticalOperation = request.operation === "profile" || request.operation === "score";
  if (!isCriticalOperation) {
    return verdict(true, 1.0);
  }

  if (primary.fallbackUsed) {
    return verdict(true, 0.5);
  }

  // MiniMax showed its reasoning → trust it; only LLaMA answers get the second sample.
  const minimaxWithReasoning = primary.model === "MiniMax M2.7" && Boolean(primary.reasoning);
  if (minimaxWithReasoning) {
    return verdict(true, 0.95);
  }

  const secondary = await callLLM({ ...request, temperature: 0.4, modelIndex: request.modelIndex });
  const consistencyScore = compareOutputs(primary, secondary);
  return verdict(consistencyScore >= 0.75, consistencyScore);
}
|
| 234 |
+
|
| 235 |
+
/**
 * Score the agreement between two parsed LLM outputs in the range 0..1.
 *
 * - Returns 0.5 when either response has no parsed JSON.
 * - "ai_readiness", "tier" and "service_match" are compared for strict equality
 *   when the key is present on both sides.
 * - "total_score" and "company_fit" are compared when both values are numbers
 *   and count as a match when they differ by at most 10.
 * - Returns 1.0 when no key was comparable, else matches / comparisons.
 */
function compareOutputs(a: LLMResponse, b: LLMResponse): number {
  const left = a.parsed;
  const right = b.parsed;
  if (!left || !right) return 0.5;

  const categoricalKeys = ["ai_readiness", "tier", "service_match"];
  const numericKeys = ["total_score", "company_fit"];

  // One entry per comparable key: true = the two outputs agree on it.
  const verdicts: boolean[] = [];

  categoricalKeys.forEach((key) => {
    if (key in left && key in right) {
      verdicts.push(left[key] === right[key]);
    }
  });

  numericKeys.forEach((key) => {
    const leftValue = left[key];
    const rightValue = right[key];
    if (typeof leftValue === "number" && typeof rightValue === "number") {
      verdicts.push(Math.abs(leftValue - rightValue) <= 10);
    }
  });

  if (verdicts.length === 0) return 1.0;
  const agreed = verdicts.filter((ok) => ok).length;
  return agreed / verdicts.length;
}
|
| 254 |
+
|
| 255 |
+
// ─── Trace logging ───────────────────────────────────────────
|
| 256 |
+
|
| 257 |
+
/**
 * Best-effort insert of one row into `llm_traces` describing an LLM call.
 *
 * Never throws: both a thrown exception and an `error` returned by the
 * Supabase client are logged as warnings and otherwise ignored.
 *
 * @param request  The request that was sent (trace id, operation, prompt hash source).
 * @param response The response received, or null when there was none.
 * @param success  Whether the call is considered successful.
 * @param config   Model config, used for the model name when `response` is null.
 */
async function logLLMTrace(
  request: LLMRequest,
  response: LLMResponse | null,
  success: boolean,
  config?: ModelConfig
): Promise<void> {
  try {
    const db = getSupabaseClient();
    // The Supabase client reports failures through the resolved `error` field
    // rather than by throwing, so it must be inspected explicitly.
    const { error } = await db.from("llm_traces").insert({
      trace_id: request.traceId,
      operation: request.operation,
      model: response?.model ?? config?.name ?? "unknown",
      provider: "nvidia",
      prompt_tokens: response?.tokens.prompt ?? 0,
      completion_tokens: response?.tokens.completion ?? 0,
      total_tokens: response?.tokens.total ?? 0,
      latency_ms: response?.latencyMs ?? 0,
      success,
      fallback_used: response?.fallbackUsed ?? true,
      company_id: request.companyId ?? null,
      // Only the first 200 characters are hashed; the prompt text itself is not stored.
      input_hash: hashText(request.userPrompt.slice(0, 200)),
      output_hash: response ? hashText(response.content.slice(0, 200)) : null,
    });

    if (error) {
      logger.warn({ err: error.message }, "Trace log failed — non-critical");
    }
  } catch (err) {
    logger.warn({ err }, "Trace log failed — non-critical");
  }
}
|
| 284 |
+
|
| 285 |
+
// ─── Helpers ─────────────────────────────────────────────────
|
| 286 |
+
|
| 287 |
+
/**
 * Leniently extract a JSON object from model output.
 *
 * Handles three shapes: bare JSON, JSON inside a Markdown code fence
 * (```json ... ``` preferred over a plain ``` fence; a missing closing fence is
 * tolerated), and JSON embedded in surrounding prose (first `{` to last `}`).
 *
 * Only plain objects are returned. A response that parses to a string, number,
 * boolean, null or array yields null (or the object found by the brace scan),
 * so callers can safely use the `in` operator and property access on the result.
 *
 * @param text Raw message content from the model.
 * @returns The parsed object, or null when no JSON object could be extracted.
 */
function safeParseJSON(text: string): Record<string, unknown> | null {
  const isPlainObject = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null && !Array.isArray(value);

  const tryParse = (candidate: string): Record<string, unknown> | null => {
    try {
      const value: unknown = JSON.parse(candidate);
      return isPlainObject(value) ? value : null;
    } catch {
      return null;
    }
  };

  const trimmed = text.trim();

  // Take the inside of the first fence; `(?:```|$)` accepts an unclosed fence
  // (e.g. output truncated by max_tokens).
  const fence =
    trimmed.match(/```json([\s\S]*?)(?:```|$)/) ?? trimmed.match(/```([\s\S]*?)(?:```|$)/);
  const content = fence ? fence[1].trim() : trimmed;

  const direct = tryParse(content);
  if (direct) return direct;

  // Fall back to the widest {...} span, for JSON wrapped in prose.
  const braces = content.match(/\{[\s\S]*\}/);
  return braces ? tryParse(braces[0]) : null;
}
|
| 300 |
+
|
| 301 |
+
/** Return the first 16 hex characters of the SHA-256 digest of `text`. */
function hashText(text: string): string {
  const hasher = createHash("sha256");
  hasher.update(text);
  const hexDigest = hasher.digest("hex");
  return hexDigest.slice(0, 16);
}
|
| 304 |
+
|
| 305 |
+
/** Resolve after `ms` milliseconds (timer-based, never rejects). */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Production-grade prompt library.
|
| 3 |
+
*
|
| 4 |
+
* Design principles (Google/Anthropic standard):
|
| 5 |
+
* 1. Chain-of-thought: Force reasoning before conclusion
|
| 6 |
+
* 2. Few-shot examples: 2-3 examples for each prompt
|
| 7 |
+
* 3. Structured output: Exact JSON schema specified
|
| 8 |
+
* 4. Grounding instruction: "Only state what evidence supports"
|
| 9 |
+
* 5. Anti-hallucination: "Write UNKNOWN if data not provided"
|
| 10 |
+
* 6. Token-efficient: No verbose instructions, no repetition
|
| 11 |
+
*/
|
| 12 |
+
|
| 13 |
+
// ─── SYSTEM PROMPTS ──────────────────────────────────────────
|
| 14 |
+
|
| 15 |
+
/** System messages, one per LLM task. Each fixes the model's role and its hard rules. */
export const SYSTEM_PROMPTS = {
  /** Company profiling: where the agency's AI services could help. */
  PROFILER: `You are a business analyst for an AI automation agency.
Your job: analyze a company and identify WHERE our AI services can help them.

CRITICAL RULES:
- Only state facts supported by the provided evidence
- Write "UNKNOWN" for anything not in the data — NEVER guess
- Your analysis determines whether a real salesperson contacts this company
- Wrong analysis = wasted human time = unacceptable
- Think step by step before concluding`,

  /** Signal extraction for scoring; the score itself is computed outside the LLM. */
  SCORER: `You are a lead qualification engine.
Your job: extract SIGNALS from company data. You do NOT compute the final score.
The system computes scores deterministically from your signal extraction.

CRITICAL RULES:
- Extract only what the evidence supports
- For each signal, cite which piece of evidence supports it
- If evidence is weak or missing, say so honestly
- Output ONLY the structured JSON requested`,

  /** Decides whether an email address is likely to reach a decision-maker. */
  EMAIL_CLASSIFIER: `You are a B2B email quality analyst.
Your job: determine if a specific email address reaches a decision-maker.
Consider company size, industry, and the email prefix meaning in context.

CRITICAL RULES:
- Small company (<20 people): admin@, operations@, office@ likely reaches owner
- Large company (200+): same prefixes likely reach departments, not individuals
- NEVER assume — reason from the evidence provided
- When uncertain, err on the side of KEEPING the email (mark confidence low)`,

  /** Detects manual-process pain points that automation could address. */
  PAIN_DETECTOR: `You are an operations efficiency analyst.
Your job: identify operational pain points in a company that AI automation can solve.
You are NOT looking for companies that already use AI.
You ARE looking for companies with manual, repetitive, or inefficient processes.

CRITICAL RULES:
- A phone number on homepage = manual call handling (pain point)
- "Book by phone" = no online scheduling (pain point)
- No chatbot visible = manual customer interaction (pain point)
- Small staff + many services = overworked team (pain point)
- These are REAL signals, not guesses`,
} as const;
|
| 58 |
+
|
| 59 |
+
|
| 60 |
+
// ─── PROFILING PROMPT ────────────────────────────────────────
|
| 61 |
+
|
| 62 |
+
/**
 * Build the user prompt for company profiling.
 *
 * Renders the company facts (missing values become "UNKNOWN" / "NONE" markers,
 * website text is cut to 600 characters), a five-step reasoning outline, the
 * JSON schema to produce, and two worked examples.
 * `job_postings` is accepted but not rendered; only `ai_job_count` appears.
 *
 * @param companyData Facts collected about the company.
 * @returns The prompt text.
 */
export function buildProfilePrompt(companyData: {
  name: string;
  industry: string;
  employee_count: number | null;
  description: string;
  website_text: string;
  tech_stack: string[];
  job_postings: string[];
  ai_job_count: number;
  linkedin_description: string;
  pain_signals: string[];
  service_match: string | null;
}): string {
  const industry = companyData.industry || "UNKNOWN";
  const employees = companyData.employee_count ?? "UNKNOWN";
  const description = companyData.description || "NONE PROVIDED";
  const websiteExcerpt = (companyData.website_text || "").slice(0, 600);
  const linkedin = companyData.linkedin_description || "NONE";
  const techStack = companyData.tech_stack.length ? companyData.tech_stack.join(", ") : "NONE DETECTED";
  const painSignals = companyData.pain_signals.length ? companyData.pain_signals.join(", ") : "NONE";
  const serviceMatch = companyData.service_match || "NONE";

  return `ANALYZE THIS COMPANY:

Name: ${companyData.name}
Industry: ${industry}
Employees: ${employees}
Description: ${description}

Website excerpt (first 600 chars):
${websiteExcerpt}

LinkedIn description:
${linkedin}

Tech stack detected: ${techStack}
Job postings mentioning AI/automation: ${companyData.ai_job_count}
Pain signals detected: ${painSignals}
Service match suggestion: ${serviceMatch}

STEP-BY-STEP ANALYSIS:

Step 1: What does this company actually DO? (2 sentences, facts only)
Step 2: What are their likely daily operational challenges? (based on industry + size)
Step 3: What specific AI automation would save them time/money? (be specific)
Step 4: Who in this organization would approve buying this service?
Step 5: What outreach angle would resonate with this specific person?

After reasoning through steps 1-5, output this JSON:
{
"profile_summary": "2-3 factual sentences about what this company does",
"pain_points": ["specific pain 1", "specific pain 2"],
"ai_use_case": "The single most compelling AI use case for them",
"ai_readiness": "low|medium|high",
"decision_maker_reasoning": "Who likely makes purchasing decisions and why",
"outreach_angle": "One specific sentence — the hook for first contact",
"confidence": 0.0,
"evidence_used": ["list which data points you relied on"],
"evidence_missing": ["list what data you wished you had"]
}

EXAMPLE 1 (dental clinic, 6 employees):
{
"profile_summary": "ABC Dental is a 6-person dental practice in Houston offering general and cosmetic dentistry. They display their phone number prominently and use a basic contact form for appointments.",
"pain_points": ["Manual phone-based appointment scheduling during business hours only", "No after-hours patient communication capability"],
"ai_use_case": "AI receptionist to handle appointment booking, reminders, and after-hours calls",
"ai_readiness": "low",
"decision_maker_reasoning": "Practice owner (Dr. Smith, DDS) makes all purchasing decisions. Small practice = owner controls budget directly.",
"outreach_angle": "Stop losing patients to voicemail — our AI receptionist books appointments 24/7, even when your front desk is closed",
"confidence": 0.82,
"evidence_used": ["phone number on homepage", "contact form only", "6 staff listed", "no chatbot detected"],
"evidence_missing": ["annual revenue", "number of daily calls", "current scheduling software"]
}

EXAMPLE 2 (manufacturing company, 150 employees):
{
"profile_summary": "XYZ Manufacturing is a UK-based manufacturer of industrial valves with 150 employees. They use SAP for ERP and are hiring a Data Analyst, suggesting manual reporting pain.",
"pain_points": ["Manual data extraction from legacy SAP system", "Production reporting requires manual spreadsheet compilation"],
"ai_use_case": "Automated reporting pipeline that extracts SAP data and generates dashboards without manual intervention",
"ai_readiness": "medium",
"decision_maker_reasoning": "Operations Director (found on LinkedIn) manages the data team and would champion this internally. CTO signs off on tech purchases.",
"outreach_angle": "Your Data Analyst job posting tells us you're drowning in manual SAP reports — we automate that entirely",
"confidence": 0.88,
"evidence_used": ["SAP detected in tech stack", "Data Analyst job posting", "150 employees", "manufacturing industry"],
"evidence_missing": ["specific SAP modules used", "current reporting frequency"]
}`;
}
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
// ─── SIGNAL EXTRACTION PROMPT (for scoring) ──────────────────
|
| 143 |
+
|
| 144 |
+
/**
 * Build the user prompt that asks the model to extract scoring signals.
 *
 * The prompt lists the company facts (empty lists and missing values become
 * "NONE" / "UNKNOWN") followed by the JSON schema of signals to return. The
 * prompt text tells the model not to compute a score itself.
 *
 * @param companyData Facts and boolean flags gathered for the company.
 * @returns The prompt text.
 */
export function buildSignalExtractionPrompt(companyData: {
  name: string;
  industry: string;
  employee_count: number | null;
  tech_stack: string[];
  ai_job_count: number;
  pain_signals: string[];
  service_match: string | null;
  has_verified_email: boolean;
  has_linkedin: boolean;
  has_social: boolean;
  growth_signals_count: number;
  website_active: boolean;
}): string {
  const industry = companyData.industry || "UNKNOWN";
  const employees = companyData.employee_count ?? "UNKNOWN";
  const techStack = companyData.tech_stack.join(", ") || "NONE";
  const painSignals = companyData.pain_signals.join(", ") || "NONE";
  const serviceMatch = companyData.service_match || "NONE";

  return `EXTRACT SIGNALS for lead scoring. Do not compute a score — just identify signals.

Company: ${companyData.name}
Industry: ${industry}
Employees: ${employees}
Tech stack: ${techStack}
AI/automation job postings: ${companyData.ai_job_count}
Pain signals detected: ${painSignals}
Service match: ${serviceMatch}
Has verified email: ${companyData.has_verified_email}
Has personal LinkedIn: ${companyData.has_linkedin}
Has social profiles: ${companyData.has_social}
Growth signals count: ${companyData.growth_signals_count}
Website recently active: ${companyData.website_active}

Output JSON:
{
"company_fit_signals": {
"industry_match": true|false,
"size_appropriate": true|false,
"evidence": "why"
},
"ai_readiness_signals": {
"level": "none|low|medium|high",
"tech_stack_relevant": true|false,
"ai_jobs_present": true|false,
"evidence": "why"
},
"service_match_signals": {
"matched": true|false,
"service_name": "which service fits",
"pain_count": 0,
"evidence": "which pain signals"
},
"contact_quality_signals": {
"email_verified": true|false,
"linkedin_found": true|false,
"decision_maker_identified": true|false
},
"timing_signals": {
"actively_growing": true|false,
"recently_active": true|false,
"evidence": "what suggests good timing"
},
"confidence": 0.0
}`;
}
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
// ─── EMAIL CLASSIFICATION PROMPT ─────────────────────────────
|
| 208 |
+
|
| 209 |
+
/**
 * Build the user prompt that asks whether an email address is likely to reach
 * someone with purchasing authority.
 *
 * Missing values are rendered as explicit markers: an empty industry becomes
 * "UNKNOWN" everywhere it appears (both the `Industry:` line and the
 * "Consider" bullet), a null size becomes "UNKNOWN" / "unknown", and the
 * website snippet is cut to 300 characters.
 *
 * @param data The email address plus company context.
 * @returns The prompt text.
 */
export function buildEmailClassifyPrompt(data: {
  email: string;
  company_name: string;
  company_size: number | null;
  industry: string;
  website_snippet: string;
}): string {
  // Single source for the industry label so both mentions agree; the bullet
  // previously interpolated the raw value and rendered a gap when it was empty.
  const industry = data.industry || "UNKNOWN";
  const emailPrefix = data.email.split("@")[0];

  return `CLASSIFY this email address for B2B outreach viability.

Email: ${data.email}
Company: ${data.company_name}
Size: ${data.company_size ?? "UNKNOWN"} employees
Industry: ${industry}
Website excerpt: ${(data.website_snippet || "").slice(0, 300)}

Does "${data.email}" likely reach a person with purchasing authority?

Consider:
- Email prefix meaning in context of this company size
- "${emailPrefix}@" at a ${data.company_size ?? "unknown"}-person ${industry} company
- Small companies: admin/operations/office = often the owner
- Large companies: admin/operations = departments, not individuals

Output JSON:
{
"keep": true|false,
"confidence": 0.0,
"likely_reaches": "who this email probably reaches",
"reason": "one line why keep or reject"
}`;
}
|
| 240 |
+
|
| 241 |
+
|
| 242 |
+
// ─── PAIN SIGNAL DETECTION PROMPT ────────────────────────────
|
| 243 |
+
|
| 244 |
+
/**
 * Build the user prompt that asks the model to find manual-process pain signals.
 *
 * Renders the company header (missing values become "UNKNOWN"), the first 500
 * characters of website text, the detected page elements one per line, and the
 * JSON schema to return.
 *
 * @param data Company context; `page_elements` holds tags such as
 *             'phone_number', 'contact_form', 'no_chatbot'.
 * @returns The prompt text.
 */
export function buildPainDetectionPrompt(data: {
  company_name: string;
  industry: string;
  employee_count: number | null;
  website_text: string;
  page_elements: string[];
}): string {
  const industry = data.industry || "UNKNOWN";
  const size = data.employee_count ?? "UNKNOWN";
  const websiteExcerpt = (data.website_text || "").slice(0, 500);
  const pageElements = data.page_elements.join("\n");

  return `DETECT operational inefficiency signals for this company.

Company: ${data.company_name}
Industry: ${industry}
Size: ${size} employees

Website text (excerpt):
${websiteExcerpt}

Page elements detected:
${pageElements}

IMPORTANT: You are NOT looking for AI signals. You are looking for MANUAL PROCESS signals.
A phone number on a homepage IS a signal (manual call handling).
A "Book by Phone" button IS a signal (no online scheduling).
No live chat IS a signal (no automated customer interaction).

Output JSON:
{
"pain_signals": [
{"signal": "what you detected", "evidence": "where on page", "severity": "low|medium|high"}
],
"service_match": "which AI service best fits: AI Receptionist|AI Customer Support|AI Data Processing|AI Sales Automation|AI Workflow Automation|NONE",
"match_confidence": 0.0,
"reasoning": "one paragraph explaining your analysis"
}`;
}
|
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Pipeline Observability — Trace ID system
|
| 3 |
+
*
|
| 4 |
+
* Every pipeline run gets a unique trace_id.
|
| 5 |
+
* Every operation within that run carries the trace_id.
|
| 6 |
+
* This enables: debugging, cost tracking, latency analysis.
|
| 7 |
+
*
|
| 8 |
+
* Think of it like a receipt number — every action is linked.
|
| 9 |
+
*/
|
| 10 |
+
|
| 11 |
+
import { randomUUID } from "crypto";
|
| 12 |
+
import { getSupabaseClient } from "../supabase/client";
|
| 13 |
+
import { logger } from "../utils/logger";
|
| 14 |
+
|
| 15 |
+
/** Running totals kept in memory for one pipeline run. */
export interface PipelineTrace {
  /** Identifier generated by `startTrace`. */
  traceId: string;
  /** discovery_runs.id of the run being traced. */
  runId: string;
  /** Date.now() captured when the trace was started. */
  startedAt: number;
  /** Number of operations recorded so far. */
  operationCount: number;
  /** Sum of tokens over all recorded operations. */
  totalTokens: number;
  /** Sum of latencies (ms) over all recorded operations. */
  totalLatencyMs: number;
  /** "<operation>: <error>" entries for failed operations that supplied a message. */
  errors: string[];
}
|
| 24 |
+
|
| 25 |
+
// Traces currently in progress, keyed by trace id. Lives in process memory only;
// entries are added by startTrace and removed by endTrace.
const activeTraces: Map<string, PipelineTrace> = new Map();
|
| 27 |
+
|
| 28 |
+
/**
 * Begin tracing a pipeline run. Call at the start of every discovery run.
 *
 * Registers a zeroed trace in the in-memory store under a new id of the form
 * `trace_<8 chars of a UUID>_<epoch ms>` and logs the start.
 *
 * @param runId discovery_runs.id of the run.
 * @returns The generated trace id, to be passed to every operation of the run.
 */
export function startTrace(runId: string): string {
  const shortId = randomUUID().slice(0, 8);
  const traceId = `trace_${shortId}_${Date.now()}`;

  const freshTrace: PipelineTrace = {
    traceId,
    runId,
    startedAt: Date.now(),
    operationCount: 0,
    totalTokens: 0,
    totalLatencyMs: 0,
    errors: [],
  };
  activeTraces.set(traceId, freshTrace);

  logger.info({ traceId, runId }, "🔍 Pipeline trace started");
  return traceId;
}
|
| 47 |
+
|
| 48 |
+
/**
 * Add one operation's figures to an active trace.
 *
 * Does nothing when `traceId` is not in the in-memory store. An entry is added
 * to `errors` only when the operation failed AND an error message was given.
 *
 * @param traceId   Trace to update.
 * @param operation Operation label, used as the prefix of the error entry.
 * @param tokens    Tokens consumed by the operation.
 * @param latencyMs Duration of the operation in milliseconds.
 * @param success   Whether the operation succeeded.
 * @param error     Optional error message for a failed operation.
 */
export function recordOperation(
  traceId: string,
  operation: string,
  tokens: number,
  latencyMs: number,
  success: boolean,
  error?: string
): void {
  const trace = activeTraces.get(traceId);
  if (trace === undefined) {
    return;
  }

  trace.operationCount += 1;
  trace.totalTokens += tokens;
  trace.totalLatencyMs += latencyMs;

  const failedWithMessage = !success && Boolean(error);
  if (failedWithMessage) {
    trace.errors.push(`${operation}: ${error}`);
  }
}
|
| 70 |
+
|
| 71 |
+
/**
 * Finish a trace: log its summary, persist it to `audit_log`, and drop it from memory.
 *
 * Persistence is best-effort. Both a thrown exception and an `error` returned
 * by the Supabase client are logged as warnings; the trace is removed from the
 * in-memory store and returned either way.
 *
 * @param traceId Trace to finish.
 * @returns The final trace, or null when `traceId` is not an active trace.
 */
export async function endTrace(traceId: string): Promise<PipelineTrace | null> {
  const trace = activeTraces.get(traceId);
  if (!trace) return null;

  const duration = Date.now() - trace.startedAt;

  logger.info({
    traceId,
    operations: trace.operationCount,
    tokens: trace.totalTokens,
    durationMs: duration,
    errors: trace.errors.length,
  }, "✅ Pipeline trace completed");

  // Persist to audit log
  try {
    const db = getSupabaseClient();
    // The Supabase client reports failures through the resolved `error` field
    // rather than by throwing, so it must be inspected explicitly.
    const { error } = await db.from("audit_log").insert({
      action: "pipeline_trace_completed",
      entity_type: "discovery_run",
      entity_id: trace.runId,
      details: {
        trace_id: traceId,
        duration_ms: duration,
        operations: trace.operationCount,
        total_tokens: trace.totalTokens,
        total_latency_ms: trace.totalLatencyMs,
        error_count: trace.errors.length,
        errors: trace.errors.slice(0, 10), // cap at 10
      },
    });

    if (error) {
      logger.warn({ err: error.message }, "Failed to persist trace — non-critical");
    }
  } catch (err) {
    logger.warn({ err }, "Failed to persist trace — non-critical");
  }

  activeTraces.delete(traceId);
  return trace;
}
|
| 112 |
+
|
| 113 |
+
/**
 * Look up an active trace (for passing to LLM calls etc.).
 *
 * @param traceId Id returned by `startTrace`.
 * @returns The live trace object, or undefined when the id is not active.
 */
export function getTrace(traceId: string): PipelineTrace | undefined {
  const active = activeTraces.get(traceId);
  return active;
}
|
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Pipeline Checkpoint System — Crash Recovery
|
| 3 |
+
*
|
| 4 |
+
* Problem: Pipeline crashes at company #15 → restarts →
|
| 5 |
+
* processes company #1-14 again = wasted API calls + duplicates
|
| 6 |
+
*
|
| 7 |
+
* Solution: Save checkpoint after each major stage.
|
| 8 |
+
* On restart, resume from last checkpoint.
|
| 9 |
+
*
|
| 10 |
+
* Stages (in order):
|
| 11 |
+
* discovered → scraped → filtered → contacts_found → emails_verified → profiled → scored → completed
|
| 12 |
+
*/
|
| 13 |
+
|
| 14 |
+
import { getSupabaseClient } from "../supabase/client";
|
| 15 |
+
import { logger } from "../utils/logger";
|
| 16 |
+
|
| 17 |
+
/** Stage marker stored with each checkpoint; members are listed in pipeline order. */
export type PipelineStage =
  /** found in search results */
  | "discovered"
  /** website scraped */
  | "scraped"
  /** passed Gate 1 + 2 */
  | "filtered"
  /** decision makers identified */
  | "contacts_found"
  /** emails found and verified */
  | "emails_verified"
  /** LLM profile generated */
  | "profiled"
  /** score computed */
  | "scored"
  /** fully processed */
  | "completed";
|
| 26 |
+
|
| 27 |
+
/**
 * Record the stage a company has reached within a run.
 *
 * The row is upserted on (`run_id`, `company_domain`), so each company has at
 * most one checkpoint per run and later saves overwrite earlier ones.
 * A failed write is logged as a warning and otherwise ignored.
 *
 * @param runId - run the checkpoint belongs to
 * @param domain - company domain being processed
 * @param stage - stage to record; `"completed"` also sets the `completed` flag
 * @param stageData - intermediate data stored alongside the stage (defaults to `{}`)
 */
export async function saveCheckpoint(
  runId: string,
  domain: string,
  stage: PipelineStage,
  stageData: Record<string, unknown> = {}
): Promise<void> {
  const row = {
    run_id: runId,
    company_domain: domain,
    stage,
    stage_data: stageData,
    completed: stage === "completed",
    updated_at: new Date().toISOString(),
  };

  const outcome = await getSupabaseClient()
    .from("pipeline_checkpoints")
    .upsert(row, { onConflict: "run_id,company_domain" });

  if (!outcome.error) return;

  logger.warn(
    { domain, stage, error: outcome.error.message },
    "Checkpoint save failed — non-critical"
  );
}
|
| 55 |
+
|
| 56 |
+
/**
 * Get the last checkpoint for a domain in a run.
 *
 * @param runId - run to look in
 * @param domain - company domain to look up
 * @returns the stored stage and its data, or `null` if no checkpoint exists
 *   (fresh start). A failed lookup also yields `null`, but is logged so that
 *   a database problem is not mistaken for a genuinely new company.
 */
export async function getCheckpoint(
  runId: string,
  domain: string
): Promise<{ stage: PipelineStage; stageData: Record<string, unknown> } | null> {
  const db = getSupabaseClient();

  const { data, error } = await db
    .from("pipeline_checkpoints")
    .select("stage, stage_data")
    .eq("run_id", runId)
    .eq("company_domain", domain)
    .maybeSingle();

  if (error) {
    logger.warn(
      { runId, domain, error: error.message },
      "Checkpoint lookup failed — treating as fresh start"
    );
    return null;
  }

  if (!data) return null;
  return { stage: data.stage as PipelineStage, stageData: data.stage_data ?? {} };
}
|
| 76 |
+
|
| 77 |
+
/**
 * Check if a domain was already fully processed in ANY recent run.
 * Prevents re-processing across separate runs (not just within one run).
 *
 * @param domain - company domain to check
 * @param withinDays - how far back (in days) a completed checkpoint counts; default 30
 * @returns `true` when a completed checkpoint updated within the window exists.
 *   A failed lookup returns `false` (the company will be processed again) and
 *   is logged, since it costs API calls that would otherwise go unnoticed.
 */
export async function isAlreadyProcessed(domain: string, withinDays = 30): Promise<boolean> {
  const db = getSupabaseClient();

  const cutoff = new Date();
  cutoff.setDate(cutoff.getDate() - withinDays);

  const { data, error } = await db
    .from("pipeline_checkpoints")
    .select("id")
    .eq("company_domain", domain)
    .eq("completed", true)
    .gte("updated_at", cutoff.toISOString())
    .limit(1)
    .maybeSingle();

  if (error) {
    logger.warn(
      { domain, withinDays, error: error.message },
      "Processed-check lookup failed — assuming not processed"
    );
    return false;
  }

  return !!data;
}
|
| 98 |
+
|
| 99 |
+
/**
 * Get all incomplete companies in a run (for resume).
 *
 * @param runId - run whose unfinished checkpoints should be listed
 * @returns one entry per checkpoint with `completed = false`: the domain, its
 *   last stage and the stored stage data. A failed query is logged and yields
 *   an empty list.
 */
export async function getIncompleteCompanies(
  runId: string
): Promise<{ domain: string; stage: PipelineStage; stageData: Record<string, unknown> }[]> {
  const db = getSupabaseClient();

  const { data, error } = await db
    .from("pipeline_checkpoints")
    .select("company_domain, stage, stage_data")
    .eq("run_id", runId)
    .eq("completed", false);

  if (error) {
    logger.warn(
      { runId, error: error.message },
      "Incomplete-companies lookup failed — nothing to resume"
    );
    return [];
  }

  return (data ?? []).map((d) => ({
    domain: d.company_domain,
    stage: d.stage as PipelineStage,
    stageData: d.stage_data ?? {},
  }));
}
|
| 120 |
+
|
| 121 |
+
/**
 * Stage ordering — used to determine if we can skip ahead.
 */
const STAGE_ORDER: PipelineStage[] = [
  "discovered", "scraped", "filtered", "contacts_found",
  "emails_verified", "profiled", "scored", "completed",
];

/**
 * Tell whether `currentStage` is at or beyond `requiredStage` in `STAGE_ORDER`.
 *
 * @returns `false` if either value is not a known stage. Stage strings read
 *   back from storage are only cast to `PipelineStage`, so an unknown value
 *   must not count as "complete".
 */
export function isStageComplete(currentStage: PipelineStage, requiredStage: PipelineStage): boolean {
  const currentIdx = STAGE_ORDER.indexOf(currentStage);
  const requiredIdx = STAGE_ORDER.indexOf(requiredStage);
  if (currentIdx < 0 || requiredIdx < 0) return false;
  return currentIdx >= requiredIdx;
}

/**
 * Helper to determine where to resume processing for a company.
 *
 * @param checkpoint - last saved checkpoint, or `null` when none exists
 * @returns the stage AFTER the checkpointed one; `"discovered"` when there is
 *   no checkpoint or its stage is unknown; `"completed"` when the checkpoint is
 *   already `"completed"` (nothing is left to run, so the pipeline must not
 *   start over).
 */
export function getResumePoint(checkpoint: { stage: PipelineStage } | null): PipelineStage {
  if (!checkpoint) return "discovered";

  const idx = STAGE_ORDER.indexOf(checkpoint.stage);
  if (idx < 0) return "discovered";

  // Clamp at the final stage instead of wrapping back to the beginning.
  const lastIdx = STAGE_ORDER.length - 1;
  return STAGE_ORDER[Math.min(idx + 1, lastIdx)];
}
|
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { createClient } from "@supabase/supabase-js";
|
| 2 |
+
import { getEnv } from "../config/env";
|
| 3 |
+
|
| 4 |
+
// Lazily created client, reused by every later call.
let _client: ReturnType<typeof createClient> | null = null;

/**
 * Return the shared Supabase client, creating it on first use.
 *
 * The client is built from `SUPABASE_URL` and `SUPABASE_SERVICE_ROLE_KEY`
 * (read through `getEnv()`), with session persistence disabled and the
 * `public` schema selected.
 */
export function getSupabaseClient() {
  if (_client) return _client;

  const env = getEnv();
  _client = createClient(env.SUPABASE_URL, env.SUPABASE_SERVICE_ROLE_KEY, {
    auth: { persistSession: false },
    db: { schema: "public" },
  });
  return _client;
}
|
|
@@ -0,0 +1,184 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// ─── TypeScript types matching Supabase schema ─────────────────
|
| 2 |
+
|
| 3 |
+
/** Values allowed in a company's `status` field. */
export type CompanyStatus =
  | "discovered"
  | "researching"
  | "profiled"
  | "qualified"
  | "nurture"
  | "archived"
  | "suppressed";

/** Values allowed in a contact's `status` field. */
export type ContactStatus =
  | "found"
  | "email_verified"
  | "email_invalid"
  | "linkedin_only"
  | "suppressed";

/** Tier assigned to a scored lead. */
export type LeadTier =
  | "hot"
  | "warm"
  | "nurture"
  | "archive";

/** Channel an outreach message goes through. */
export type OutreachChannel =
  | "email"
  | "linkedin";

/** Delivery / response state of an outreach message. */
export type OutreachStatus =
  | "queued"
  | "sent"
  | "opened"
  | "replied"
  | "bounced"
  | "failed"
  | "review_needed";

/** Intent categories; `"unknown"` is the catch-all. */
export type IntentType =
  | "interested"
  | "question"
  | "not_now"
  | "not_interested"
  | "out_of_office"
  | "wrong_person"
  | "unknown";

/** Outcome of a human review item. */
export type ReviewStatus =
  | "pending"
  | "approved"
  | "rejected"
  | "edited";
|
| 24 |
+
|
| 25 |
+
// ─── Table row types ─────────────────────────────────────────
|
| 26 |
+
|
| 27 |
+
/** Row type for an ICP configuration record. */
export interface IcpConfig {
  id: string;
  name: string;
  min_employees: number;
  industries: string[];
  exclude_industries: string[];
  geographies: string[];
  keywords: string[];
  tech_signals: string[];
  score_threshold: number;
  is_active: boolean;
  /** Timestamps are carried as strings. */
  created_at: string;
  updated_at: string;
}

/** Row type for a rotation-state record (one week/region entry). */
export interface RotationState {
  id: string;
  week_number: number;
  region: string;
  started_at: string;
  /** `null` until a completion time is recorded. */
  completed_at: string | null;
  companies_found: number;
  leads_qualified: number;
}
|
| 51 |
+
|
| 52 |
+
/** Row type for a company record. Nullable fields may be missing for a given company. */
export interface Company {
  id: string;
  domain: string;
  name: string;
  industry: string | null;
  employee_count: number | null;
  employee_range: string | null;
  description: string | null;
  website_url: string | null;
  linkedin_url: string | null;
  country: string | null;
  region: string | null;
  tech_stack: string[];
  growth_signals: GrowthSignal[];
  /** Unstructured payload kept alongside the typed columns. */
  raw_data: Record<string, unknown>;
  source: string;
  status: CompanyStatus;
  discovered_at: string;
  updated_at: string;
}

/** One growth signal attached to a company (element type of `Company.growth_signals`). */
export interface GrowthSignal {
  type:
    | "job_posting"
    | "news"
    | "funding"
    | "social_post"
    | "expansion";
  content: string;
  source_url?: string;
  ai_related: boolean;
  detected_at: string;
}
|
| 80 |
+
|
| 81 |
+
/** Row type for a contact record; linked to a company through `company_id`. */
export interface Contact {
  id: string;
  company_id: string;
  full_name: string;
  first_name: string | null;
  last_name: string | null;
  title: string;
  seniority:
    | "c_suite"
    | "vp"
    | "director"
    | "manager"
    | null;
  email: string | null;
  email_verified: boolean;
  /** Where the email came from; `null` when no source is recorded. */
  email_source:
    | "hunter"
    | "snov"
    | "pattern"
    | null;
  linkedin_url: string | null;
  linkedin_verified: boolean;
  status: ContactStatus;
  suppressed: boolean;
  suppressed_at: string | null;
  suppressed_reason: string | null;
  created_at: string;
  updated_at: string;
}
|
| 101 |
+
|
| 102 |
+
/** Row type for a piece of evidence collected about a company. */
export interface Evidence {
  id: string;
  company_id: string;
  type:
    | "job_posting"
    | "news"
    | "social_post"
    | "website_text"
    | "tech_stack";
  content: string;
  source_url: string | null;
  ai_signal: boolean;
  collected_at: string;
}

/** Row type for a generated lead profile of a company. */
export interface LeadProfile {
  id: string;
  company_id: string;
  profile_summary: string;
  pain_points: string[];
  ai_use_case: string | null;
  ai_readiness:
    | "low"
    | "medium"
    | "high";
  outreach_angle: string | null;
  /** Model identifier recorded with the profile. */
  llm_model: string;
  llm_confidence: number | null;
  is_fallback: boolean;
  created_at: string;
}
|
| 125 |
+
|
| 126 |
+
/** Row type for a lead score: a total, its tier, and nullable component scores. */
export interface LeadScore {
  id: string;
  company_id: string;
  contact_id: string | null;
  total_score: number;
  tier: LeadTier;
  // Component scores
  company_fit: number | null;
  ai_readiness: number | null;
  decision_maker: number | null;
  growth_signal: number | null;
  engagement_potential: number | null;
  score_reasoning: string | null;
  scored_at: string;
}

/** Row type for an item waiting on (or resolved by) human review. */
export interface HumanReviewItem {
  id: string;
  type:
    | "outreach_approval"
    | "score_anomaly"
    | "escalation";
  company_id: string | null;
  contact_id: string | null;
  payload: Record<string, unknown>;
  status: ReviewStatus;
  reviewer_notes: string | null;
  resolved_at: string | null;
  created_at: string;
}
|
| 152 |
+
|
| 153 |
+
// ─── Insert types (no id/timestamps) ─────────────────────────
|
| 154 |
+
|
| 155 |
+
/** `Company` without `id`, `discovered_at` and `updated_at`. */
export type InsertCompany = Omit<
  Company,
  "id" | "discovered_at" | "updated_at"
>;

/** `Contact` without `id`, `created_at` and `updated_at`. */
export type InsertContact = Omit<
  Contact,
  "id" | "created_at" | "updated_at"
>;

/** `Evidence` without `id` and `collected_at`. */
export type InsertEvidence = Omit<
  Evidence,
  "id" | "collected_at"
>;

/** `LeadProfile` without `id` and `created_at`. */
export type InsertLeadProfile = Omit<
  LeadProfile,
  "id" | "created_at"
>;

/** `LeadScore` without `id` and `scored_at`. */
export type InsertLeadScore = Omit<
  LeadScore,
  "id" | "scored_at"
>;
|
| 160 |
+
|
| 161 |
+
// ─── Trigger.dev event payloads ────────────────────────────────
|
| 162 |
+
|
| 163 |
+
/** Event payload describing a newly discovered company. */
export interface CompanyDiscoveredPayload {
  company_id: string;
  domain: string;
  name: string;
  region: string;
  /** Whether discovery was automatic or manually requested. */
  source:
    | "auto"
    | "manual";
}

/** Event payload describing a lead that has been scored. */
export interface LeadScoredPayload {
  lead_score_id: string;
  company_id: string;
  contact_id: string | null;
  total_score: number;
  tier: LeadTier;
}

/** Event payload for queued outreach; unlike `LeadScoredPayload`, `contact_id` is required. */
export interface OutreachQueuedPayload {
  company_id: string;
  contact_id: string;
  score: number;
  tier: LeadTier;
}
|
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pino from "pino";
|
| 2 |
+
import { getEnv } from "../config/env";
|
| 3 |
+
|
| 4 |
+
// PII fields that will be redacted in logs
const PII_FIELDS = ["email", "full_name", "first_name", "last_name", "phone", "linkedin_url"];

/**
 * Return a copy of `obj` with PII string values masked.
 *
 * A string stored under one of `PII_FIELDS` is replaced by its first three
 * characters followed by `***` (or just `***` when it has three characters
 * or fewer). Nested objects AND arrays are walked, so a list of contacts is
 * redacted element by element. The input is never mutated.
 *
 * Input that is not an object (including `null`) is returned unchanged
 * instead of throwing, because log serializers can be handed arbitrary values.
 *
 * @param obj - value to sanitise before logging
 * @returns a redacted deep copy
 */
function redactPii(obj: Record<string, unknown>): Record<string, unknown> {
  return redactValue(undefined, obj) as Record<string, unknown>;
}

/** Redact one value; `key` is the property name it was found under, if any. */
function redactValue(key: string | undefined, value: unknown): unknown {
  if (typeof value === "string") {
    if (key === undefined || !PII_FIELDS.includes(key)) return value;
    // Show first 3 chars + *** e.g. "joh***"
    return value.length > 3 ? `${value.slice(0, 3)}***` : "***";
  }

  // Array elements inherit the key of the array, so `email: ["a@b.c"]` is masked too.
  if (Array.isArray(value)) {
    return value.map((item) => redactValue(key, item));
  }

  // Dates have no enumerable entries; copying them would turn them into `{}`.
  if (value !== null && typeof value === "object" && !(value instanceof Date)) {
    const result: Record<string, unknown> = {};
    for (const [childKey, childValue] of Object.entries(value)) {
      result[childKey] = redactValue(childKey, childValue);
    }
    return result;
  }

  return value;
}
|
| 21 |
+
|
| 22 |
+
const env = getEnv();

// Pretty-printed, colourised output; only used in development.
const prettyTransport = { target: "pino-pretty", options: { colorize: true } };

// Shared serializer: masks PII in whatever is logged under the given key.
const redactingSerializer = (val: Record<string, unknown>) => redactPii(val);

/**
 * Application logger.
 *
 * The level comes from `LOG_LEVEL`. Anything logged under the `contact` or
 * `data` key is passed through `redactPii` before it is written.
 */
export const logger = pino({
  level: env.LOG_LEVEL,
  transport: env.NODE_ENV === "development" ? prettyTransport : undefined,
  serializers: {
    contact: redactingSerializer,
    data: redactingSerializer,
  },
});
|
| 36 |
+
|
| 37 |
+
/**
 * Write an audit entry at info level.
 *
 * @param action - what was done; also used in the `[AUDIT] …` message
 * @param entity - what it was done to
 * @param details - extra context; passed through `redactPii` before logging
 */
export function auditLog(action: string, entity: string, details: Record<string, unknown>) {
  const safeDetails = redactPii(details);
  const message = `[AUDIT] ${action}`;
  logger.info({ action, entity, details: safeDetails }, message);
}
|
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { logger } from "./logger";
|
| 2 |
+
|
| 3 |
+
/** Per-provider bucket: tokens left, and when the bucket was last refilled. */
interface BucketState {
  tokens: number;
  lastRefill: number; // Date.now() value at the last refill
}

/**
 * Rate limiter per provider.
 * Controls how many API calls can be made per time window.
 *
 * Each provider gets its own bucket, created full on first use. A bucket is
 * reset to `maxTokens` in one step once `refillRateMs` has passed since its
 * last refill; there is no gradual refill in between.
 */
export class RateLimiter {
  private buckets = new Map<string, BucketState>();

  /**
   * @param maxTokens - tokens available per window
   * @param refillRateMs - window length in ms (how often to fully refill)
   */
  constructor(
    private readonly maxTokens: number,
    private readonly refillRateMs: number
  ) {}

  /**
   * Take `tokens` from the provider's bucket if they are available.
   *
   * @returns true if the call is allowed, false if rate limit exceeded
   *   (a warning is logged and nothing is deducted).
   */
  tryConsume(provider: string, tokens = 1): boolean {
    const bucket = this.refreshedBucket(provider);

    if (bucket.tokens < tokens) {
      logger.warn({ provider, tokensLeft: bucket.tokens }, `[RateLimit] ${provider} throttled`);
      return false;
    }

    bucket.tokens -= tokens;
    return true;
  }

  /**
   * Wait until the requested tokens are available, then take them.
   *
   * Polls every 500 ms. The throttle warning is logged once per call rather
   * than on every poll.
   *
   * @throws RangeError (as a rejected promise) when `tokens` exceeds
   *   `maxTokens` — such a request could never be satisfied and would
   *   otherwise wait forever.
   */
  async consume(provider: string, tokens = 1): Promise<void> {
    if (tokens > this.maxTokens) {
      throw new RangeError(
        `[RateLimit] ${provider}: requested ${tokens} tokens but bucket capacity is ${this.maxTokens}`
      );
    }

    let warned = false;
    for (;;) {
      const bucket = this.refreshedBucket(provider);
      if (bucket.tokens >= tokens) {
        bucket.tokens -= tokens;
        return;
      }
      if (!warned) {
        logger.warn({ provider, tokensLeft: bucket.tokens }, `[RateLimit] ${provider} throttled`);
        warned = true;
      }
      await new Promise((r) => setTimeout(r, 500));
    }
  }

  /** Fetch (or create) the provider's bucket and refill it if its window has elapsed. */
  private refreshedBucket(provider: string): BucketState {
    const now = Date.now();
    let bucket = this.buckets.get(provider);

    if (!bucket) {
      bucket = { tokens: this.maxTokens, lastRefill: now };
      this.buckets.set(provider, bucket);
    }

    // Refill based on elapsed time
    if (now - bucket.lastRefill >= this.refillRateMs) {
      bucket.tokens = this.maxTokens;
      bucket.lastRefill = now;
    }

    return bucket;
  }
}
|
| 57 |
+
|
| 58 |
+
// ─── Daily quota tracker (held in process memory only — lost on restart; resets at UTC midnight) ────────
|
| 59 |
+
|
| 60 |
+
/** Usage counter for one quota key on one calendar day. */
interface DailyQuota {
  count: number;
  /** Day the count belongs to, as YYYY-MM-DD. */
  date: string;
}

// Quota key → counter for the current day.
const dailyQuotas = new Map<string, DailyQuota>();

/** Current date as YYYY-MM-DD, taken from the ISO (UTC) timestamp. */
function todayStr(): string {
  const [datePart] = new Date().toISOString().split("T");
  return datePart;
}
|
| 70 |
+
|
| 71 |
+
/**
 * Check whether `key` is still under its daily limit. Does not count a use —
 * call `incrementDailyQuota` for that.
 *
 * A missing counter, or one left over from a previous day, is reset to zero
 * first and then compared against `limit` like any other, so a `limit` of 0
 * or less is refused on the first call of the day as well.
 *
 * @param key - quota identifier
 * @param limit - maximum number of uses allowed per day
 * @returns `true` while today's count is below `limit`; otherwise `false`
 *   (and a warning is logged).
 */
export function checkDailyQuota(key: string, limit: number): boolean {
  const today = todayStr();
  let quota = dailyQuotas.get(key);

  if (!quota || quota.date !== today) {
    quota = { count: 0, date: today };
    dailyQuotas.set(key, quota);
  }

  if (quota.count >= limit) {
    logger.warn({ key, count: quota.count, limit }, `[DailyQuota] ${key} limit reached`);
    return false;
  }
  return true;
}
|
| 86 |
+
|
| 87 |
+
/**
 * Count one use against `key` for today.
 *
 * A counter left over from a previous day is reset before the increment;
 * a missing counter is created.
 */
export function incrementDailyQuota(key: string): void {
  const today = todayStr();
  let entry = dailyQuotas.get(key);

  if (entry === undefined || entry === null) {
    entry = { count: 0, date: today };
  } else if (entry.date !== today) {
    entry.count = 0;
    entry.date = today;
  }

  entry.count += 1;
  dailyQuotas.set(key, entry);
}
|
| 97 |
+
|
| 98 |
+
// Shared limiter instances, one per provider: (calls per window, window length in ms).
export const serperLimiter = new RateLimiter(10, 60 * 1_000);     // 10 req/min
export const hunterLimiter = new RateLimiter(5, 60 * 1_000);      // 5 req/min
export const snovLimiter = new RateLimiter(5, 60 * 1_000);        // 5 req/min
export const reoonLimiter = new RateLimiter(10, 60 * 1_000);      // 10 req/min
export const playwrightLimiter = new RateLimiter(3, 10 * 1_000);  // 3 pages per 10s
|
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Production-grade retry logic — failure-type-aware.
|
| 3 |
+
*
|
| 4 |
+
* NOT "retry 3 times with delay" (naive approach).
|
| 5 |
+
* Instead: each failure type gets a different response.
|
| 6 |
+
*
|
| 7 |
+
* 429 → respect Retry-After header, wait, then retry
|
| 8 |
+
* 503 → exponential backoff WITH JITTER (prevent thundering herd)
|
| 9 |
+
* 500 → retry (2x with the default maxRetries), then rethrow — dead-lettering is left to the caller
|
| 10 |
+
* 422 → permanent failure, do not retry (bad input)
|
| 11 |
+
* ECONNRESET → network issue, retry with short delay
|
| 12 |
+
* TIMEOUT → retry after a longer wait (the request timeout itself is not changed)
|
| 13 |
+
*/
|
| 14 |
+
|
| 15 |
+
import { AxiosError } from "axios";
|
| 16 |
+
import { logger } from "./logger";
|
| 17 |
+
|
| 18 |
+
/** Options accepted by `withRetry`. */
export interface RetryConfig {
  /** Provider name; used for log entries and as the circuit-breaker key. */
  provider: string;
  /** default 3 */
  maxRetries?: number;
  /** default 1000 */
  baseDelayMs?: number;
  /** default 30000 */
  maxDelayMs?: number;
}
|
| 24 |
+
|
| 25 |
+
// ─── Circuit breaker state ───────────────────────────────────
|
| 26 |
+
|
| 27 |
+
/** Circuit-breaker bookkeeping kept per provider. */
interface CircuitState {
  /** Failures recorded since the last recorded success. */
  failures: number;
  /** `Date.now()` of the most recent failure (0 if none). */
  lastFailure: number;
  /** Whether the provider is currently disabled. */
  isOpen: boolean;
  /** when to try again */
  halfOpenAt: number;
}

// Provider name → breaker state.
const circuits = new Map<string, CircuitState>();

/** failures before opening */
const CIRCUIT_THRESHOLD = 5;

/** 1 min cooldown */
const CIRCUIT_RESET_MS = 60_000;
|
| 37 |
+
|
| 38 |
+
/**
 * Tell whether calls to `provider` are currently blocked.
 *
 * Once `halfOpenAt` has passed, the circuit is marked closed again so an
 * attempt can go through; the failure count is not reset here.
 *
 * @returns `true` only while the circuit is open and still cooling down
 */
export function isCircuitOpen(provider: string): boolean {
  const circuit = circuits.get(provider);
  if (circuit === undefined || circuit === null || !circuit.isOpen) {
    return false;
  }

  // Check if enough time has passed (half-open)
  const cooldownOver = Date.now() >= circuit.halfOpenAt;
  if (cooldownOver) {
    circuit.isOpen = false; // allow one attempt
  }
  return !cooldownOver;
}
|
| 49 |
+
|
| 50 |
+
/** Reset the provider's circuit to a clean, closed state. */
export function recordSuccess(provider: string): void {
  const closed: CircuitState = {
    failures: 0,
    lastFailure: 0,
    isOpen: false,
    halfOpenAt: 0,
  };
  circuits.set(provider, closed);
}
|
| 58 |
+
|
| 59 |
+
/**
 * Count one failure for `provider`.
 *
 * When the count reaches `CIRCUIT_THRESHOLD`, the circuit is opened for
 * `CIRCUIT_RESET_MS` and a warning is logged.
 */
export function recordFailure(provider: string): void {
  let circuit = circuits.get(provider);
  if (circuit === undefined || circuit === null) {
    circuit = { failures: 0, lastFailure: 0, isOpen: false, halfOpenAt: 0 };
  }

  circuit.failures += 1;
  circuit.lastFailure = Date.now();

  const tripped = circuit.failures >= CIRCUIT_THRESHOLD;
  if (tripped) {
    circuit.isOpen = true;
    circuit.halfOpenAt = Date.now() + CIRCUIT_RESET_MS;
    logger.warn({ provider, failures: circuit.failures }, "Circuit OPEN — provider temporarily disabled");
  }

  circuits.set(provider, circuit);
}
|
| 74 |
+
|
| 75 |
+
// ─── Failure classification ──────────────────────────────────
|
| 76 |
+
|
| 77 |
+
type FailureType =
  | "rate_limited"          // 429
  | "server_error"          // 500 and other 5xx not listed below
  | "service_unavailable"   // 502, 503, 504
  | "bad_input"             // 422, 400 and other non-retryable 4xx
  | "auth_failed"           // 401, 403
  | "network_error"         // ECONNRESET, ENOTFOUND, ECONNREFUSED, EAI_AGAIN
  | "timeout"               // ETIMEDOUT, ESOCKETTIMEDOUT, ECONNABORTED, HTTP 408
  | "unknown";

/**
 * Convert a `Retry-After` header into milliseconds.
 *
 * The header may hold a number of seconds or an HTTP-date; both are handled.
 *
 * @returns the wait in ms, or 0 when the header is absent or unparseable
 */
function parseRetryAfterMs(header: unknown): number {
  if (typeof header !== "string" && typeof header !== "number") return 0;

  const raw = String(header).trim();
  if (/^\d+$/.test(raw)) return parseInt(raw, 10) * 1000;

  const retryAt = Date.parse(raw);
  return Number.isNaN(retryAt) ? 0 : Math.max(0, retryAt - Date.now());
}

/**
 * Classify an error thrown by a provider call.
 *
 * @param err - the caught value; only `AxiosError` instances are inspected
 * @returns the failure type, whether retrying makes sense, and the minimum
 *   wait in ms before the next attempt. Anything that cannot be classified is
 *   `"unknown"` and retryable.
 */
function classifyFailure(err: unknown): { type: FailureType; retryable: boolean; waitMs: number } {
  if (err instanceof AxiosError) {
    const status = err.response?.status;

    switch (status) {
      case 429: {
        const retryAfterMs = parseRetryAfterMs(err.response?.headers?.["retry-after"]);
        return {
          type: "rate_limited",
          retryable: true,
          waitMs: retryAfterMs > 0 ? retryAfterMs : 10_000,
        };
      }
      case 502:
      case 503:
      case 504:
        return { type: "service_unavailable", retryable: true, waitMs: 5_000 };
      case 500:
        return { type: "server_error", retryable: true, waitMs: 3_000 };
      case 408:
        return { type: "timeout", retryable: true, waitMs: 3_000 };
      case 422:
      case 400:
        return { type: "bad_input", retryable: false, waitMs: 0 };
      case 401:
      case 403:
        return { type: "auth_failed", retryable: false, waitMs: 0 };
    }

    if (status !== undefined) {
      // Remaining client errors (404, 409, 410, …) will not change on a retry.
      if (status >= 400 && status < 500) {
        return { type: "bad_input", retryable: false, waitMs: 0 };
      }
      if (status >= 500) {
        return { type: "server_error", retryable: true, waitMs: 3_000 };
      }
    }

    // Network errors
    const code = err.code;
    if (
      code === "ECONNRESET" ||
      code === "ENOTFOUND" ||
      code === "ECONNREFUSED" ||
      code === "EAI_AGAIN"
    ) {
      return { type: "network_error", retryable: true, waitMs: 2_000 };
    }
    // axios reports its own request timeout as ECONNABORTED unless configured otherwise.
    if (code === "ETIMEDOUT" || code === "ESOCKETTIMEDOUT" || code === "ECONNABORTED") {
      return { type: "timeout", retryable: true, waitMs: 3_000 };
    }
  }

  return { type: "unknown", retryable: true, waitMs: 2_000 };
}
|
| 123 |
+
|
| 124 |
+
// ─── Main retry function ────────────────────────────────────
|
| 125 |
+
|
| 126 |
+
/**
 * Run `fn`, retrying according to the kind of failure it throws.
 *
 * - Non-retryable failures (as decided by `classifyFailure`) are rethrown at once.
 * - Retryable failures are retried until `maxRetries` ATTEMPTS have been made
 *   in total (default 3), then the last error is rethrown.
 * - The wait before a retry is the larger of the failure's own minimum wait
 *   and an exponential backoff (`baseDelayMs * 2^(attempt-1)`, capped at
 *   `maxDelayMs`) plus up to 30% random jitter.
 *
 * Circuit-breaker bookkeeping: every success resets the provider's failure
 * count — including first-try successes, so isolated failures spread over
 * many healthy calls cannot add up and open the circuit. Every rethrown
 * failure is recorded. This function does not call `isCircuitOpen` itself.
 *
 * @param fn - operation to run; invoked again for each retry
 * @param config - provider name plus optional retry limits
 * @returns whatever `fn` resolves to
 * @throws the last error from `fn` once retries are exhausted or the failure is permanent
 */
export async function withRetry<T>(
  fn: () => Promise<T>,
  config: RetryConfig
): Promise<T> {
  const maxRetries = config.maxRetries ?? 3;
  const baseDelay = config.baseDelayMs ?? 1000;
  const maxDelay = config.maxDelayMs ?? 30_000;
  let attempt = 0;

  while (true) {
    try {
      const result = await fn();
      recordSuccess(config.provider);
      if (attempt > 0) {
        logger.info({ provider: config.provider, attempts: attempt + 1 }, "Retry succeeded");
      }
      return result;
    } catch (err) {
      attempt++;
      const failure = classifyFailure(err);

      // Permanent failure — don't retry
      if (!failure.retryable) {
        logger.error(
          { provider: config.provider, failureType: failure.type, attempt },
          "Permanent failure — not retrying"
        );
        recordFailure(config.provider);
        throw err;
      }

      // Max retries exceeded
      if (attempt >= maxRetries) {
        logger.error(
          { provider: config.provider, failureType: failure.type, attempts: attempt },
          "Max retries exceeded"
        );
        recordFailure(config.provider);
        throw err;
      }

      // Calculate wait time with jitter
      // Jitter prevents thundering herd: 1000 requests don't all retry at same time
      const exponentialDelay = Math.min(
        maxDelay,
        baseDelay * Math.pow(2, attempt - 1)
      );
      const jitter = Math.random() * exponentialDelay * 0.3; // adds 0–30% on top
      const waitMs = Math.max(failure.waitMs, exponentialDelay + jitter);

      logger.warn(
        {
          provider: config.provider,
          failureType: failure.type,
          attempt,
          maxRetries,
          waitMs: Math.round(waitMs),
        },
        `Retry ${attempt}/${maxRetries} after ${Math.round(waitMs)}ms`
      );

      await sleep(waitMs);
    }
  }
}
|
| 192 |
+
|
| 193 |
+
/** Resolve after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
|
|
@@ -0,0 +1,249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Slack Command Handler — Bidirectional Intelligence
|
| 3 |
+
*
|
| 4 |
+
* Handles incoming Slack slash commands and messages.
|
| 5 |
+
* Uses LLM for natural language understanding when needed.
|
| 6 |
+
*
|
| 7 |
+
* Commands:
|
| 8 |
+
* /discover → asks clarifying questions
|
| 9 |
+
* /discover region:UK → direct run with params
|
| 10 |
+
* /leads → show today's qualified leads
|
| 11 |
+
* /lead [company] → full lead details
|
| 12 |
+
* /status → system status
|
| 13 |
+
* /pause → pause automatic runs
|
| 14 |
+
* /resume → resume automatic runs
|
| 15 |
+
* /quota [number] → set today's quota
|
| 16 |
+
* /quota [number] always → permanent change
|
| 17 |
+
*/
|
| 18 |
+
|
| 19 |
+
import { getSupabaseClient } from "../../shared/supabase/client";
|
| 20 |
+
import { setQuotaOverride, isSystemPaused } from "../../discovery/lib/territory-manager";
|
| 21 |
+
import { sendClarifyingQuestions } from "./slack-service";
|
| 22 |
+
import { logger } from "../../shared/utils/logger";
|
| 23 |
+
|
| 24 |
+
/** An incoming Slack slash command, as handed to `handleSlackCommand`. */
export interface SlackCommand {
  /** The slash command itself, e.g. `/discover`. */
  command: string;
  /** Everything typed after the command. */
  text: string;
  /** ID of the invoking user. */
  userId: string;
  /** ID of the channel — presumably where the command was issued; confirm against the caller. */
  channelId: string;
}
|
| 30 |
+
|
| 31 |
+
/**
 * Dispatch a slash command to its handler.
 *
 * `/discover` receives the trimmed, lower-cased argument text; `/lead` and
 * `/quota` receive the text exactly as typed.
 *
 * @param cmd - the incoming command
 * @returns the reply text, or an "Unknown command" message for unrecognised commands
 */
export async function handleSlackCommand(cmd: SlackCommand): Promise<string> {
  const { command, text } = cmd;
  const normalizedArgs = text.trim().toLowerCase();

  if (command === "/discover") return handleDiscover(normalizedArgs, cmd);
  if (command === "/leads") return handleLeads();
  if (command === "/lead") return handleLeadDetail(text);
  if (command === "/status") return handleStatus();
  if (command === "/pause") return handlePause();
  if (command === "/resume") return handleResume();
  if (command === "/quota") return handleQuota(text);

  return `Unknown command: ${command}`;
}
|
| 57 |
+
|
| 58 |
+
// ─── /discover ───────────────────────────────────────────────
|
| 59 |
+
|
| 60 |
+
/**
 * Handle `/discover`.
 *
 * - `region:<x> industry:<y> [max:<n>]` triggers the manual discovery task
 *   immediately.
 * - Free text without a `region:` parameter posts clarifying questions.
 * - `region:<x>` without `industry:` replies naming the missing parameter.
 * - No arguments returns the usage text.
 *
 * @param args Trimmed, lower-cased argument text.
 * @param cmd  Original command; `userId` is recorded as the run's trigger.
 * @returns Reply text for the invoking user.
 */
async function handleDiscover(args: string, cmd: SlackCommand): Promise<string> {
  const DEFAULT_MAX_COMPANIES = 20;

  // Parse structured `key:value` params if provided
  const params = parseParams(args);

  if (params.region && params.industry) {
    // Direct run — no questions needed
    const region = params.region.toUpperCase();

    // A missing, non-numeric or non-positive `max:` falls back to the default
    // instead of forwarding NaN to the task.
    const requestedMax = Number.parseInt(params.max ?? "", 10);
    const maxCompanies = requestedMax > 0 ? requestedMax : DEFAULT_MAX_COMPANIES;

    const { manualDiscoveryTask } = await import("../../discovery/trigger-tasks/manual-discovery");
    await manualDiscoveryTask.trigger({
      region,
      industry: params.industry,
      maxCompanies,
      triggeredBy: `slack:${cmd.userId}`,
    });
    return `🚀 Manual discovery started:\n• Region: ${region}\n• Industry: ${params.industry}\n• Max: ${maxCompanies}\nI'll notify you when complete.`;
  }

  if (args && !params.region) {
    // Natural-language request (e.g. "work on China today"): ask clarifying questions
    await sendClarifyingQuestions(args, [
      {
        question: "Which cities?",
        options: ["All major cities", "Capital only", "Let me specify..."],
      },
      {
        question: "Which industry?",
        options: ["Healthcare (dental, medical)", "Manufacturing", "Technology/SaaS", "All service businesses"],
      },
      {
        question: "How many leads?",
        options: ["5 (quick)", "10 (standard)", "20 (deep scan)"],
      },
    ]);
    return "I've posted clarifying questions ☝️";
  }

  if (params.region) {
    // Region given without an industry: say exactly what is missing rather
    // than falling through to the generic usage text.
    const region = params.region.toUpperCase();
    return `Region ${region} noted, but a direct run also needs an industry.\nExample: \`/discover region:${region} industry:dental\``;
  }

  // No args — interactive mode
  return "Usage:\n• `/discover region:UK industry:dental` — direct run\n• `/discover China pe kaam karo` — natural language\n• `/discover` — this help message";
}
|
| 99 |
+
|
| 100 |
+
// ─── /leads ──────────────────────────────────────────────────
|
| 101 |
+
|
| 102 |
+
/**
 * Handle `/leads`: list the lead scores created since local midnight, highest
 * score first, one line per lead.
 *
 * @returns A Slack-formatted listing, "No leads found today yet." when the
 *          query returns no rows, or a failure notice when the query errors.
 */
async function handleLeads(): Promise<string> {
  const db = getSupabaseClient();

  // Midnight in the server's local timezone
  const startOfToday = new Date();
  startOfToday.setHours(0, 0, 0, 0);

  // NOTE(review): this filters on lead_scores.created_at and embeds
  // companies.city / companies.service_match / contacts.linkedin_personal_url —
  // confirm the deployed schema actually has these columns.
  const { data: leads, error } = await db
    .from("lead_scores")
    .select(`
      total_score, tier,
      companies (name, domain, industry, city, service_match),
      contacts (full_name, email, email_verified, linkedin_personal_url)
    `)
    .gte("created_at", startOfToday.toISOString())
    .order("total_score", { ascending: false });

  if (error) {
    // Surface query failures instead of reporting them as "no leads".
    logger.warn({ err: error }, "/leads query failed");
    return "⚠️ Could not load today's leads. Please try again.";
  }

  if (!leads?.length) return "No leads found today yet.";

  const lines = leads.map((l: any) => {
    const emoji = l.tier === "hot" ? "🔥" : l.tier === "warm" ? "✅" : "📋";
    const email = l.contacts?.email_verified ? "📧✓" : l.contacts?.email ? "📧" : "—";
    const li = l.contacts?.linkedin_personal_url ? "💼" : "—";
    return `${emoji} ${l.total_score} | ${l.companies?.name ?? "?"} | ${l.companies?.industry ?? "?"} | ${l.companies?.city ?? "?"} | ${email} ${li} | ${l.companies?.service_match ?? "—"}`;
  });

  return `*Today's Leads (${leads.length}):*\n\n` +
    `Score | Company | Industry | City | Channels | Service\n` +
    `${"─".repeat(60)}\n` +
    lines.join("\n") +
    `\n\nType \`/lead [company name]\` for full details`;
}
|
| 132 |
+
|
| 133 |
+
// ─── /lead [company] ──────────────���─────────────────────────
|
| 134 |
+
|
| 135 |
+
/**
 * Handle `/lead [company]`: show company, score, profile and first-contact
 * details for the first company whose name contains the search text
 * (case-insensitive).
 *
 * @param companySearch Raw text after `/lead`.
 * @returns A Slack-formatted detail card, a usage hint for empty input, a
 *          "no company found" message, or a failure notice when the company
 *          lookup errors.
 */
async function handleLeadDetail(companySearch: string): Promise<string> {
  const search = companySearch.trim();
  if (!search) return "Usage: `/lead ABC Dental`";

  const db = getSupabaseClient();
  const { data: companies, error: companyError } = await db
    .from("companies")
    .select("*")
    .ilike("name", `%${search}%`)
    .limit(1);

  if (companyError) {
    // Surface lookup failures instead of reporting them as "not found".
    logger.warn({ err: companyError }, "/lead company lookup failed");
    return "⚠️ Could not look up that company. Please try again.";
  }

  if (!companies?.length) return `No company found matching "${companySearch}"`;

  const company = companies[0];

  // The three lookups are independent of each other, so run them concurrently.
  const [contactsRes, scoresRes, profilesRes] = await Promise.all([
    db.from("contacts").select("*").eq("company_id", company.id),
    db.from("lead_scores").select("*").eq("company_id", company.id).limit(1),
    db.from("lead_profiles").select("*").eq("company_id", company.id).limit(1),
  ]);

  // A failed related lookup degrades to placeholders in the card; log it so
  // the gap is diagnosable.
  for (const res of [contactsRes, scoresRes, profilesRes]) {
    if (res.error) {
      logger.warn({ err: res.error, companyId: company.id }, "/lead related lookup failed");
    }
  }

  const contact = contactsRes.data?.[0];
  const score = scoresRes.data?.[0];
  const profile = profilesRes.data?.[0];
  const painPoints = (profile?.pain_points ?? []).join(", ") || "—";

  return `*${company.name}*\n` +
    `Domain: ${company.domain}\n` +
    `Industry: ${company.industry ?? "?"} · Employees: ${company.employee_count ?? "?"}\n` +
    `City: ${company.city ?? "?"} · ${company.country ?? "?"}\n` +
    `Service Match: ${company.service_match ?? "—"}\n` +
    `LinkedIn: ${company.linkedin_url ?? "—"}\n\n` +
    `*Score:* ${score?.total_score ?? "?"}/100 — ${score?.tier?.toUpperCase() ?? "?"}\n` +
    `  Fit: ${score?.company_fit ?? "?"}/25 · AI: ${score?.ai_readiness ?? "?"}/20 · Service: ${score?.service_match_score ?? "?"}/20\n` +
    `  Contact: ${score?.decision_maker ?? "?"}/20 · Timing: ${score?.timing_score ?? "?"}/15\n\n` +
    `*Profile:*\n${profile?.profile_summary ?? "No profile yet"}\n` +
    `Pain: ${painPoints}\n` +
    `Angle: _${profile?.outreach_angle ?? "?"}_\n\n` +
    `*Contact:* ${contact?.full_name ?? "?"} — ${contact?.title ?? "?"}\n` +
    `  Email: ${contact?.email ?? "—"} ${contact?.email_verified ? "✓" : ""}\n` +
    `  LinkedIn: ${contact?.linkedin_personal_url ?? "—"}\n` +
    `  Social: ${JSON.stringify(contact?.social_profiles ?? {})}`;
}
|
| 173 |
+
|
| 174 |
+
// ─── /status ─────────────────────────────────────────────────
|
| 175 |
+
|
| 176 |
+
/**
 * Handle `/status`: report pause state, the effective daily quota, today's
 * qualified-lead total, the current territory position and today's run count.
 *
 * Missing config rows fall back to defaults (quota 10, territory "?",
 * city index 0).
 *
 * @returns A Slack-formatted status summary.
 */
async function handleStatus(): Promise<string> {
  const db = getSupabaseClient();
  const paused = await isSystemPaused();

  const quotaRow = await db
    .from("system_config")
    .select("value")
    .eq("key", "daily_quota")
    .single();
  const quota = quotaRow.data?.value as any;

  const territoryRow = await db
    .from("system_config")
    .select("value")
    .eq("key", "current_territory")
    .single();
  const position = territoryRow.data?.value as any;

  // Runs recorded since local midnight
  const midnight = new Date();
  midnight.setHours(0, 0, 0, 0);
  const runsResult = await db
    .from("discovery_runs")
    .select("status, leads_qualified")
    .gte("ran_at", midnight.toISOString());
  const todayRuns = runsResult.data;

  let leadsToday = 0;
  for (const run of (todayRuns ?? []) as any[]) {
    leadsToday += run.leads_qualified ?? 0;
  }

  return [
    "*System Status*",
    `State: ${paused ? "⏸️ PAUSED" : "▶️ RUNNING"}`,
    `Daily Quota: ${quota?.today_override ?? quota?.default ?? 10}`,
    `Leads Today: ${leadsToday}`,
    `Current Territory: ${position?.countryCode ?? "?"} city#${position?.cityIndex ?? 0}`,
    `Runs Today: ${todayRuns?.length ?? 0}`,
  ].join("\n");
}
|
| 201 |
+
|
| 202 |
+
// ─── /pause, /resume ─────────────────────────────────────────
|
| 203 |
+
|
| 204 |
+
/**
 * Handle `/pause`: mark the `auto_mode` config row as paused by Slack.
 *
 * @returns A confirmation, or a failure notice when the update errors (so the
 *          user is never told the system is paused when it is not).
 */
async function handlePause(): Promise<string> {
  const db = getSupabaseClient();
  const { error } = await db.from("system_config").update({
    value: { enabled: true, paused: true, paused_by: "slack" },
    updated_at: new Date().toISOString(),
  }).eq("key", "auto_mode");

  if (error) {
    logger.warn({ err: error }, "/pause failed to update auto_mode");
    return "⚠️ Could not pause the system — the setting was not saved. Please try again.";
  }

  return "⏸️ System paused. Automatic runs will not start.\nType `/resume` to restart.";
}
|
| 212 |
+
|
| 213 |
+
/**
 * Handle `/resume`: clear the paused flag on the `auto_mode` config row.
 *
 * @returns A confirmation, or a failure notice when the update errors (so the
 *          user is never told the system resumed when it did not).
 */
async function handleResume(): Promise<string> {
  const db = getSupabaseClient();
  const { error } = await db.from("system_config").update({
    value: { enabled: true, paused: false, paused_by: null },
    updated_at: new Date().toISOString(),
  }).eq("key", "auto_mode");

  if (error) {
    logger.warn({ err: error }, "/resume failed to update auto_mode");
    return "⚠️ Could not resume the system — the setting was not saved. Please try again.";
  }

  return "▶️ System resumed. Next automatic run will proceed on schedule.";
}
|
| 221 |
+
|
| 222 |
+
// ─── /quota ──────────────────────────────────────────────────
|
| 223 |
+
|
| 224 |
+
/**
 * Handle `/quota <number> [always|permanent]`.
 *
 * The number must be a plain integer from 1 to 100. The optional second word
 * (matched case-insensitively) makes the change permanent; otherwise it
 * applies to today only.
 *
 * @param text Raw text after `/quota`.
 * @returns A confirmation, or the usage hint for invalid input.
 */
async function handleQuota(text: string): Promise<string> {
  const USAGE = "Usage: `/quota 15` (today only) or `/quota 15 always` (permanent)";
  const [amount = "", scope = ""] = text.trim().split(/\s+/);

  // Require digits only: parseInt alone would silently accept "15abc" or
  // "15.5" as 15.
  if (!/^\d+$/.test(amount)) return USAGE;

  const num = Number.parseInt(amount, 10);
  if (num < 1 || num > 100) return USAGE;

  // The raw text is not lower-cased by the router, so normalise the keyword here.
  const keyword = scope.toLowerCase();
  const permanent = keyword === "always" || keyword === "permanent";
  await setQuotaOverride(num, permanent);

  return permanent
    ? `✅ Daily quota permanently set to ${num} leads/day`
    : `✅ Today's quota set to ${num} leads. Tomorrow back to default.`;
}
|
| 239 |
+
|
| 240 |
+
// ─── Helpers ───────────────────────────��─────────────────────
|
| 241 |
+
|
| 242 |
+
/**
 * Extract `key:value` pairs from free text.
 *
 * A key is a run of word characters; a value is everything after the colon up
 * to the next whitespace. When a key occurs more than once, the last
 * occurrence wins.
 *
 * @param text Text to scan.
 * @returns Map of key → value; empty when no pair is present.
 */
function parseParams(text: string): Record<string, string> {
  const pairPattern = /(\w+):(\S+)/g;
  const parsed: Record<string, string> = {};

  let hit: RegExpExecArray | null;
  while ((hit = pairPattern.exec(text)) !== null) {
    const [, key, value] = hit;
    parsed[key] = value;
  }

  return parsed;
}
|
|
@@ -0,0 +1,273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Slack Service — 3-Layer Data Delivery
|
| 3 |
+
*
|
| 4 |
+
* Layer 1: Daily Digest (1 rich message per day — summary table)
|
| 5 |
+
* Layer 2: Real-time Alerts (only HOT leads 85+ — immediate)
|
| 6 |
+
* Layer 3: Commands (/leads, /discover, /status, /pause, /quota)
|
| 7 |
+
*
|
| 8 |
+
* NOT Slack blast — organized, formatted, actionable.
|
| 9 |
+
*/
|
| 10 |
+
|
| 11 |
+
import axios from "axios";
|
| 12 |
+
import { getEnv } from "../../shared/config/env";
|
| 13 |
+
import { getSupabaseClient } from "../../shared/supabase/client";
|
| 14 |
+
import { logger } from "../../shared/utils/logger";
|
| 15 |
+
|
| 16 |
+
// ─── Slack API helper ────────────────────────────────────────
|
| 17 |
+
|
| 18 |
+
/**
 * Post a Block Kit message to a Slack channel via `chat.postMessage`.
 *
 * Delivery is best-effort: transport errors and Slack-level rejections are
 * logged as warnings and never thrown to the caller.
 *
 * @param channelId Target channel ID.
 * @param blocks    Block Kit blocks for the message body.
 * @param text      Plain-text fallback sent alongside the blocks.
 */
async function postMessage(channelId: string, blocks: unknown[], text: string): Promise<void> {
  const env = getEnv();
  try {
    const response = await axios.post("https://slack.com/api/chat.postMessage", {
      channel: channelId,
      text,
      blocks,
    }, {
      headers: { Authorization: `Bearer ${env.SLACK_BOT_TOKEN}` },
      timeout: 5_000,
    });

    // Slack reports API failures (bad token, unknown channel, invalid blocks)
    // as HTTP 200 with `{ ok: false, error }`, which axios does not throw on.
    if (response.data?.ok === false) {
      logger.warn({ slackError: response.data.error, channelId }, "Slack post rejected — non-critical");
    }
  } catch (err) {
    logger.warn({ err }, "Slack post failed — non-critical");
  }
}
|
| 33 |
+
|
| 34 |
+
// ─── LAYER 1: Daily Digest ───────────────────────────────────
|
| 35 |
+
|
| 36 |
+
/**
 * Post the daily digest: a header, the run's summary statistics, and up to 20
 * of today's lead scores (highest first), followed by a command hint.
 *
 * The lead list is packed into as many section blocks as needed so that no
 * single section exceeds Slack's text limit.
 *
 * @param runSummary Aggregate figures for the run being reported.
 */
export async function sendDailyDigest(runSummary: {
  territory: string;
  industry: string;
  companiesSearched: number;
  leadsQualified: number;
  hotLeads: number;
  warmLeads: number;
  nurtureLeads: number;
  tokensUsed: number;
  durationMinutes: number;
}): Promise<void> {
  // Slack rejects a section block whose text exceeds 3000 characters.
  const SECTION_TEXT_LIMIT = 3000;

  const env = getEnv();
  const db = getSupabaseClient();

  // Fetch lead scores created since local midnight
  const today = new Date();
  today.setHours(0, 0, 0, 0);

  // NOTE(review): filters on lead_scores.created_at and embeds companies.city /
  // companies.service_match / contacts.linkedin_personal_url — confirm the
  // deployed schema has these columns.
  const { data: leads, error } = await db
    .from("lead_scores")
    .select(`
      total_score, tier,
      companies (name, domain, industry, employee_count, city, service_match),
      contacts (full_name, title, email, email_verified, linkedin_personal_url)
    `)
    .gte("created_at", today.toISOString())
    .order("total_score", { ascending: false })
    .limit(20);

  if (error) {
    logger.warn({ err: error }, "Daily digest lead query failed — sending summary without lead list");
  }

  // One mrkdwn entry per lead
  const leadEntries: string[] = (leads ?? []).map((lead: any) => {
    const emoji = lead.tier === "hot" ? "🔥" : lead.tier === "warm" ? "✅" : "📋";
    const company = lead.companies;
    const contact = lead.contacts;
    const emailIcon = contact?.email_verified ? "📧✓" : contact?.email ? "📧" : "—";
    const linkedinIcon = contact?.linkedin_personal_url ? "💼✓" : "—";

    return `${emoji} *${company?.name ?? "Unknown"}* — ${lead.total_score}/100 ${lead.tier.toUpperCase()}\n` +
      `    ${company?.industry ?? "?"} · ${company?.employee_count ?? "?"} emp · ${company?.city ?? "?"}\n` +
      `    ${contact?.full_name ?? "?"} (${contact?.title ?? "?"})\n` +
      `    ${emailIcon} ${linkedinIcon} · Match: ${company?.service_match ?? "—"}`;
  });

  // Greedily pack entries into section-sized chunks, never splitting an entry
  // across two sections.
  const leadSectionTexts: string[] = [];
  let pending = "";
  for (const rawEntry of leadEntries) {
    const entry = rawEntry.length > SECTION_TEXT_LIMIT
      ? `${rawEntry.slice(0, SECTION_TEXT_LIMIT - 1)}…`
      : rawEntry;
    const merged = pending ? `${pending}\n\n${entry}` : entry;
    if (merged.length <= SECTION_TEXT_LIMIT) {
      pending = merged;
    } else {
      leadSectionTexts.push(pending);
      pending = entry;
    }
  }
  if (pending) leadSectionTexts.push(pending);
  if (leadSectionTexts.length === 0) {
    leadSectionTexts.push(error ? "_Lead list unavailable — query failed_" : "_No qualified leads found today_");
  }

  const blocks: unknown[] = [
    // Header
    {
      type: "header",
      text: { type: "plain_text", text: `📊 Daily Lead Report — ${formatDate(new Date())}` },
    },
    // Summary stats
    {
      type: "section",
      text: {
        type: "mrkdwn",
        text: `*Territory:* ${runSummary.territory} → ${runSummary.industry}\n` +
          `*Searched:* ${runSummary.companiesSearched} companies\n` +
          `*Qualified:* ${runSummary.leadsQualified} leads ` +
          `(🔥 ${runSummary.hotLeads} hot · ✅ ${runSummary.warmLeads} warm · 📋 ${runSummary.nurtureLeads} nurture)\n` +
          `*Duration:* ${runSummary.durationMinutes} min · *Tokens:* ${runSummary.tokensUsed.toLocaleString()}`,
      },
    },
    { type: "divider" },
    // Lead list (one or more sections)
    ...leadSectionTexts.map((sectionText) => ({
      type: "section",
      text: { type: "mrkdwn", text: sectionText },
    })),
    { type: "divider" },
    // Actions
    {
      type: "context",
      elements: [
        {
          type: "mrkdwn",
          text: "Type `/leads` for full details · `/discover region:UK` for manual run · `/status` for system status",
        },
      ],
    },
  ];

  await postMessage(env.SLACK_ALERT_CHANNEL_ID, blocks,
    `Daily Report: ${runSummary.leadsQualified} leads found`);
}
|
| 122 |
+
|
| 123 |
+
// ─── LAYER 2: Hot Lead Alert (85+ only) ──────────────────────
|
| 124 |
+
|
| 125 |
+
/**
 * Post an immediate alert for a single high-scoring lead: headline, key facts,
 * decision maker, available contact channels, outreach angle and pain points.
 *
 * @param lead Flattened lead details to render.
 */
export async function sendHotLeadAlert(lead: {
  companyName: string;
  domain: string;
  industry: string;
  employeeCount: number | null;
  city: string | null;
  score: number;
  tier: string;
  contactName: string;
  contactTitle: string;
  email: string | null;
  emailVerified: boolean;
  linkedinPersonal: string | null;
  linkedinCompany: string | null;
  serviceMatch: string | null;
  outreachAngle: string;
  painPoints: string[];
  socialProfiles: Record<string, string | null>;
}): Promise<void> {
  // Slack rejects a header block whose text exceeds 150 characters.
  const HEADER_TEXT_LIMIT = 150;

  const env = getEnv();
  const emoji = lead.score >= 90 ? "🔥🔥🔥" : lead.score >= 85 ? "🔥🔥" : "🔥";

  // Contact channels summary
  const channels: string[] = [];
  if (lead.email && lead.emailVerified) channels.push(`📧 ${lead.email} ✓`);
  else if (lead.email) channels.push(`📧 ${lead.email} (unverified)`);
  if (lead.linkedinPersonal) channels.push(`💼 <${lead.linkedinPersonal}|LinkedIn>`);
  if (lead.linkedinCompany) channels.push(`🏢 <${lead.linkedinCompany}|Company LI>`);
  if (lead.socialProfiles?.instagram) channels.push(`📷 <${lead.socialProfiles.instagram}|Instagram>`);
  if (lead.socialProfiles?.facebook) channels.push(`👥 <${lead.socialProfiles.facebook}|Facebook>`);

  // Keep a very long company name from invalidating the whole message.
  const rawHeader = `${emoji} HOT LEAD — ${lead.companyName}`;
  const headerText = rawHeader.length > HEADER_TEXT_LIMIT
    ? `${rawHeader.slice(0, HEADER_TEXT_LIMIT - 1)}…`
    : rawHeader;

  // Mirror the "None found" fallback used for channels when there is nothing to list.
  const painPointList = lead.painPoints.map(p => `• ${p}`).join("\n") || "None identified";

  const blocks = [
    {
      type: "header",
      text: { type: "plain_text", text: headerText },
    },
    {
      type: "section",
      fields: [
        { type: "mrkdwn", text: `*Score:*\n${lead.score}/100 — ${lead.tier.toUpperCase()}` },
        { type: "mrkdwn", text: `*Industry:*\n${lead.industry}` },
        { type: "mrkdwn", text: `*Employees:*\n${lead.employeeCount ?? "Unknown"}` },
        { type: "mrkdwn", text: `*Location:*\n${lead.city ?? "Unknown"}` },
        { type: "mrkdwn", text: `*Service Match:*\n${lead.serviceMatch ?? "General"}` },
        { type: "mrkdwn", text: `*Domain:*\n${lead.domain}` },
      ],
    },
    { type: "divider" },
    {
      type: "section",
      text: {
        type: "mrkdwn",
        text: `*👤 Decision Maker:*\n${lead.contactName} — ${lead.contactTitle}\n\n` +
          `*📱 Channels:*\n${channels.join("\n") || "None found"}\n\n` +
          `*🎯 Outreach Angle:*\n_"${lead.outreachAngle}"_\n\n` +
          `*💢 Pain Points:*\n${painPointList}`,
      },
    },
  ];

  await postMessage(env.SLACK_ALERT_CHANNEL_ID, blocks,
    `🔥 HOT LEAD: ${lead.companyName} — Score ${lead.score}`);
}
|
| 188 |
+
|
| 189 |
+
// ─── LAYER 2: Run Progress Updates ──────────────────────────
|
| 190 |
+
|
| 191 |
+
/**
 * Announce in the alert channel that a daily run has started.
 *
 * @param territory Territory label shown in the message.
 * @param industry  Industry label shown in the message.
 * @param quota     Lead quota shown in the message.
 */
export async function sendRunStarted(territory: string, industry: string, quota: number): Promise<void> {
  const env = getEnv();

  const body = [
    "🚀 *Daily run started*",
    `Territory: ${territory} → ${industry}`,
    `Quota: ${quota} leads`,
    "Estimated: ~90 min",
  ].join("\n");

  const announcement = {
    type: "section",
    text: { type: "mrkdwn", text: body },
  };

  await postMessage(
    env.SLACK_ALERT_CHANNEL_ID,
    [announcement],
    `Run started: ${territory} ${industry}`,
  );
}
|
| 206 |
+
|
| 207 |
+
/**
 * Post a progress update with a 10-segment bar to the alert channel.
 *
 * The percentage is reported as computed (it may exceed 100 when the quota is
 * overshot); the bar itself is clamped to 0–10 segments so that overshoot or a
 * zero quota cannot make `String.prototype.repeat` throw.
 *
 * @param qualified Leads qualified so far.
 * @param quota     Target number of leads for the run.
 * @param searched  Companies searched so far.
 */
export async function sendRunProgress(qualified: number, quota: number, searched: number): Promise<void> {
  const BAR_SEGMENTS = 10;
  const env = getEnv();

  // A zero/negative quota or a non-finite count is shown as 0% rather than
  // NaN/Infinity.
  const ratio = quota > 0 && Number.isFinite(qualified) ? qualified / quota : 0;
  const progress = Math.round(ratio * 100);

  const filled = Math.min(BAR_SEGMENTS, Math.max(0, Math.round(progress / 10)));
  const bar = "█".repeat(filled) + "░".repeat(BAR_SEGMENTS - filled);

  await postMessage(env.SLACK_ALERT_CHANNEL_ID, [
    {
      type: "section",
      text: {
        type: "mrkdwn",
        text: `📊 *Progress:* ${qualified}/${quota} qualified [${bar}] ${progress}%\n` +
          `Searched: ${searched} companies`,
      },
    },
  ], `Progress: ${qualified}/${quota}`);
}
|
| 223 |
+
|
| 224 |
+
// ─── LAYER 3: Clarifying Questions ──────────────────────────
|
| 225 |
+
|
| 226 |
+
/**
 * Post a set of numbered multiple-choice questions to the alert channel in
 * response to a free-text request.
 *
 * @param userMessage The user's original text, echoed back in the intro.
 * @param questions   Questions to ask, each with its selectable options.
 */
export async function sendClarifyingQuestions(
  userMessage: string,
  questions: { question: string; options: string[] }[]
): Promise<void> {
  const env = getEnv();

  const intro = {
    type: "section",
    text: {
      type: "mrkdwn",
      text: `🤔 *Got it: "${userMessage}"*\nMujhe kuch clarify karna hai:`,
    },
  };

  // One section per question, options numbered from 1
  const questionBlocks = questions.map(({ question, options }) => {
    const numbered = options.map((option, index) => `${index + 1}. ${option}`).join("\n");
    return {
      type: "section",
      text: { type: "mrkdwn", text: `*${question}*\n${numbered}` },
    };
  });

  const replyHint = {
    type: "context",
    elements: [{
      type: "mrkdwn",
      text: "Just reply with numbers (e.g., `1 2 3`) or type your own answer",
    }],
  };

  const blocks: unknown[] = [intro, ...questionBlocks, replyHint];

  await postMessage(env.SLACK_ALERT_CHANNEL_ID, blocks,
    "Clarifying questions for manual discovery");
}
|
| 263 |
+
|
| 264 |
+
// ─── Helpers ─────────────────────────────────────────────────
|
| 265 |
+
|
| 266 |
+
/**
 * Format a date in long US-English form, e.g. "Monday, January 15, 2024".
 *
 * @param date Date to format (rendered in the runtime's local timezone).
 * @returns The formatted date string.
 */
function formatDate(date: Date): string {
  const longForm: Intl.DateTimeFormatOptions = {
    weekday: "long",
    year: "numeric",
    month: "long",
    day: "numeric",
  };
  const formatted = date.toLocaleDateString("en-US", longForm);
  return formatted;
}
|
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Trigger.dev entry point — registers all tasks.
|
| 3 |
+
* This file must export all tasks for Trigger.dev to discover them.
|
| 4 |
+
*/
|
| 5 |
+
|
| 6 |
+
export { autoDiscoveryTask, autoDiscoverySchedule } from "./discovery/trigger-tasks/auto-discovery";
|
| 7 |
+
export { manualDiscoveryTask } from "./discovery/trigger-tasks/manual-discovery";
|
| 8 |
+
export { profilingTask } from "./profiling/trigger-tasks/profiling-router";
|
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- ============================================================
|
| 2 |
+
-- AI Client Acquisition System — Supabase Schema
|
| 3 |
+
-- Run this in Supabase SQL Editor
|
| 4 |
+
-- ============================================================
|
| 5 |
+
|
| 6 |
+
-- Required extensions:
--   pgcrypto — UUID generation
--   pg_trgm  — trigram operators for fuzzy name matching
CREATE EXTENSION IF NOT EXISTS pgcrypto;
CREATE EXTENSION IF NOT EXISTS pg_trgm;
|
| 9 |
+
|
| 10 |
+
-- ─── ENUMS ──────────────────────────────────────────────────
|
| 11 |
+
|
| 12 |
+
-- Lifecycle of a company record
CREATE TYPE company_status AS ENUM (
    'discovered',
    'researching',
    'profiled',
    'qualified',
    'nurture',
    'archived',
    'suppressed'
);

-- State of a contact record
CREATE TYPE contact_status AS ENUM (
    'found',
    'email_verified',
    'email_invalid',
    'linkedin_only',
    'suppressed'
);

-- Tier assigned to a scored lead
CREATE TYPE lead_tier AS ENUM (
    'hot',
    'warm',
    'nurture',
    'archive'
);

-- Channel an outreach message is sent through
CREATE TYPE outreach_channel AS ENUM (
    'email',
    'linkedin'
);

-- Delivery state of an outreach message
CREATE TYPE outreach_status AS ENUM (
    'queued',
    'sent',
    'opened',
    'replied',
    'bounced',
    'failed',
    'review_needed'
);

-- Classified intent of an inbound reply
CREATE TYPE intent_type AS ENUM (
    'interested',
    'question',
    'not_now',
    'not_interested',
    'out_of_office',
    'wrong_person',
    'unknown'
);

-- Outcome of a human review
CREATE TYPE review_status AS ENUM (
    'pending',
    'approved',
    'rejected',
    'edited'
);
|
| 37 |
+
|
| 38 |
+
-- ─── CORE TABLES ────────────────────────────────────────────
|
| 39 |
+
|
| 40 |
+
-- ICP (ideal customer profile) configuration; per the original note it is
-- editable from the dashboard.
CREATE TABLE icp_config (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    name TEXT NOT NULL DEFAULT 'default',
    min_employees INTEGER NOT NULL DEFAULT 50,
    industries TEXT[] NOT NULL DEFAULT ARRAY[
        'technology', 'manufacturing', 'logistics', 'healthcare', 'finance'
    ],
    exclude_industries TEXT[] NOT NULL DEFAULT ARRAY[
        'government', 'non-profit', 'education'
    ],
    geographies TEXT[] NOT NULL DEFAULT ARRAY[
        'US', 'UK', 'AU', 'UAE', 'SA'
    ],
    keywords TEXT[] NOT NULL DEFAULT ARRAY[
        'automation', 'digital transformation', 'AI', 'operations'
    ],
    -- Nullable, unlike the other array columns
    tech_signals TEXT[] DEFAULT ARRAY[
        'salesforce', 'hubspot', 'legacy_erp', 'sap'
    ],
    score_threshold INTEGER NOT NULL DEFAULT 70,
    is_active BOOLEAN NOT NULL DEFAULT TRUE,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
|
| 55 |
+
|
| 56 |
+
-- Weekly rotation state: one row per rotation period
CREATE TABLE rotation_state (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    -- Rotation slot: 1=USA, 2=UK, 3=AU, 4=Gulf (not enforced by a constraint)
    week_number INTEGER NOT NULL,
    region TEXT NOT NULL,
    started_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- NULL until the period is marked complete
    completed_at TIMESTAMPTZ,
    companies_found INTEGER DEFAULT 0,
    leads_qualified INTEGER DEFAULT 0
);
|
| 66 |
+
|
| 67 |
+
-- Companies discovered; `domain` is the natural key (one row per domain)
CREATE TABLE companies (
  id                UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  domain            TEXT NOT NULL UNIQUE,
  name              TEXT NOT NULL,
  industry          TEXT,
  employee_count    INTEGER,
  employee_range    TEXT,                   -- "50-200", "200-500" etc
  description       TEXT,
  website_url       TEXT,
  linkedin_url      TEXT,
  country           TEXT,
  region            TEXT,
  tech_stack        TEXT[],
  growth_signals    JSONB DEFAULT '[]',     -- job posts, news, funding
  raw_data          JSONB DEFAULT '{}',
  source            TEXT NOT NULL,          -- 'serper', 'linkedin', 'manual'
  status            company_status NOT NULL DEFAULT 'discovered',
  discovered_at     TIMESTAMPTZ NOT NULL DEFAULT NOW(),
  updated_at        TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- No separate index on domain: the UNIQUE constraint on companies.domain
-- already creates a unique index that serves lookups by domain, so a second
-- index would only add write and storage overhead.
CREATE INDEX idx_companies_status ON companies(status);
CREATE INDEX idx_companies_country ON companies(country);
-- Trigram index for fuzzy / substring matching on company names
CREATE INDEX idx_companies_name_trgm ON companies USING GIN (name gin_trgm_ops);
|
| 93 |
+
|
| 94 |
+
-- Contacts (decision-makers); rows are removed together with their company
CREATE TABLE contacts (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    company_id UUID NOT NULL REFERENCES companies(id) ON DELETE CASCADE,
    full_name TEXT NOT NULL,
    first_name TEXT,
    last_name TEXT,
    title TEXT NOT NULL,
    -- 'c_suite', 'vp', 'director', 'manager'
    seniority TEXT,
    email TEXT,
    email_verified BOOLEAN DEFAULT FALSE,
    -- 'hunter', 'snov', 'pattern'
    email_source TEXT,
    linkedin_url TEXT,
    linkedin_verified BOOLEAN DEFAULT FALSE,
    status contact_status NOT NULL DEFAULT 'found',
    suppressed BOOLEAN NOT NULL DEFAULT FALSE,
    suppressed_at TIMESTAMPTZ,
    suppressed_reason TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX idx_contacts_company ON contacts (company_id);
CREATE INDEX idx_contacts_email ON contacts (email);
CREATE INDEX idx_contacts_suppressed ON contacts (suppressed);
|
| 119 |
+
|
| 120 |
+
-- Evidence gathered during research; rows are removed together with their company
CREATE TABLE evidence (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    company_id UUID NOT NULL REFERENCES companies(id) ON DELETE CASCADE,
    -- 'job_posting', 'news', 'social_post', 'website_text'
    type TEXT NOT NULL,
    content TEXT NOT NULL,
    source_url TEXT,
    -- Does this item mention AI/automation?
    ai_signal BOOLEAN DEFAULT FALSE,
    collected_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX idx_evidence_company ON evidence (company_id);
|
| 132 |
+
|
| 133 |
+
-- Lead profiles (LLM output); rows are removed together with their company
CREATE TABLE lead_profiles (
  id                UUID PRIMARY KEY DEFAULT gen_random_uuid(),
  company_id        UUID NOT NULL REFERENCES companies(id) ON DELETE CASCADE,
  profile_summary   TEXT NOT NULL,
  pain_points       TEXT[] DEFAULT '{}',
  ai_use_case       TEXT,
  ai_readiness      TEXT NOT NULL DEFAULT 'medium',  -- low/medium/high
  outreach_angle    TEXT,
  llm_model         TEXT NOT NULL,
  llm_confidence    NUMERIC(3,2),
  is_fallback       BOOLEAN DEFAULT FALSE,
  created_at        TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

-- Profiles are looked up by company, and PostgreSQL does not index foreign-key
-- columns automatically; this matches the company_id indexes on the other
-- child tables.
CREATE INDEX idx_lead_profiles_company ON lead_profiles(company_id);
|
| 147 |
+
|
| 148 |
+
-- Lead scores; rows are removed together with their company.
-- The component ranges noted below are documentation only — just total_score
-- carries a CHECK constraint.
CREATE TABLE lead_scores (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    company_id UUID NOT NULL REFERENCES companies(id) ON DELETE CASCADE,
    contact_id UUID REFERENCES contacts(id),
    total_score INTEGER NOT NULL CHECK (total_score BETWEEN 0 AND 100),
    tier lead_tier NOT NULL,
    -- 0-25
    company_fit INTEGER,
    -- 0-25
    ai_readiness INTEGER,
    -- 0-20
    decision_maker INTEGER,
    -- 0-15
    growth_signal INTEGER,
    -- 0-15
    engagement_potential INTEGER,
    score_reasoning TEXT,
    scored_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);

CREATE INDEX idx_scores_company ON lead_scores (company_id);
CREATE INDEX idx_scores_tier ON lead_scores (tier);
CREATE INDEX idx_scores_total ON lead_scores (total_score DESC);
|
| 167 |
+
|
| 168 |
+
-- ─── OUTREACH TABLES ────────────────────────────────────────
|
| 169 |
+
|
| 170 |
+
-- Multi-step outreach sequence for one contact at one company
CREATE TABLE outreach_sequences (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    company_id UUID NOT NULL REFERENCES companies(id) ON DELETE CASCADE,
    contact_id UUID NOT NULL REFERENCES contacts(id),
    current_step INTEGER NOT NULL DEFAULT 0,
    total_steps INTEGER NOT NULL DEFAULT 5,
    next_action_at TIMESTAMPTZ,
    -- active / paused / completed / stopped
    status TEXT NOT NULL DEFAULT 'active',
    stopped_reason TEXT,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
|
| 182 |
+
|
| 183 |
+
-- Log of individual outreach messages, one row per message.
-- message_hash is unique so the same message cannot be recorded twice.
CREATE TABLE outreach_log (
    id            UUID             DEFAULT gen_random_uuid(),
    sequence_id   UUID,
    company_id    UUID             NOT NULL,
    contact_id    UUID             NOT NULL,
    channel       outreach_channel NOT NULL,
    step_number   INTEGER          NOT NULL,
    template_id   TEXT,
    message_hash  TEXT             NOT NULL,  -- prevent duplicate sends
    subject       TEXT,
    status        outreach_status  NOT NULL DEFAULT 'queued',
    provider_id   TEXT,                       -- external message ID from Resend/LinkedIn
    sent_at       TIMESTAMPTZ,
    opened_at     TIMESTAMPTZ,
    replied_at    TIMESTAMPTZ,
    bounced_at    TIMESTAMPTZ,
    created_at    TIMESTAMPTZ      NOT NULL DEFAULT NOW(),

    PRIMARY KEY (id),
    UNIQUE (message_hash),
    FOREIGN KEY (sequence_id) REFERENCES outreach_sequences (id),
    FOREIGN KEY (company_id)  REFERENCES companies (id),
    FOREIGN KEY (contact_id)  REFERENCES contacts (id)
);
|
| 201 |
+
|
| 202 |
+
-- Lookup indexes for outreach_log.
-- No separate index on message_hash: its UNIQUE constraint is already backed
-- by a unique index, so a second plain index on the same column would only
-- add write and storage overhead.
CREATE INDEX idx_outreach_company ON outreach_log(company_id);
CREATE INDEX idx_outreach_status ON outreach_log(status);
|
| 205 |
+
|
| 206 |
+
-- ─── ENGAGEMENT TABLES ──────────────────────────────────────
|
| 207 |
+
|
| 208 |
+
-- Engagement signals detected for a contact, optionally linked to the
-- outreach_log row that produced them.
CREATE TABLE engagement_log (
    id           UUID        DEFAULT gen_random_uuid(),
    outreach_id  UUID,
    company_id   UUID        NOT NULL,
    contact_id   UUID        NOT NULL,
    signal_type  TEXT        NOT NULL,  -- 'open','reply','bounce','linkedin_accept'
    intent       intent_type,
    raw_content  TEXT,                  -- actual reply text (for NLP)
    detected_at  TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    PRIMARY KEY (id),
    FOREIGN KEY (outreach_id) REFERENCES outreach_log (id),
    FOREIGN KEY (company_id)  REFERENCES companies (id),
    FOREIGN KEY (contact_id)  REFERENCES contacts (id)
);
|
| 218 |
+
|
| 219 |
+
-- ─── SYSTEM TABLES ──────────────────────────────────────────
|
| 220 |
+
|
| 221 |
+
-- Addresses and domains that must not be contacted.
-- A row may carry an email, a domain, or both.
CREATE TABLE suppression_list (
    id        UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    email     TEXT,
    domain    TEXT,
    reason    TEXT NOT NULL,  -- 'unsubscribed','bounced','manual'
    added_at  TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- A row with neither an email nor a domain suppresses nothing; reject it.
    CHECK (email IS NOT NULL OR domain IS NOT NULL)
);
|
| 228 |
+
|
| 229 |
+
-- Lookup indexes for suppression checks by address and by domain.
CREATE INDEX idx_suppression_email  ON suppression_list USING btree (email);
CREATE INDEX idx_suppression_domain ON suppression_list USING btree (domain);
|
| 231 |
+
|
| 232 |
+
-- Items awaiting a human decision. payload carries the context for the
-- reviewer; new rows start with status 'pending'.
CREATE TABLE human_review_queue (
    id              UUID          DEFAULT gen_random_uuid(),
    type            TEXT          NOT NULL,  -- 'outreach_approval','score_anomaly','escalation'
    company_id      UUID,
    contact_id      UUID,
    payload         JSONB         NOT NULL,  -- full context for reviewer
    status          review_status NOT NULL DEFAULT 'pending',
    reviewer_notes  TEXT,
    resolved_at     TIMESTAMPTZ,
    created_at      TIMESTAMPTZ   NOT NULL DEFAULT NOW(),

    PRIMARY KEY (id),
    FOREIGN KEY (company_id) REFERENCES companies (id),
    FOREIGN KEY (contact_id) REFERENCES contacts (id)
);
|
| 243 |
+
|
| 244 |
+
-- Lookup index for filtering the review queue by status.
CREATE INDEX idx_review_status ON human_review_queue USING btree (status);
|
| 245 |
+
|
| 246 |
+
-- One row per external API call: provider, credits consumed, and outcome.
CREATE TABLE api_usage_log (
    id            UUID        DEFAULT gen_random_uuid(),
    provider      TEXT        NOT NULL,  -- 'serper','hunter','snov','reoon'
    endpoint      TEXT,
    credits_used  INTEGER     DEFAULT 1,
    success       BOOLEAN     NOT NULL,
    error_msg     TEXT,
    called_at     TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    PRIMARY KEY (id)
);
|
| 255 |
+
|
| 256 |
+
-- Audit trail: which actor performed which action on which entity.
-- actor defaults to 'system'; details defaults to an empty JSON object.
CREATE TABLE audit_log (
    id           UUID        DEFAULT gen_random_uuid(),
    action       TEXT        NOT NULL,
    entity_type  TEXT        NOT NULL,
    entity_id    UUID,
    actor        TEXT        NOT NULL DEFAULT 'system',
    details      JSONB       DEFAULT '{}'::jsonb,
    created_at   TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    PRIMARY KEY (id)
);
|
| 265 |
+
|
| 266 |
+
-- ─── SEED DATA ──────────────────────────────────────────────
|
| 267 |
+
|
| 268 |
+
-- Seed the 'default' ICP configuration row.
-- NOTE(review): 'UK' and 'UAE' are not ISO 3166-1 alpha-2 codes (those are
-- 'GB' and 'AE'); confirm which form the consumers of geographies expect.
INSERT INTO icp_config (
    name,
    score_threshold,
    min_employees,
    geographies,
    industries,
    keywords
)
VALUES (
    'default',
    70,
    50,
    ARRAY['US','UK','AU','UAE','SA','SG'],
    ARRAY[
        'technology','software','manufacturing','logistics','supply_chain',
        'healthcare','finance','real_estate_tech','retail_tech'
    ],
    ARRAY[
        'automation','digital transformation','AI','machine learning',
        'operations','workflow','efficiency'
    ]
);
|
| 277 |
+
|
| 278 |
+
-- Seed the initial rotation state: week 1, region 'US'.
INSERT INTO rotation_state (region, week_number)
VALUES ('US', 1);
|
|
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
-- ============================================================
|
| 2 |
+
-- Migration 002 — Phase 1 Enhancements
|
| 3 |
+
-- Territory management, service profiles, social profiles,
|
| 4 |
+
-- discovery run tracking, pipeline checkpoints, LLM traces
|
| 5 |
+
-- ============================================================
|
| 6 |
+
|
| 7 |
+
-- ─── SERVICE PROFILES ────────────────────────────────────────
|
| 8 |
+
-- What services WE offer → what industries → what pain signals to look for
|
| 9 |
+
|
| 10 |
+
-- Catalogue of offered services: the industries each one targets, the
-- pain signals to look for, and the score boost applied on a match.
-- IF NOT EXISTS keeps this statement rerunnable, in line with the
-- ADD COLUMN IF NOT EXISTS statements used later in this migration.
CREATE TABLE IF NOT EXISTS service_profiles (
    id                 UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    service_name       TEXT NOT NULL UNIQUE,         -- 'AI Receptionist'
    description        TEXT,
    target_industries  TEXT[] NOT NULL,              -- ['dental','medical','legal','salon']
    min_employees      INTEGER DEFAULT 3,
    max_employees      INTEGER DEFAULT 500,
    pain_signals       TEXT[] NOT NULL,              -- website signals to detect
    score_boost        INTEGER NOT NULL DEFAULT 15,  -- points added when matched
    outreach_keywords  TEXT[],                       -- words to use in outreach
    is_active          BOOLEAN DEFAULT true,
    created_at         TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    -- An inverted employee range can never match a company. The check passes
    -- when either bound is NULL.
    CHECK (min_employees <= max_employees)
);
|
| 23 |
+
|
| 24 |
+
-- Seed service profiles
|
| 25 |
+
-- Seed the five service profiles. max_employees is omitted, so each row
-- takes the column default.
-- ON CONFLICT keeps the seed rerunnable: a profile whose service_name already
-- exists is left untouched instead of raising a unique violation.
INSERT INTO service_profiles (
    service_name,
    description,
    target_industries,
    min_employees,
    pain_signals,
    score_boost,
    outreach_keywords
)
VALUES
    ('AI Receptionist',
     'Automated phone answering, appointment booking, 24/7 availability',
     ARRAY['dental','medical','veterinary','legal','salon','spa','real_estate','accounting','chiropractic'],
     3,
     ARRAY['phone number prominent','book appointment','call us','receptionist','front desk','office hours','schedule a visit'],
     20,
     ARRAY['missed calls','after hours','appointment booking','front desk costs']),

    ('AI Customer Support',
     'Chatbot, ticket automation, FAQ automation',
     ARRAY['ecommerce','saas','retail','hospitality','travel','insurance','telecom'],
     10,
     ARRAY['contact form','support email','FAQ page','help center','no chatbot','submit a ticket'],
     15,
     ARRAY['support costs','response time','ticket volume','customer satisfaction']),

    ('AI Data Processing',
     'Document processing, report automation, ERP modernization',
     ARRAY['manufacturing','logistics','finance','healthcare','supply_chain','construction','energy'],
     50,
     ARRAY['legacy ERP','SAP','manual reporting','spreadsheet','data entry','compliance reporting'],
     25,
     ARRAY['manual processes','reporting overhead','data accuracy','compliance automation']),

    ('AI Sales Automation',
     'Outreach automation, CRM enrichment, lead scoring',
     ARRAY['b2b_saas','consulting','recruitment','insurance','financial_services','marketing_agency'],
     10,
     ARRAY['sales team','CRM','outbound','SDR','BDR','sales development','pipeline'],
     20,
     ARRAY['pipeline velocity','lead qualification','outbound efficiency','sales productivity']),

    ('AI Workflow Automation',
     'General process automation, integration, workflow optimization',
     ARRAY['technology','professional_services','education','media','nonprofit_large','government_contractor'],
     20,
     ARRAY['manual process','approval workflow','internal tools','legacy system','multiple platforms'],
     15,
     ARRAY['operational efficiency','process bottlenecks','tool consolidation','workflow speed'])
ON CONFLICT (service_name) DO NOTHING;
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
-- ─── TERRITORY GRID ──────────────────────────────────────────
|
| 69 |
+
-- Every city × industry = one territory unit
|
| 70 |
+
|
| 71 |
+
-- One row per city; (country_code, city) is unique.
-- IF NOT EXISTS keeps this statement rerunnable, in line with the
-- ADD COLUMN IF NOT EXISTS statements used later in this migration.
CREATE TABLE IF NOT EXISTS territory_grid (
    id            UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    country       TEXT NOT NULL,
    country_code  TEXT NOT NULL,
    city          TEXT NOT NULL,
    -- 1=major city, 2=mid, 3=small; reject anything outside that range.
    tier          INTEGER NOT NULL DEFAULT 1 CHECK (tier BETWEEN 1 AND 3),
    timezone      TEXT,                    -- 'America/New_York'
    is_active     BOOLEAN DEFAULT true,

    UNIQUE (country_code, city)
);
|
| 82 |
+
|
| 83 |
+
-- Seed US Tier 1 cities
|
| 84 |
+
-- US seed cities, all tier 1.
-- ON CONFLICT keeps the seed rerunnable: cities already present are skipped
-- instead of violating UNIQUE(country_code, city).
INSERT INTO territory_grid (country, country_code, city, tier, timezone) VALUES
    ('United States', 'US', 'New York',      1, 'America/New_York'),
    ('United States', 'US', 'Los Angeles',   1, 'America/Los_Angeles'),
    ('United States', 'US', 'Chicago',       1, 'America/Chicago'),
    ('United States', 'US', 'Houston',       1, 'America/Chicago'),
    ('United States', 'US', 'Phoenix',       1, 'America/Phoenix'),
    ('United States', 'US', 'Philadelphia',  1, 'America/New_York'),
    ('United States', 'US', 'San Antonio',   1, 'America/Chicago'),
    ('United States', 'US', 'San Diego',     1, 'America/Los_Angeles'),
    ('United States', 'US', 'Dallas',        1, 'America/Chicago'),
    ('United States', 'US', 'Austin',        1, 'America/Chicago'),
    ('United States', 'US', 'San Francisco', 1, 'America/Los_Angeles'),
    ('United States', 'US', 'Seattle',       1, 'America/Los_Angeles'),
    ('United States', 'US', 'Denver',        1, 'America/Denver'),
    ('United States', 'US', 'Boston',        1, 'America/New_York'),
    ('United States', 'US', 'Miami',         1, 'America/New_York')
ON CONFLICT (country_code, city) DO NOTHING;
|
| 100 |
+
|
| 101 |
+
-- UK cities
|
| 102 |
+
-- UK seed cities (tiers 1 and 2).
-- ON CONFLICT keeps the seed rerunnable: cities already present are skipped
-- instead of violating UNIQUE(country_code, city).
INSERT INTO territory_grid (country, country_code, city, tier, timezone) VALUES
    ('United Kingdom', 'GB', 'London',     1, 'Europe/London'),
    ('United Kingdom', 'GB', 'Manchester', 1, 'Europe/London'),
    ('United Kingdom', 'GB', 'Birmingham', 1, 'Europe/London'),
    ('United Kingdom', 'GB', 'Leeds',      2, 'Europe/London'),
    ('United Kingdom', 'GB', 'Edinburgh',  2, 'Europe/London'),
    ('United Kingdom', 'GB', 'Bristol',    2, 'Europe/London'),
    ('United Kingdom', 'GB', 'Glasgow',    2, 'Europe/London')
ON CONFLICT (country_code, city) DO NOTHING;
|
| 110 |
+
|
| 111 |
+
-- Australia cities
|
| 112 |
+
-- Australia seed cities (tiers 1 and 2).
-- ON CONFLICT keeps the seed rerunnable: cities already present are skipped
-- instead of violating UNIQUE(country_code, city).
INSERT INTO territory_grid (country, country_code, city, tier, timezone) VALUES
    ('Australia', 'AU', 'Sydney',    1, 'Australia/Sydney'),
    ('Australia', 'AU', 'Melbourne', 1, 'Australia/Melbourne'),
    ('Australia', 'AU', 'Brisbane',  1, 'Australia/Brisbane'),
    ('Australia', 'AU', 'Perth',     2, 'Australia/Perth'),
    ('Australia', 'AU', 'Adelaide',  2, 'Australia/Adelaide')
ON CONFLICT (country_code, city) DO NOTHING;
|
| 118 |
+
|
| 119 |
+
-- Gulf cities
|
| 120 |
+
-- Gulf seed cities (UAE, Saudi Arabia, Qatar).
-- ON CONFLICT keeps the seed rerunnable: cities already present are skipped
-- instead of violating UNIQUE(country_code, city).
INSERT INTO territory_grid (country, country_code, city, tier, timezone) VALUES
    ('United Arab Emirates', 'AE', 'Dubai',     1, 'Asia/Dubai'),
    ('United Arab Emirates', 'AE', 'Abu Dhabi', 1, 'Asia/Dubai'),
    ('Saudi Arabia',         'SA', 'Riyadh',    1, 'Asia/Riyadh'),
    ('Saudi Arabia',         'SA', 'Jeddah',    2, 'Asia/Riyadh'),
    ('Qatar',                'QA', 'Doha',      1, 'Asia/Qatar')
ON CONFLICT (country_code, city) DO NOTHING;
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
-- ─── DISCOVERY RUNS ──────────────────────────────────────────
|
| 129 |
+
-- Track every search execution — prevents duplicate searches
|
| 130 |
+
|
| 131 |
+
-- One row per discovery search execution, with funnel counters and outcome.
-- IF NOT EXISTS keeps this statement rerunnable, in line with the
-- ADD COLUMN IF NOT EXISTS statements used later in this migration.
CREATE TABLE IF NOT EXISTS discovery_runs (
    id                      UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    -- Enforce the documented vocabulary: 'auto' | 'manual'.
    run_type                TEXT NOT NULL CHECK (run_type IN ('auto', 'manual')),
    territory_id            UUID REFERENCES territory_grid(id),
    country_code            TEXT NOT NULL,
    city                    TEXT NOT NULL,
    industry                TEXT NOT NULL,
    search_queries          TEXT[],                   -- actual Google queries used
    companies_found         INTEGER DEFAULT 0,
    companies_passed_gate1  INTEGER DEFAULT 0,
    companies_passed_gate2  INTEGER DEFAULT 0,
    leads_qualified         INTEGER DEFAULT 0,
    quota_target            INTEGER DEFAULT 10,
    -- Enforce the documented vocabulary; NULL is still accepted because the
    -- column is nullable.
    status                  TEXT DEFAULT 'running'
                            CHECK (status IN ('running', 'completed', 'failed', 'partial')),
    triggered_by            TEXT DEFAULT 'system',    -- 'system' | 'slack:username'
    ran_at                  TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    completed_at            TIMESTAMPTZ
);
|
| 149 |
+
|
| 150 |
+
-- Index on (city, industry, ran_at DESC) for discovery_runs.
-- IF NOT EXISTS keeps the migration rerunnable.
CREATE INDEX IF NOT EXISTS idx_discovery_runs_territory
    ON discovery_runs (city, industry, ran_at DESC);
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
-- ─── TERRITORY PROGRESS ─────────────────────────────────────
|
| 154 |
+
-- Tracks which city+industry combos have been covered and when
|
| 155 |
+
|
| 156 |
+
-- Coverage state per (territory, industry): when it last ran and when it
-- becomes eligible again.
-- IF NOT EXISTS keeps this statement rerunnable, in line with the
-- ADD COLUMN IF NOT EXISTS statements used later in this migration.
CREATE TABLE IF NOT EXISTS territory_progress (
    id                UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    -- NOT NULL is required for the UNIQUE constraint below to work: NULLs are
    -- treated as distinct, so a nullable territory_id would permit duplicate
    -- rows for the same industry.
    territory_id      UUID NOT NULL REFERENCES territory_grid(id),
    industry          TEXT NOT NULL,
    last_run_at       TIMESTAMPTZ NOT NULL,
    next_eligible_at  TIMESTAMPTZ NOT NULL,  -- last_run + 30 days
    total_leads       INTEGER DEFAULT 0,

    UNIQUE (territory_id, industry)
);
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
-- ─── PIPELINE CHECKPOINTS ───────────────────────────────────
|
| 169 |
+
-- Allows pipeline to resume from failure point (idempotency)
|
| 170 |
+
|
| 171 |
+
-- Per-run, per-company checkpoint: the stage reached and the intermediate
-- data kept for a resume.
-- IF NOT EXISTS keeps this statement rerunnable, in line with the
-- ADD COLUMN IF NOT EXISTS statements used later in this migration.
-- NOTE(review): run_id is nullable, and UNIQUE treats NULLs as distinct, so
-- rows without a run_id are not deduplicated per company_domain. Confirm
-- whether checkpoints without a run are ever written.
CREATE TABLE IF NOT EXISTS pipeline_checkpoints (
    id              UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    run_id          UUID REFERENCES discovery_runs(id),
    company_domain  TEXT NOT NULL,
    stage           TEXT NOT NULL,       -- 'scraped','filtered','contacts_found','profiled','scored'
    stage_data      JSONB DEFAULT '{}',  -- intermediate data for resume
    completed       BOOLEAN DEFAULT false,
    created_at      TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    updated_at      TIMESTAMPTZ NOT NULL DEFAULT NOW(),

    UNIQUE (run_id, company_domain)
);
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
-- ─── LLM CALL TRACES ────────────────────────────────────────
|
| 186 |
+
-- Every LLM call logged for cost tracking and debugging
|
| 187 |
+
|
| 188 |
+
-- One row per LLM call: model, token counts, latency and outcome. Prompts
-- and outputs are stored only as hashes.
-- IF NOT EXISTS keeps this statement rerunnable, in line with the
-- ADD COLUMN IF NOT EXISTS statements used later in this migration.
CREATE TABLE IF NOT EXISTS llm_traces (
    id                 UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    trace_id           TEXT NOT NULL,     -- pipeline run trace
    operation          TEXT NOT NULL,     -- 'profile','score','email_classify','pain_detect'
    model              TEXT NOT NULL,     -- 'meta/llama-3.3-70b-instruct'
    provider           TEXT NOT NULL DEFAULT 'nvidia',
    prompt_tokens      INTEGER,
    completion_tokens  INTEGER,
    total_tokens       INTEGER,
    latency_ms         INTEGER,
    success            BOOLEAN NOT NULL,
    fallback_used      BOOLEAN DEFAULT false,
    -- NUMERIC(3,2) alone accepts values up to 9.99; the CHECK enforces the
    -- documented 0.00-1.00 range. NULL is still allowed.
    grounding_score    NUMERIC(3,2) CHECK (grounding_score BETWEEN 0 AND 1),
    company_id         UUID REFERENCES companies(id),
    input_hash         TEXT,              -- hash of prompt (no PII stored)
    output_hash        TEXT,              -- hash of output
    created_at         TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
|
| 206 |
+
|
| 207 |
+
-- Lookup indexes for llm_traces by pipeline trace and by company.
-- IF NOT EXISTS keeps the migration rerunnable.
CREATE INDEX IF NOT EXISTS idx_llm_traces_trace   ON llm_traces (trace_id);
CREATE INDEX IF NOT EXISTS idx_llm_traces_company ON llm_traces (company_id);
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
-- ─── SYSTEM CONFIG ───────────────────────────────────────────
|
| 212 |
+
-- Runtime configuration that Slack commands can modify
|
| 213 |
+
|
| 214 |
+
-- Key/value runtime configuration; each value is a JSON document.
-- IF NOT EXISTS keeps this statement rerunnable, in line with the
-- ADD COLUMN IF NOT EXISTS statements used later in this migration.
CREATE TABLE IF NOT EXISTS system_config (
    key         TEXT PRIMARY KEY,
    value       JSONB NOT NULL,
    updated_by  TEXT DEFAULT 'system',
    updated_at  TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
|
| 220 |
+
|
| 221 |
+
-- Seed the default runtime configuration.
-- ON CONFLICT DO NOTHING keeps the seed rerunnable and never overwrites a
-- value that has been changed since the first run.
INSERT INTO system_config (key, value) VALUES
    ('daily_quota', '{"default": 10, "today_override": null}'),
    ('schedule', '{"start_hour_utc": 4, "enabled": true}'),
    ('auto_mode', '{"enabled": true, "paused": false, "paused_by": null}'),
    ('current_territory', '{"country_code": "US", "city_index": 0, "industry_index": 0}')
ON CONFLICT (key) DO NOTHING;
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
-- ─── ADD SOCIAL PROFILES TO CONTACTS ─────────────────────────
|
| 229 |
+
|
| 230 |
+
-- Extend contacts with social-profile and email-verification columns.
-- Each ADD COLUMN is guarded by IF NOT EXISTS, so columns that are already
-- present are skipped.
ALTER TABLE contacts
    ADD COLUMN IF NOT EXISTS linkedin_personal_url      TEXT,
    ADD COLUMN IF NOT EXISTS social_profiles            JSONB DEFAULT '{}',
    ADD COLUMN IF NOT EXISTS email_verification_layers  JSONB DEFAULT '{}',
    ADD COLUMN IF NOT EXISTS email_tier                 TEXT,  -- 'personal','authority','context_verified','rejected'
    ADD COLUMN IF NOT EXISTS authority_confirmed        BOOLEAN DEFAULT false;
|
| 235 |
+
|
| 236 |
+
-- ─── ADD CITY, SERVICE MATCH, PAIN SIGNALS AND TRACE ID TO COMPANIES ───
|
| 237 |
+
|
| 238 |
+
-- Extend companies with city, service-match, pain-signal and trace columns.
-- Each ADD COLUMN is guarded by IF NOT EXISTS, so columns that are already
-- present are skipped.
ALTER TABLE companies
    ADD COLUMN IF NOT EXISTS city                 TEXT,
    ADD COLUMN IF NOT EXISTS service_match        TEXT,  -- matched service name
    ADD COLUMN IF NOT EXISTS service_match_score  INTEGER DEFAULT 0,
    ADD COLUMN IF NOT EXISTS pain_signals         TEXT[] DEFAULT '{}',
    ADD COLUMN IF NOT EXISTS trace_id             TEXT;  -- pipeline trace
|