Seth commited on
Commit
ced5eff
·
1 Parent(s): cb28f8c
backend/app/apollo_service.py CHANGED
@@ -1,449 +1,3 @@
1
- <<<<<<< HEAD
2
- """
3
- Apollo.io API service for creating contacts, enriching contact data, and adding them to sequences.
4
- Reference:
5
- - Create contact: https://docs.apollo.io/reference/create-a-contact
6
- - Add to sequence: https://docs.apollo.io/reference/add-contacts-to-sequence
7
- - Enrich person: https://docs.apollo.io/reference/enrich-people-data
8
- """
9
- import os
10
- import httpx
11
- from typing import Optional, Dict, Any
12
-
13
- APOLLO_API_KEY = os.environ.get("APOLLO_API_KEY", "")
14
- APOLLO_API_URL = "https://api.apollo.io/api/v1"
15
- APOLLO_TRIAL_LIST_NAME = "VPR TRIAL LEADS"
16
- # Allow list ID to be set directly via environment variable (more reliable than lookup)
17
- APOLLO_TRIAL_LIST_ID = os.environ.get("APOLLO_TRIAL_LIST_ID", None)
18
- # Sequence ID for adding contacts to email sequences (preferred over lists)
19
- APOLLO_TRIAL_SEQUENCE_ID = os.environ.get("APOLLO_TRIAL_SEQUENCE_ID", None)
20
-
21
-
22
- async def get_list_id(list_name: Optional[str] = None) -> Optional[str]:
23
- """
24
- Get Apollo list ID. First tries environment variable, then attempts API lookup.
25
-
26
- Args:
27
- list_name: Name of the list (for lookup if env var not set)
28
-
29
- Returns:
30
- List ID as string if found, None otherwise
31
- """
32
- # First, try to use the list ID from environment variable (most reliable)
33
- if APOLLO_TRIAL_LIST_ID:
34
- # Apollo list IDs are typically hexadecimal strings (MongoDB ObjectIds)
35
- # Accept them as strings, just strip whitespace
36
- list_id = str(APOLLO_TRIAL_LIST_ID).strip()
37
- if list_id:
38
- print(f"[INFO] Using Apollo list ID from environment variable: {list_id}")
39
- return list_id
40
- else:
41
- print(f"[WARNING] APOLLO_TRIAL_LIST_ID is empty")
42
-
43
- # If no env var, try to look up by name (this may not work if API endpoint is different)
44
- if not list_name or not APOLLO_API_KEY:
45
- return None
46
-
47
- # Note: The /lists endpoint may not be available in all Apollo API versions
48
- # Try alternative: search for lists using a different endpoint
49
- try:
50
- async with httpx.AsyncClient() as client:
51
- # Try the lists endpoint (may return 404 in some API versions)
52
- response = await client.get(
53
- f"{APOLLO_API_URL}/lists",
54
- headers={
55
- "Content-Type": "application/json",
56
- "Cache-Control": "no-cache",
57
- "X-Api-Key": APOLLO_API_KEY
58
- },
59
- timeout=10.0
60
- )
61
- if response.status_code == 200:
62
- data = response.json()
63
- lists = data.get("lists", [])
64
- for list_item in lists:
65
- if list_item.get("name") == list_name:
66
- list_id = list_item.get("id")
67
- print(f"[INFO] Found Apollo list '{list_name}' with ID: {list_id}")
68
- # Return as string (Apollo IDs are typically hex strings)
69
- return str(list_id) if list_id else None
70
- print(f"[WARNING] Apollo list '{list_name}' not found in available lists")
71
- else:
72
- print(f"[WARNING] Apollo lists endpoint returned {response.status_code}, cannot lookup list by name")
73
- except Exception as e:
74
- print(f"[WARNING] Failed to fetch Apollo list ID: {str(e)}")
75
-
76
- return None
77
-
78
-
79
- async def add_contact_to_sequence(contact_id: str, sequence_id: str) -> bool:
80
- """
81
- Add a contact to an Apollo.io email sequence.
82
-
83
- Args:
84
- contact_id: The Apollo contact ID
85
- sequence_id: The Apollo sequence ID
86
-
87
- Returns:
88
- True if contact was successfully added to sequence, False otherwise
89
- """
90
- if not APOLLO_API_KEY:
91
- print("[WARNING] APOLLO_API_KEY not set, skipping sequence enrollment")
92
- return False
93
-
94
- try:
95
- async with httpx.AsyncClient() as client:
96
- response = await client.post(
97
- f"{APOLLO_API_URL}/sequence_contacts",
98
- headers={
99
- "Content-Type": "application/json",
100
- "Cache-Control": "no-cache",
101
- "X-Api-Key": APOLLO_API_KEY
102
- },
103
- json={
104
- "sequence_id": sequence_id,
105
- "contact_id": contact_id
106
- },
107
- timeout=10.0
108
- )
109
-
110
- if response.status_code in [200, 201]:
111
- print(f"[INFO] Successfully added contact {contact_id} to sequence {sequence_id}")
112
- return True
113
- else:
114
- error_data = response.text
115
- print(f"[ERROR] Failed to add contact to sequence: {response.status_code} - {error_data}")
116
- return False
117
- except httpx.HTTPStatusError as e:
118
- print(f"[ERROR] Apollo API HTTP error adding to sequence: {e.response.status_code} - {e.response.text}")
119
- return False
120
- except Exception as e:
121
- print(f"[ERROR] Failed to add contact to sequence: {str(e)}")
122
- return False
123
-
124
-
125
- async def create_apollo_contact(
126
- email: str,
127
- first_name: Optional[str] = None,
128
- last_name: Optional[str] = None,
129
- organization_name: Optional[str] = None,
130
- title: Optional[str] = None,
131
- list_name: Optional[str] = None,
132
- sequence_id: Optional[str] = None
133
- ) -> bool:
134
- """
135
- Create a contact in Apollo.io and optionally add to a sequence or list.
136
-
137
- Args:
138
- email: Contact email address (required)
139
- first_name: Contact first name
140
- last_name: Contact last name
141
- organization_name: Organization name
142
- title: Job title
143
- list_name: Name of the list to add contact to (defaults to APOLLO_TRIAL_LIST_NAME)
144
- sequence_id: ID of the sequence to add contact to (preferred over list)
145
-
146
- Returns:
147
- True if contact created successfully, False otherwise
148
-
149
- Raises:
150
- ValueError: If APOLLO_API_KEY is not set
151
- """
152
- if not APOLLO_API_KEY:
153
- print("[WARNING] APOLLO_API_KEY not set, skipping Apollo contact creation")
154
- return False
155
-
156
- # Use default list name if not provided
157
- if list_name is None:
158
- list_name = APOLLO_TRIAL_LIST_NAME
159
-
160
- # Parse name if full name is provided but first/last are not
161
- if not first_name and not last_name:
162
- # Try to extract from email or use email prefix
163
- email_prefix = email.split('@')[0]
164
- if '.' in email_prefix:
165
- parts = email_prefix.split('.')
166
- first_name = parts[0].capitalize() if parts else None
167
- last_name = parts[1].capitalize() if len(parts) > 1 else None
168
- else:
169
- first_name = email_prefix.capitalize()
170
-
171
- # Extract organization domain from email
172
- organization_domain = None
173
- if '@' in email:
174
- organization_domain = email.split('@')[1]
175
-
176
- # Prepare contact data
177
- contact_data: Dict[str, Any] = {
178
- "email": email.lower(),
179
- "run_dedupe": True # Prevent duplicate contacts
180
- }
181
-
182
- if first_name:
183
- contact_data["first_name"] = first_name
184
- if last_name:
185
- contact_data["last_name"] = last_name
186
- if organization_name:
187
- contact_data["organization_name"] = organization_name
188
- if organization_domain:
189
- contact_data["organization_domain"] = organization_domain
190
- if title:
191
- contact_data["title"] = title
192
-
193
- try:
194
- async with httpx.AsyncClient() as client:
195
- # Get the list ID if list_name is provided
196
- list_ids = []
197
- target_list_id = None # Store for later use
198
- if list_name:
199
- list_id = await get_list_id(list_name)
200
- if list_id:
201
- target_list_id = list_id # Store for verification later
202
- # Apollo API accepts list_ids as an array of strings (hex IDs)
203
- list_ids = [str(list_id)]
204
- contact_data["list_ids"] = list_ids
205
- print(f"[INFO] Adding contact to list ID: {list_id}")
206
- else:
207
- print(f"[WARNING] Could not find list '{list_name}'. Set APOLLO_TRIAL_LIST_ID environment variable with the list ID, or create contact without list assignment")
208
-
209
- # Log the payload being sent (for debugging)
210
- print(f"[DEBUG] Creating Apollo contact with payload: {contact_data}")
211
-
212
- # Create the contact
213
- response = await client.post(
214
- f"{APOLLO_API_URL}/contacts",
215
- headers={
216
- "Content-Type": "application/json",
217
- "Cache-Control": "no-cache",
218
- "X-Api-Key": APOLLO_API_KEY
219
- },
220
- json=contact_data,
221
- timeout=10.0
222
- )
223
-
224
- # Log the full response for debugging
225
- print(f"[DEBUG] Apollo API response status: {response.status_code}")
226
- try:
227
- response_json = response.json()
228
- print(f"[DEBUG] Apollo API response (full): {response_json}")
229
- except:
230
- print(f"[DEBUG] Apollo API response body (text): {response.text[:1000]}") # First 1000 chars
231
-
232
- if response.status_code == 200 or response.status_code == 201:
233
- result = response.json()
234
- contact = result.get("contact", {})
235
- contact_id = contact.get("id")
236
- print(f"[INFO] Successfully created Apollo contact: {email} (ID: {contact_id})")
237
-
238
- # Priority: Add to sequence if sequence_id is provided (this is supported by API)
239
- target_sequence_id = sequence_id or APOLLO_TRIAL_SEQUENCE_ID
240
- if contact_id and target_sequence_id:
241
- print(f"[INFO] Adding contact to sequence: {target_sequence_id}")
242
- sequence_success = await add_contact_to_sequence(contact_id, target_sequence_id)
243
- if sequence_success:
244
- print(f"[INFO] ✓ Contact successfully enrolled in sequence")
245
- else:
246
- print(f"[WARNING] Failed to add contact to sequence, but contact was created")
247
-
248
- # Fallback: Try to add to list (API limitation - may not work)
249
- if list_ids and contact_id and target_list_id and not target_sequence_id:
250
- print(f"[INFO] Contact created with list_ids parameter: {list_ids}")
251
- print(f"[INFO] ⚠️ Apollo.io API Limitation: The API does not return list_ids in responses,")
252
- print(f"[INFO] so we cannot verify if the contact was added to the list via API.")
253
- print(f"[INFO] Please verify manually in Apollo.io that contact '{email}' is in list '{list_name or target_list_id}'")
254
- print(f"[INFO] Consider using sequences instead (APOLLO_TRIAL_SEQUENCE_ID) for better API support.")
255
-
256
- return True
257
- else:
258
- error_data = response.text
259
- print(f"[ERROR] Failed to create Apollo contact: {response.status_code} - {error_data}")
260
- return False
261
-
262
- except httpx.HTTPStatusError as e:
263
- print(f"[ERROR] Apollo API HTTP error: {e.response.status_code} - {e.response.text}")
264
- return False
265
- except Exception as e:
266
- print(f"[ERROR] Failed to create Apollo contact: {str(e)}")
267
- return False
268
-
269
-
270
- async def enrich_contact_by_email(email: str) -> Optional[Dict[str, Any]]:
271
- """
272
- Enrich contact data from Apollo.io using email address.
273
-
274
- Args:
275
- email: Contact email address
276
-
277
- Returns:
278
- Dictionary with enriched contact data, or None if not found
279
- """
280
- if not APOLLO_API_KEY:
281
- print("[WARNING] APOLLO_API_KEY not set, skipping Apollo enrichment")
282
- return None
283
-
284
- try:
285
- async with httpx.AsyncClient() as client:
286
- # Try people/match endpoint first (for exact email match)
287
- print(f"[DEBUG] Attempting Apollo.io enrichment for {email} via /people/match endpoint")
288
- response = await client.post(
289
- f"{APOLLO_API_URL}/people/match",
290
- headers={
291
- "Content-Type": "application/json",
292
- "Cache-Control": "no-cache",
293
- "X-Api-Key": APOLLO_API_KEY
294
- },
295
- json={
296
- "email": email.lower()
297
- # Note: reveal_phone_number requires webhook_url, so we skip it for now
298
- },
299
- timeout=10.0
300
- )
301
-
302
- print(f"[DEBUG] Apollo.io /people/match response status: {response.status_code}")
303
-
304
- if response.status_code == 200:
305
- data = response.json()
306
- print(f"[DEBUG] Apollo.io /people/match response data keys: {list(data.keys())}")
307
- person = data.get("person", {})
308
- if person:
309
- print(f"[DEBUG] Found person data in Apollo.io response")
310
- # Extract enriched data
311
- enriched_data = {
312
- "first_name": person.get("first_name"),
313
- "last_name": person.get("last_name"),
314
- "title": person.get("title"),
315
- "phone_number": person.get("phone_numbers", [{}])[0].get("raw_number") if person.get("phone_numbers") else None,
316
- "linkedin_url": person.get("linkedin_url"),
317
- "headline": person.get("headline"),
318
- "organization_name": person.get("organization", {}).get("name") if person.get("organization") else None,
319
- "organization_website": person.get("organization", {}).get("website_url") if person.get("organization") else None,
320
- "organization_address": None, # May need to parse from organization data
321
- }
322
-
323
- # Try to get organization address
324
- if person.get("organization"):
325
- org = person.get("organization", {})
326
- address_parts = []
327
- if org.get("street_address"):
328
- address_parts.append(org.get("street_address"))
329
- if org.get("city"):
330
- address_parts.append(org.get("city"))
331
- if org.get("state"):
332
- address_parts.append(org.get("state"))
333
- if org.get("postal_code"):
334
- address_parts.append(org.get("postal_code"))
335
- if org.get("country"):
336
- address_parts.append(org.get("country"))
337
- if address_parts:
338
- enriched_data["organization_address"] = ", ".join(address_parts)
339
-
340
- print(f"[INFO] Successfully enriched contact data for {email} from Apollo.io")
341
- return enriched_data
342
- else:
343
- print(f"[DEBUG] Apollo.io /people/match returned 200 but no person data found")
344
- elif response.status_code == 404:
345
- print(f"[DEBUG] Apollo.io /people/match returned 404 - contact not found in database")
346
- elif response.status_code == 401:
347
- print(f"[ERROR] Apollo.io API authentication failed - check your API key")
348
- try:
349
- error_data = response.json()
350
- print(f"[ERROR] Apollo.io error details: {error_data}")
351
- except:
352
- print(f"[ERROR] Apollo.io error response: {response.text}")
353
- else:
354
- print(f"[DEBUG] Apollo.io /people/match returned status {response.status_code}")
355
- try:
356
- error_data = response.json()
357
- print(f"[DEBUG] Apollo.io response: {error_data}")
358
- except:
359
- print(f"[DEBUG] Apollo.io response text: {response.text[:500]}")
360
-
361
- # If match fails, try the new search endpoint (api_search)
362
- print(f"[DEBUG] Attempting Apollo.io enrichment for {email} via /mixed_people/api_search endpoint")
363
- search_response = await client.post(
364
- f"{APOLLO_API_URL}/mixed_people/api_search",
365
- headers={
366
- "Content-Type": "application/json",
367
- "Cache-Control": "no-cache",
368
- "X-Api-Key": APOLLO_API_KEY
369
- },
370
- json={
371
- "email": email.lower(),
372
- "per_page": 1
373
- },
374
- timeout=10.0
375
- )
376
-
377
- print(f"[DEBUG] Apollo.io /mixed_people/api_search response status: {search_response.status_code}")
378
-
379
- if search_response.status_code == 200:
380
- search_data = search_response.json()
381
- print(f"[DEBUG] Apollo.io /mixed_people/api_search response data keys: {list(search_data.keys())}")
382
- people = search_data.get("people", [])
383
- print(f"[DEBUG] Found {len(people)} people in search results")
384
- if people:
385
- person = people[0]
386
- # Extract enriched data (same structure as above)
387
- enriched_data = {
388
- "first_name": person.get("first_name"),
389
- "last_name": person.get("last_name"),
390
- "title": person.get("title"),
391
- "phone_number": person.get("phone_numbers", [{}])[0].get("raw_number") if person.get("phone_numbers") else None,
392
- "linkedin_url": person.get("linkedin_url"),
393
- "headline": person.get("headline"),
394
- "organization_name": person.get("organization", {}).get("name") if person.get("organization") else None,
395
- "organization_website": person.get("organization", {}).get("website_url") if person.get("organization") else None,
396
- "organization_address": None,
397
- }
398
-
399
- if person.get("organization"):
400
- org = person.get("organization", {})
401
- address_parts = []
402
- if org.get("street_address"):
403
- address_parts.append(org.get("street_address"))
404
- if org.get("city"):
405
- address_parts.append(org.get("city"))
406
- if org.get("state"):
407
- address_parts.append(org.get("state"))
408
- if org.get("postal_code"):
409
- address_parts.append(org.get("postal_code"))
410
- if org.get("country"):
411
- address_parts.append(org.get("country"))
412
- if address_parts:
413
- enriched_data["organization_address"] = ", ".join(address_parts)
414
-
415
- print(f"[INFO] Successfully enriched contact data for {email} from Apollo.io (via search)")
416
- return enriched_data
417
- else:
418
- print(f"[DEBUG] Apollo.io /mixed_people/api_search returned 200 but no people in results")
419
- elif search_response.status_code == 404:
420
- print(f"[DEBUG] Apollo.io /mixed_people/api_search returned 404 - contact not found")
421
- elif search_response.status_code == 401:
422
- print(f"[ERROR] Apollo.io API authentication failed on search - check your API key")
423
- try:
424
- error_data = search_response.json()
425
- print(f"[ERROR] Apollo.io search error details: {error_data}")
426
- except:
427
- print(f"[ERROR] Apollo.io search error response: {search_response.text}")
428
- else:
429
- print(f"[DEBUG] Apollo.io /mixed_people/api_search returned status {search_response.status_code}")
430
- try:
431
- error_data = search_response.json()
432
- print(f"[DEBUG] Apollo.io search response: {error_data}")
433
- except:
434
- print(f"[DEBUG] Apollo.io search response text: {search_response.text[:500]}")
435
-
436
- print(f"[INFO] No contact data found in Apollo.io for {email} - contact may not exist in Apollo's database")
437
- return None
438
-
439
- except httpx.HTTPStatusError as e:
440
- print(f"[ERROR] Apollo API HTTP error during enrichment: {e.response.status_code} - {e.response.text}")
441
- return None
442
- except Exception as e:
443
- print(f"[ERROR] Failed to enrich contact from Apollo.io: {str(e)}")
444
- return None
445
-
446
- =======
447
  """
448
  Apollo.io API service for creating contacts, enriching contact data, and adding them to sequences.
449
  Reference:
@@ -887,5 +441,3 @@ async def enrich_contact_by_email(email: str) -> Optional[Dict[str, Any]]:
887
  except Exception as e:
888
  print(f"[ERROR] Failed to enrich contact from Apollo.io: {str(e)}")
889
  return None
890
-
891
- >>>>>>> daae7a900bd14d0802e4f04b99edb85493053f1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
  Apollo.io API service for creating contacts, enriching contact data, and adding them to sequences.
3
  Reference:
 
441
  except Exception as e:
442
  print(f"[ERROR] Failed to enrich contact from Apollo.io: {str(e)}")
443
  return None
 
 
backend/app/auth.py CHANGED
@@ -1,97 +1,3 @@
1
- <<<<<<< HEAD
2
- import os
3
- import jwt
4
- from datetime import datetime, timedelta
5
- from typing import Optional
6
- from fastapi import Depends, HTTPException, status
7
- from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
8
- from sqlalchemy.orm import Session
9
- from .db import SessionLocal
10
- from .models import User
11
-
12
- # JWT Configuration
13
- SECRET_KEY = os.environ.get("JWT_SECRET_KEY", "your-secret-key-change-in-production")
14
- ALGORITHM = "HS256"
15
- ACCESS_TOKEN_EXPIRE_MINUTES = 60 * 24 * 7 # 7 days
16
-
17
- security = HTTPBearer()
18
-
19
-
20
- def get_db():
21
- """Database dependency."""
22
- db = SessionLocal()
23
- try:
24
- yield db
25
- finally:
26
- db.close()
27
-
28
-
29
- def create_access_token(data: dict, expires_delta: Optional[timedelta] = None):
30
- """Create a JWT access token."""
31
- to_encode = data.copy()
32
- # Ensure 'sub' (subject) is a string, not an integer
33
- if "sub" in to_encode:
34
- to_encode["sub"] = str(to_encode["sub"])
35
- if expires_delta:
36
- expire = datetime.utcnow() + expires_delta
37
- else:
38
- expire = datetime.utcnow() + timedelta(minutes=ACCESS_TOKEN_EXPIRE_MINUTES)
39
- to_encode.update({"exp": expire})
40
- encoded_jwt = jwt.encode(to_encode, SECRET_KEY, algorithm=ALGORITHM)
41
- return encoded_jwt
42
-
43
-
44
- def verify_token(token: str) -> dict:
45
- """Verify and decode a JWT token."""
46
- try:
47
- payload = jwt.decode(token, SECRET_KEY, algorithms=[ALGORITHM])
48
- return payload
49
- except jwt.ExpiredSignatureError:
50
- raise HTTPException(
51
- status_code=status.HTTP_401_UNAUTHORIZED,
52
- detail="Token has expired",
53
- )
54
- except jwt.InvalidTokenError:
55
- raise HTTPException(
56
- status_code=status.HTTP_401_UNAUTHORIZED,
57
- detail="Could not validate credentials",
58
- )
59
-
60
-
61
- def get_current_user(
62
- credentials: HTTPAuthorizationCredentials = Depends(security),
63
- db: Session = Depends(get_db)
64
- ) -> User:
65
- """Get the current authenticated user from JWT token."""
66
- token = credentials.credentials
67
- payload = verify_token(token)
68
- user_id: int = payload.get("sub")
69
-
70
- if user_id is None:
71
- raise HTTPException(
72
- status_code=status.HTTP_401_UNAUTHORIZED,
73
- detail="Could not validate credentials",
74
- )
75
-
76
- # Convert user_id back to integer for database query
77
- try:
78
- user_id_int = int(user_id)
79
- except (ValueError, TypeError):
80
- raise HTTPException(
81
- status_code=status.HTTP_401_UNAUTHORIZED,
82
- detail="Invalid user ID in token",
83
- )
84
-
85
- user = db.query(User).filter(User.id == user_id_int).first()
86
- if user is None:
87
- raise HTTPException(
88
- status_code=status.HTTP_401_UNAUTHORIZED,
89
- detail="User not found",
90
- )
91
-
92
- return user
93
-
94
- =======
95
  import os
96
  import jwt
97
  from datetime import datetime, timedelta
@@ -183,5 +89,3 @@ def get_current_user(
183
  )
184
 
185
  return user
186
-
187
- >>>>>>> daae7a900bd14d0802e4f04b99edb85493053f1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import jwt
3
  from datetime import datetime, timedelta
 
89
  )
90
 
91
  return user
 
 
backend/app/auth_routes.py CHANGED
@@ -1,4 +1,3 @@
1
- <<<<<<< HEAD
2
  import os
3
  from fastapi import APIRouter, Depends, HTTPException, Body
4
  from pydantic import BaseModel, EmailStr
@@ -241,6 +240,7 @@ async def get_current_user_info(current_user: User = Depends(get_current_user)):
241
  }
242
 
243
 
 
244
  @router.post("/api/auth/api-key/create")
245
  async def create_api_key(
246
  request: CreateAPIKeyRequest,
@@ -345,242 +345,3 @@ async def delete_api_key(
345
  "success": True,
346
  "message": "API key deactivated successfully"
347
  }
348
-
349
- =======
350
- import os
351
- from fastapi import APIRouter, Depends, HTTPException, Body
352
- from pydantic import BaseModel, EmailStr
353
- from sqlalchemy.orm import Session
354
- from .models import User
355
- from .auth import create_access_token, get_current_user
356
- from .firebase_auth import verify_firebase_token
357
- from .otp_service import request_otp, verify_otp
358
- from .email_validator import validate_business_email, is_business_email
359
- from .db import SessionLocal
360
-
361
- def get_db():
362
- """Database dependency."""
363
- db = SessionLocal()
364
- try:
365
- yield db
366
- finally:
367
- db.close()
368
-
369
- router = APIRouter()
370
-
371
-
372
- class FirebaseLoginRequest(BaseModel):
373
- id_token: str
374
-
375
-
376
- class OTPRequestRequest(BaseModel):
377
- email: EmailStr
378
-
379
-
380
- class OTPVerifyRequest(BaseModel):
381
- email: EmailStr
382
- otp: str
383
-
384
-
385
- @router.post("/api/auth/firebase/login")
386
- async def firebase_login(
387
- request: FirebaseLoginRequest,
388
- db: Session = Depends(get_db)
389
- ):
390
- """
391
- Login with Firebase ID token.
392
- Validates business email and creates/updates user.
393
- """
394
- try:
395
- # Verify Firebase token
396
- user_info = await verify_firebase_token(request.id_token)
397
- email = user_info.get('email')
398
-
399
- if not email:
400
- raise HTTPException(status_code=400, detail="Email not found in Firebase token")
401
-
402
- # Validate business email
403
- if not is_business_email(email):
404
- raise HTTPException(
405
- status_code=400,
406
- detail="Only business email addresses are allowed. Personal email accounts (Gmail, Yahoo, Outlook, etc.) are not permitted. Please use your work email address."
407
- )
408
-
409
- # Get or create user
410
- user = db.query(User).filter(
411
- (User.email == email.lower()) | (User.firebase_uid == user_info['uid'])
412
- ).first()
413
-
414
- if not user:
415
- user = User(
416
- email=email.lower(),
417
- name=user_info.get('name'),
418
- picture=user_info.get('picture'),
419
- firebase_uid=user_info['uid'],
420
- auth_method='firebase',
421
- email_verified=True
422
- )
423
- db.add(user)
424
- db.commit()
425
- db.refresh(user)
426
- print(f"[INFO] New user created via Firebase: {email}")
427
-
428
- # Enrich contact data from Apollo.io and update Brevo + Monday.com
429
- try:
430
- from .apollo_service import enrich_contact_by_email
431
- from .brevo_service import create_brevo_contact, BREVO_TRIAL_LIST_ID
432
- from .monday_service import create_monday_lead
433
-
434
- # Enrich contact data from Apollo.io
435
- enriched_data = await enrich_contact_by_email(email)
436
-
437
- # Use enriched data if available, otherwise use basic data
438
- first_name = enriched_data.get("first_name") if enriched_data else None
439
- last_name = enriched_data.get("last_name") if enriched_data else None
440
- org_name = enriched_data.get("organization_name") if enriched_data else None
441
-
442
- # Fallback to Firebase data if Apollo didn't provide it
443
- if not first_name or not last_name:
444
- full_name = user_info.get('name', '')
445
- if full_name:
446
- name_parts = full_name.strip().split(' ', 1)
447
- first_name = first_name or (name_parts[0] if name_parts else None)
448
- last_name = last_name or (name_parts[1] if len(name_parts) > 1 else None)
449
-
450
- if not org_name:
451
- org_domain = email.split('@')[1] if '@' in email else None
452
- org_name = org_domain.split('.')[0].capitalize() if org_domain else None
453
-
454
- # Update Brevo contact with enriched data
455
- await create_brevo_contact(
456
- email=email,
457
- first_name=first_name,
458
- last_name=last_name,
459
- organization_name=org_name or (enriched_data.get("organization_name") if enriched_data else None),
460
- phone_number=enriched_data.get("phone_number") if enriched_data else None,
461
- linkedin_url=enriched_data.get("linkedin_url") if enriched_data else None,
462
- title=enriched_data.get("title") if enriched_data else None,
463
- headline=enriched_data.get("headline") if enriched_data else None,
464
- organization_website=enriched_data.get("organization_website") if enriched_data else None,
465
- organization_address=enriched_data.get("organization_address") if enriched_data else None,
466
- list_id=BREVO_TRIAL_LIST_ID
467
- )
468
-
469
- # Create lead in Monday.com
470
- await create_monday_lead(
471
- email=email,
472
- first_name=first_name,
473
- last_name=last_name,
474
- phone_number=enriched_data.get("phone_number") if enriched_data else None,
475
- linkedin_url=enriched_data.get("linkedin_url") if enriched_data else None,
476
- title=enriched_data.get("title") if enriched_data else None,
477
- headline=enriched_data.get("headline") if enriched_data else None,
478
- organization_name=org_name or (enriched_data.get("organization_name") if enriched_data else None),
479
- organization_website=enriched_data.get("organization_website") if enriched_data else None,
480
- organization_address=enriched_data.get("organization_address") if enriched_data else None,
481
- )
482
- except Exception as e:
483
- # Don't fail user creation if integrations fail
484
- print(f"[WARNING] Failed to enrich/update contact for {email}: {str(e)}")
485
- else:
486
- # Update user info
487
- user.firebase_uid = user_info['uid']
488
- user.email_verified = True
489
- user.name = user_info.get('name', user.name)
490
- user.picture = user_info.get('picture', user.picture)
491
- if user.auth_method != 'firebase':
492
- user.auth_method = 'firebase'
493
- db.commit()
494
- print(f"[INFO] User logged in via Firebase: {email}")
495
-
496
- # Generate JWT token
497
- token = create_access_token(data={"sub": user.id})
498
-
499
- return {
500
- "token": token,
501
- "user": {
502
- "id": user.id,
503
- "email": user.email,
504
- "name": user.name,
505
- "picture": user.picture,
506
- "auth_method": user.auth_method
507
- }
508
- }
509
- except HTTPException:
510
- raise
511
- except Exception as e:
512
- print(f"[ERROR] Firebase login failed: {str(e)}")
513
- raise HTTPException(status_code=400, detail=f"Authentication failed: {str(e)}")
514
-
515
-
516
- @router.post("/api/auth/otp/request")
517
- async def request_otp_endpoint(
518
- request: OTPRequestRequest,
519
- db: Session = Depends(get_db)
520
- ):
521
- """
522
- Request OTP for email login.
523
- Validates business email before sending OTP.
524
- """
525
- try:
526
- # Validate business email
527
- validate_business_email(request.email)
528
-
529
- # Request OTP
530
- result = await request_otp(request.email, db)
531
- return result
532
- except HTTPException:
533
- raise
534
- except Exception as e:
535
- print(f"[ERROR] OTP request failed: {str(e)}")
536
- raise HTTPException(status_code=500, detail=f"Failed to send OTP: {str(e)}")
537
-
538
-
539
- @router.post("/api/auth/otp/verify")
540
- async def verify_otp_endpoint(
541
- request: OTPVerifyRequest,
542
- db: Session = Depends(get_db)
543
- ):
544
- """
545
- Verify OTP and login.
546
- Validates business email and OTP code.
547
- """
548
- try:
549
- # Validate business email
550
- validate_business_email(request.email)
551
-
552
- # Verify OTP
553
- user = await verify_otp(request.email, request.otp, db)
554
-
555
- # Generate JWT token
556
- token = create_access_token(data={"sub": user.id})
557
-
558
- return {
559
- "token": token,
560
- "user": {
561
- "id": user.id,
562
- "email": user.email,
563
- "name": user.name,
564
- "picture": user.picture,
565
- "auth_method": user.auth_method
566
- }
567
- }
568
- except HTTPException:
569
- raise
570
- except Exception as e:
571
- print(f"[ERROR] OTP verification failed: {str(e)}")
572
- raise HTTPException(status_code=400, detail=f"OTP verification failed: {str(e)}")
573
-
574
-
575
- @router.get("/api/auth/me")
576
- async def get_current_user_info(current_user: User = Depends(get_current_user)):
577
- """Get current user information."""
578
- return {
579
- "id": current_user.id,
580
- "email": current_user.email,
581
- "name": current_user.name,
582
- "picture": current_user.picture,
583
- "auth_method": current_user.auth_method,
584
- }
585
-
586
- >>>>>>> daae7a900bd14d0802e4f04b99edb85493053f1d
 
 
1
  import os
2
  from fastapi import APIRouter, Depends, HTTPException, Body
3
  from pydantic import BaseModel, EmailStr
 
240
  }
241
 
242
 
243
+ # API Key Management Endpoints (newly added for external API access)
244
  @router.post("/api/auth/api-key/create")
245
  async def create_api_key(
246
  request: CreateAPIKeyRequest,
 
345
  "success": True,
346
  "message": "API key deactivated successfully"
347
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/email_validator.py CHANGED
@@ -1,66 +1,3 @@
1
- <<<<<<< HEAD
2
- """
3
- Email validation utilities to ensure only business emails are allowed.
4
- """
5
- from fastapi import HTTPException
6
-
7
- # List of personal email domains to block
8
- PERSONAL_EMAIL_DOMAINS = {
9
- 'gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com',
10
- 'aol.com', 'icloud.com', 'mail.com', 'protonmail.com',
11
- 'yandex.com', 'zoho.com', 'gmx.com', 'live.com', 'msn.com',
12
- 'me.com', 'mac.com', 'yahoo.co.uk', 'yahoo.co.jp', 'yahoo.fr',
13
- 'yahoo.de', 'yahoo.it', 'yahoo.es', 'yahoo.in', 'yahoo.com.au',
14
- 'gmail.co.uk', 'gmail.fr', 'gmail.de', 'gmail.it', 'gmail.es',
15
- 'gmail.in', 'gmail.com.au', 'hotmail.co.uk', 'hotmail.fr',
16
- 'hotmail.de', 'hotmail.it', 'hotmail.es', 'outlook.co.uk',
17
- 'outlook.fr', 'outlook.de', 'outlook.it', 'outlook.es',
18
- 'rediffmail.com', 'sina.com', 'qq.com', '163.com', '126.com',
19
- 'mail.ru', 'inbox.com', 'fastmail.com', 'tutanota.com',
20
- 'hey.com', 'pm.me'
21
- }
22
-
23
-
24
- def is_business_email(email: str) -> bool:
25
- """
26
- Check if email is a business email (not personal).
27
-
28
- Args:
29
- email: Email address to validate
30
-
31
- Returns:
32
- True if business email, False if personal email
33
- """
34
- if not email or '@' not in email:
35
- return False
36
-
37
- domain = email.split('@')[1].lower().strip()
38
- return domain not in PERSONAL_EMAIL_DOMAINS
39
-
40
-
41
- def validate_business_email(email: str) -> None:
42
- """
43
- Raise exception if email is not a business email.
44
-
45
- Args:
46
- email: Email address to validate
47
-
48
- Raises:
49
- HTTPException: If email is a personal email domain
50
- """
51
- if not email:
52
- raise HTTPException(
53
- status_code=400,
54
- detail="Email address is required"
55
- )
56
-
57
- if not is_business_email(email):
58
- raise HTTPException(
59
- status_code=400,
60
- detail="Only business email addresses are allowed. Personal email accounts (Gmail, Yahoo, Outlook, etc.) are not permitted. Please use your work email address."
61
- )
62
-
63
- =======
64
  """
65
  Email validation utilities to ensure only business emails are allowed.
66
  """
@@ -121,5 +58,3 @@ def validate_business_email(email: str) -> None:
121
  status_code=400,
122
  detail="Only business email addresses are allowed. Personal email accounts (Gmail, Yahoo, Outlook, etc.) are not permitted. Please use your work email address."
123
  )
124
-
125
- >>>>>>> daae7a900bd14d0802e4f04b99edb85493053f1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
  Email validation utilities to ensure only business emails are allowed.
3
  """
 
58
  status_code=400,
59
  detail="Only business email addresses are allowed. Personal email accounts (Gmail, Yahoo, Outlook, etc.) are not permitted. Please use your work email address."
60
  )
 
 
backend/app/firebase_auth.py CHANGED
@@ -1,97 +1,3 @@
1
- <<<<<<< HEAD
2
- """
3
- Firebase Authentication utilities.
4
- """
5
- import os
6
- import json
7
- import firebase_admin
8
- from firebase_admin import auth, credentials
9
- from fastapi import HTTPException
10
-
11
- # Initialize Firebase Admin SDK
12
- _firebase_initialized = False
13
-
14
- def initialize_firebase():
15
- """Initialize Firebase Admin SDK."""
16
- global _firebase_initialized
17
-
18
- if _firebase_initialized:
19
- return
20
-
21
- if not firebase_admin._apps:
22
- # Try to get service account from environment variable (JSON string)
23
- service_account_json = os.environ.get("FIREBASE_SERVICE_ACCOUNT_JSON")
24
-
25
- if service_account_json:
26
- try:
27
- service_account_info = json.loads(service_account_json)
28
- cred = credentials.Certificate(service_account_info)
29
- firebase_admin.initialize_app(cred)
30
- _firebase_initialized = True
31
- print("[INFO] Firebase Admin SDK initialized from environment variable")
32
- return
33
- except json.JSONDecodeError:
34
- print("[WARNING] Failed to parse FIREBASE_SERVICE_ACCOUNT_JSON")
35
-
36
- # Try to get service account from file path
37
- service_account_path = os.environ.get("FIREBASE_SERVICE_ACCOUNT_KEY")
38
- if service_account_path and os.path.exists(service_account_path):
39
- cred = credentials.Certificate(service_account_path)
40
- firebase_admin.initialize_app(cred)
41
- _firebase_initialized = True
42
- print(f"[INFO] Firebase Admin SDK initialized from file: {service_account_path}")
43
- return
44
-
45
- # Try to use default credentials (for Google Cloud environments)
46
- try:
47
- firebase_admin.initialize_app()
48
- _firebase_initialized = True
49
- print("[INFO] Firebase Admin SDK initialized with default credentials")
50
- return
51
- except Exception as e:
52
- print(f"[WARNING] Firebase initialization failed: {e}")
53
- raise HTTPException(
54
- status_code=500,
55
- detail="Firebase not configured. Please set FIREBASE_SERVICE_ACCOUNT_JSON or FIREBASE_SERVICE_ACCOUNT_KEY environment variable."
56
- )
57
-
58
-
59
- async def verify_firebase_token(id_token: str) -> dict:
60
- """
61
- Verify Firebase ID token and return user info.
62
-
63
- Args:
64
- id_token: Firebase ID token from client
65
-
66
- Returns:
67
- Dictionary with user information (uid, email, name, picture)
68
-
69
- Raises:
70
- HTTPException: If token is invalid
71
- """
72
- initialize_firebase()
73
-
74
- try:
75
- decoded_token = auth.verify_id_token(id_token)
76
-
77
- return {
78
- 'uid': decoded_token['uid'],
79
- 'email': decoded_token.get('email'),
80
- 'name': decoded_token.get('name'),
81
- 'picture': decoded_token.get('picture'),
82
- }
83
- except ValueError as e:
84
- raise HTTPException(
85
- status_code=401,
86
- detail=f"Invalid Firebase token: {str(e)}"
87
- )
88
- except Exception as e:
89
- raise HTTPException(
90
- status_code=401,
91
- detail=f"Firebase authentication failed: {str(e)}"
92
- )
93
-
94
- =======
95
  """
96
  Firebase Authentication utilities.
97
  """
@@ -183,5 +89,3 @@ async def verify_firebase_token(id_token: str) -> dict:
183
  status_code=401,
184
  detail=f"Firebase authentication failed: {str(e)}"
185
  )
186
-
187
- >>>>>>> daae7a900bd14d0802e4f04b99edb85493053f1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
  Firebase Authentication utilities.
3
  """
 
89
  status_code=401,
90
  detail=f"Firebase authentication failed: {str(e)}"
91
  )
 
 
backend/app/models.py CHANGED
@@ -1,4 +1,3 @@
1
- <<<<<<< HEAD
2
  from sqlalchemy import Column, Integer, String, Float, DateTime, Text, ForeignKey, Boolean
3
  from sqlalchemy.orm import relationship
4
  from sqlalchemy.sql import func
@@ -39,7 +38,7 @@ class User(Base):
39
  primaryjoin="User.id == ExtractionRecord.user_id"
40
  )
41
 
42
- # Relationship to API keys
43
  api_keys = relationship(
44
  "APIKey",
45
  back_populates="user",
@@ -135,108 +134,3 @@ class APIKey(Base):
135
  "User",
136
  back_populates="api_keys"
137
  )
138
- =======
139
- from sqlalchemy import Column, Integer, String, Float, DateTime, Text, ForeignKey, Boolean
140
- from sqlalchemy.orm import relationship
141
- from sqlalchemy.sql import func
142
-
143
- from .db import Base
144
-
145
-
146
- class User(Base):
147
- """
148
- Stores user information from Firebase or OTP authentication.
149
- """
150
- __tablename__ = "users"
151
-
152
- id = Column(Integer, primary_key=True, index=True)
153
- email = Column(String, unique=True, index=True, nullable=False)
154
- name = Column(String, nullable=True)
155
- picture = Column(String, nullable=True)
156
-
157
- # Auth method: 'firebase' or 'otp'
158
- auth_method = Column(String, default='firebase')
159
-
160
- # Firebase-specific
161
- firebase_uid = Column(String, unique=True, index=True, nullable=True)
162
-
163
- # OTP-specific
164
- email_verified = Column(Boolean, default=False)
165
-
166
- created_at = Column(
167
- DateTime(timezone=True),
168
- server_default=func.now(),
169
- )
170
-
171
- # Relationship to extraction records (explicitly specify user_id as the foreign key)
172
- # Note: primaryjoin must be specified because ExtractionRecord has multiple foreign keys to User
173
- extractions = relationship(
174
- "ExtractionRecord",
175
- back_populates="user",
176
- primaryjoin="User.id == ExtractionRecord.user_id"
177
- )
178
-
179
-
180
- class ExtractionRecord(Base):
181
- """
182
- Stores one extraction run so the History page can show past jobs.
183
- We'll fill it from the /api/extract endpoint later.
184
- """
185
-
186
- __tablename__ = "extractions"
187
-
188
- id = Column(Integer, primary_key=True, index=True)
189
- user_id = Column(Integer, ForeignKey("users.id"), nullable=False, index=True)
190
-
191
- file_name = Column(String, index=True)
192
- file_type = Column(String)
193
- file_size = Column(String)
194
-
195
- status = Column(String) # "completed" | "failed"
196
- confidence = Column(Float) # overall confidence (0–100)
197
- fields_extracted = Column(Integer) # number of fields extracted
198
- total_time_ms = Column(Integer) # total processing time in ms
199
-
200
- raw_output = Column(Text) # JSON string from the model
201
- file_base64 = Column(Text, nullable=True) # Base64 encoded original file for preview
202
- error_message = Column(Text, nullable=True)
203
-
204
- created_at = Column(
205
- DateTime(timezone=True),
206
- server_default=func.now(),
207
- )
208
-
209
- # Relationship to user (explicitly specify user_id as the foreign key)
210
- # Note: primaryjoin must be specified because ExtractionRecord has multiple foreign keys to User
211
- user = relationship(
212
- "User",
213
- back_populates="extractions",
214
- primaryjoin="ExtractionRecord.user_id == User.id"
215
- )
216
-
217
- # Track if this extraction was shared (original extraction ID)
218
- shared_from_extraction_id = Column(Integer, ForeignKey("extractions.id"), nullable=True, index=True)
219
- shared_by_user_id = Column(Integer, ForeignKey("users.id"), nullable=True, index=True)
220
-
221
-
222
- class ShareToken(Base):
223
- """
224
- Stores share tokens for sharing extractions with other users.
225
- """
226
- __tablename__ = "share_tokens"
227
-
228
- id = Column(Integer, primary_key=True, index=True)
229
- token = Column(String, unique=True, index=True, nullable=False) # Unique share token
230
- extraction_id = Column(Integer, ForeignKey("extractions.id"), nullable=False, index=True)
231
- sender_user_id = Column(Integer, ForeignKey("users.id"), nullable=False, index=True)
232
- recipient_email = Column(String, nullable=True, index=True) # Nullable for public share links
233
- expires_at = Column(DateTime(timezone=True), nullable=True) # Optional expiration
234
- accessed = Column(Boolean, default=False) # Track if link was accessed
235
- accessed_at = Column(DateTime(timezone=True), nullable=True)
236
- accessed_by_user_id = Column(Integer, ForeignKey("users.id"), nullable=True)
237
-
238
- created_at = Column(
239
- DateTime(timezone=True),
240
- server_default=func.now(),
241
- )
242
- >>>>>>> daae7a900bd14d0802e4f04b99edb85493053f1d
 
 
1
  from sqlalchemy import Column, Integer, String, Float, DateTime, Text, ForeignKey, Boolean
2
  from sqlalchemy.orm import relationship
3
  from sqlalchemy.sql import func
 
38
  primaryjoin="User.id == ExtractionRecord.user_id"
39
  )
40
 
41
+ # Relationship to API keys (newly added for API key authentication)
42
  api_keys = relationship(
43
  "APIKey",
44
  back_populates="user",
 
134
  "User",
135
  back_populates="api_keys"
136
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/app/monday_service.py CHANGED
@@ -1,396 +1,3 @@
1
- <<<<<<< HEAD
2
- """
3
- Monday.com API service for creating leads with automatic field matching.
4
- Reference: https://developer.monday.com/api-reference/docs
5
- """
6
- import os
7
- import httpx
8
- import json
9
- from typing import Optional, Dict, Any, List, Tuple
10
- from difflib import SequenceMatcher
11
-
12
- MONDAY_API_KEY = os.environ.get("MONDAY_API_KEY", "")
13
- MONDAY_API_URL = "https://api.monday.com/v2"
14
- MONDAY_BOARD_ID = os.environ.get("MONDAY_BOARD_ID", None) # Your "New Leads" board ID
15
-
16
- # Cache for board columns to avoid repeated API calls
17
- _board_columns_cache: Dict[str, List[Dict[str, Any]]] = {}
18
-
19
-
20
- def _calculate_similarity(str1: str, str2: str) -> float:
21
- """
22
- Calculate similarity between two strings using SequenceMatcher.
23
- Returns a value between 0.0 and 1.0.
24
- """
25
- return SequenceMatcher(None, str1.lower(), str2.lower()).ratio()
26
-
27
-
28
- def _find_best_column_match(
29
- field_name: str,
30
- available_columns: List[Dict[str, Any]],
31
- min_similarity: float = 0.3
32
- ) -> Optional[Tuple[str, str, float]]:
33
- """
34
- Find the best matching column for a field name using semantic similarity.
35
-
36
- Args:
37
- field_name: The field name to match (e.g., "first_name", "email")
38
- available_columns: List of column dicts with 'id' and 'title' keys
39
- min_similarity: Minimum similarity threshold (0.0 to 1.0)
40
-
41
- Returns:
42
- Tuple of (column_id, column_title, similarity_score) or None if no match found
43
- """
44
- best_match = None
45
- best_score = 0.0
46
-
47
- # Normalize field name for matching
48
- normalized_field = field_name.lower().replace("_", " ").replace("-", " ")
49
-
50
- # Common field name variations
51
- field_variations = [
52
- normalized_field,
53
- field_name.lower(),
54
- field_name.replace("_", ""),
55
- ]
56
-
57
- # Add common synonyms
58
- synonyms = {
59
- "first_name": ["first name", "firstname", "fname", "given name"],
60
- "last_name": ["last name", "lastname", "lname", "surname", "family name"],
61
- "email": ["email address", "email", "e-mail", "mail"],
62
- "phone_number": ["phone", "phone number", "telephone", "mobile", "cell"],
63
- "linkedin_url": ["linkedin", "linkedin profile", "linkedin url", "linkedin link"],
64
- "title": ["job title", "position", "role", "job"],
65
- "headline": ["headline", "tagline", "bio"],
66
- "organization_name": ["company", "organization", "org", "company name", "employer"],
67
- "organization_website": ["website", "company website", "url", "web"],
68
- "organization_address": ["address", "company address", "location"],
69
- }
70
-
71
- if field_name in synonyms:
72
- field_variations.extend(synonyms[field_name])
73
-
74
- for column in available_columns:
75
- column_title = column.get("title", "").lower()
76
- column_id = column.get("id", "")
77
-
78
- if not column_title or not column_id:
79
- continue
80
-
81
- # Calculate similarity for each variation
82
- for variation in field_variations:
83
- score = _calculate_similarity(variation, column_title)
84
- if score > best_score:
85
- best_score = score
86
- best_match = (column_id, column.get("title", ""), score)
87
-
88
- if best_match and best_score >= min_similarity:
89
- return best_match
90
- return None
91
-
92
-
93
- async def _get_board_columns(board_id: str) -> List[Dict[str, Any]]:
94
- """
95
- Fetch board columns from Monday.com API.
96
-
97
- Args:
98
- board_id: Monday.com board ID
99
-
100
- Returns:
101
- List of column dictionaries with 'id', 'title', and 'type' keys
102
- """
103
- # Check cache first
104
- if board_id in _board_columns_cache:
105
- return _board_columns_cache[board_id]
106
-
107
- if not MONDAY_API_KEY:
108
- print("[WARNING] MONDAY_API_KEY not set, cannot fetch board columns")
109
- return []
110
-
111
- query = """
112
- query ($boardId: ID!) {
113
- boards(ids: [$boardId]) {
114
- columns {
115
- id
116
- title
117
- type
118
- }
119
- }
120
- }
121
- """
122
-
123
- headers = {
124
- "Authorization": MONDAY_API_KEY,
125
- "Content-Type": "application/json"
126
- }
127
-
128
- try:
129
- async with httpx.AsyncClient(timeout=30.0) as client:
130
- response = await client.post(
131
- MONDAY_API_URL,
132
- json={
133
- "query": query,
134
- "variables": {"boardId": board_id}
135
- },
136
- headers=headers
137
- )
138
-
139
- if response.status_code == 200:
140
- result = response.json()
141
- if result.get("data") and result["data"].get("boards"):
142
- boards = result["data"]["boards"]
143
- if boards and boards[0].get("columns"):
144
- columns = boards[0]["columns"]
145
- # Cache the result
146
- _board_columns_cache[board_id] = columns
147
- print(f"[INFO] Fetched {len(columns)} columns from Monday.com board {board_id}")
148
- return columns
149
- elif result.get("errors"):
150
- print(f"[ERROR] Failed to fetch board columns: {result['errors']}")
151
- else:
152
- print(f"[ERROR] Failed to fetch board columns: {response.status_code} - {response.text}")
153
- except Exception as e:
154
- print(f"[ERROR] Exception while fetching board columns: {str(e)}")
155
-
156
- return []
157
-
158
-
159
- def _format_column_value(value: Any, column_type: str, column_id: Optional[str] = None) -> Any:
160
- """
161
- Format a value according to Monday.com column type.
162
-
163
- Args:
164
- value: The value to format
165
- column_type: Monday.com column type (email, phone, link, text, etc.)
166
- column_id: Column ID (for special handling)
167
-
168
- Returns:
169
- For email/phone/link: Python dict object
170
- For text/other types: Plain string
171
- """
172
- if value is None:
173
- return ""
174
-
175
- value_str = str(value)
176
-
177
- if column_type == "email":
178
- # Monday.com email format requires dict object (will be JSON encoded later)
179
- return {"email": value_str, "text": value_str}
180
- elif column_type == "phone":
181
- return {"phone": value_str, "countryShortName": "US"}
182
- elif column_type == "link":
183
- # If it's already a URL, use it; otherwise create a link
184
- if value_str.startswith("http://") or value_str.startswith("https://"):
185
- return {"url": value_str, "text": value_str}
186
- else:
187
- return {"url": f"https://{value_str}", "text": value_str}
188
- else:
189
- # Text, status, and other types - just return the string
190
- return value_str
191
-
192
-
193
- async def create_monday_lead(
194
- email: str,
195
- first_name: Optional[str] = None,
196
- last_name: Optional[str] = None,
197
- phone_number: Optional[str] = None,
198
- linkedin_url: Optional[str] = None,
199
- title: Optional[str] = None,
200
- headline: Optional[str] = None,
201
- organization_name: Optional[str] = None,
202
- organization_website: Optional[str] = None,
203
- organization_address: Optional[str] = None,
204
- board_id: Optional[str] = None
205
- ) -> bool:
206
- """
207
- Create a new lead item in Monday.com board.
208
-
209
- Args:
210
- email: Contact email address (required)
211
- first_name: Contact first name
212
- last_name: Contact last name
213
- phone_number: Phone number
214
- linkedin_url: LinkedIn profile URL
215
- title: Job title
216
- headline: Professional headline
217
- organization_name: Company name
218
- organization_website: Company website
219
- organization_address: Company address
220
- board_id: Monday.com board ID as string (defaults to MONDAY_BOARD_ID env var)
221
-
222
- Returns:
223
- True if lead created successfully, False otherwise
224
- """
225
- if not MONDAY_API_KEY:
226
- print("[WARNING] MONDAY_API_KEY not set, skipping Monday.com lead creation")
227
- return False
228
-
229
- target_board_id = board_id or MONDAY_BOARD_ID
230
- if not target_board_id:
231
- print("[WARNING] MONDAY_BOARD_ID not set, skipping Monday.com lead creation")
232
- return False
233
-
234
- # Prepare item name (use full name or email)
235
- item_name = email
236
- if first_name and last_name:
237
- item_name = f"{first_name} {last_name}"
238
- elif first_name:
239
- item_name = first_name
240
- elif last_name:
241
- item_name = last_name
242
-
243
- # Fetch board columns to automatically match fields
244
- print(f"[INFO] Fetching Monday.com board columns for automatic field matching...")
245
- board_columns = await _get_board_columns(str(target_board_id))
246
-
247
- if not board_columns:
248
- print("[WARNING] Could not fetch board columns, skipping Monday.com lead creation")
249
- return False
250
-
251
- # Create a mapping of column IDs to column types for formatting
252
- column_types = {col["id"]: col.get("type", "text") for col in board_columns}
253
-
254
- # Prepare data fields to map
255
- data_fields = {
256
- "email": email,
257
- "first_name": first_name,
258
- "last_name": last_name,
259
- "phone_number": phone_number,
260
- "linkedin_url": linkedin_url,
261
- "title": title,
262
- "headline": headline,
263
- "organization_name": organization_name,
264
- "organization_website": organization_website,
265
- "organization_address": organization_address,
266
- }
267
-
268
- # Automatically match fields to columns using semantic similarity
269
- column_values = {}
270
- matched_fields = []
271
- # Track which columns have been matched to handle duplicates (e.g., first_name and last_name -> Name)
272
- column_matches = {} # column_id -> (field_name, value)
273
-
274
- for field_name, field_value in data_fields.items():
275
- if not field_value:
276
- continue
277
-
278
- match = _find_best_column_match(field_name, board_columns)
279
- if match:
280
- column_id, column_title, similarity = match
281
- column_type = column_types.get(column_id, "text")
282
-
283
- # Handle special case: if first_name and last_name both match to the same "Name" column
284
- if column_id in column_matches:
285
- existing_field, existing_value = column_matches[column_id]
286
- # If both first_name and last_name match to the same column, combine them
287
- if (field_name in ["first_name", "last_name"] and
288
- existing_field in ["first_name", "last_name"] and
289
- field_name != existing_field):
290
- # Combine first and last name
291
- if field_name == "first_name":
292
- combined_value = f"{field_value} {existing_value}"
293
- else:
294
- combined_value = f"{existing_value} {field_value}"
295
- formatted_value = _format_column_value(combined_value, column_type, column_id)
296
- column_values[column_id] = formatted_value
297
- matched_fields.append(f"{existing_field}+{field_name} -> {column_title} (combined)")
298
- print(f"[INFO] Combined '{existing_field}' and '{field_name}' to column '{column_title}' (ID: {column_id})")
299
- continue
300
- else:
301
- # Different fields matching to same column - use the one with higher similarity
302
- print(f"[DEBUG] Column '{column_title}' already matched to '{existing_field}', skipping '{field_name}'")
303
- continue
304
-
305
- formatted_value = _format_column_value(field_value, column_type, column_id)
306
- column_values[column_id] = formatted_value
307
- column_matches[column_id] = (field_name, field_value)
308
- matched_fields.append(f"{field_name} -> {column_title} (similarity: {similarity:.2f})")
309
- print(f"[INFO] Matched '{field_name}' to column '{column_title}' (ID: {column_id}, type: {column_type}, value: {formatted_value[:100] if len(str(formatted_value)) > 100 else formatted_value})")
310
- else:
311
- print(f"[DEBUG] No suitable column match found for '{field_name}' (skipping)")
312
-
313
- if not column_values:
314
- print("[WARNING] No fields could be matched to board columns")
315
- return False
316
-
317
- print(f"[INFO] Successfully matched {len(matched_fields)} fields to Monday.com columns")
318
-
319
- # Convert column_values to JSON string for GraphQL mutation
320
- # Monday.com expects column values as a JSON string where:
321
- # - Text columns: plain string values
322
- # - Email/Phone/Link columns: dict objects (properly JSON encoded)
323
- column_values_json = json.dumps(column_values)
324
- print(f"[DEBUG] Monday.com column_values JSON: {column_values_json[:500]}")
325
-
326
- # GraphQL mutation
327
- # Note: Monday.com uses ID! (string) type for board_id, not Int!
328
- mutation = """
329
- mutation ($boardId: ID!, $itemName: String!, $columnValues: JSON!) {
330
- create_item (board_id: $boardId, item_name: $itemName, column_values: $columnValues) {
331
- id
332
- }
333
- }
334
- """
335
-
336
- # Convert board_id to string (Monday.com expects ID! which is a string)
337
- board_id_str = str(target_board_id)
338
-
339
- variables = {
340
- "boardId": board_id_str,
341
- "itemName": item_name,
342
- "columnValues": column_values_json
343
- }
344
-
345
- headers = {
346
- "Authorization": MONDAY_API_KEY,
347
- "Content-Type": "application/json"
348
- }
349
-
350
- try:
351
- async with httpx.AsyncClient(timeout=30.0) as client:
352
- response = await client.post(
353
- MONDAY_API_URL,
354
- json={
355
- "query": mutation,
356
- "variables": variables
357
- },
358
- headers=headers
359
- )
360
-
361
- if response.status_code == 200:
362
- result = response.json()
363
- if result.get("data") and result["data"].get("create_item"):
364
- item_id = result["data"]["create_item"].get("id")
365
- print(f"[INFO] Successfully created Monday.com lead: {item_name} (ID: {item_id})")
366
- return True
367
- elif result.get("errors"):
368
- errors = result.get("errors", [])
369
- for error in errors:
370
- error_msg = error.get("message", "Unknown error")
371
- error_path = error.get("path", [])
372
- print(f"[ERROR] Monday.com API error: {error_msg}")
373
- if error_path:
374
- print(f"[ERROR] Error path: {error_path}")
375
- # Log full error for debugging
376
- print(f"[DEBUG] Full Monday.com error response: {json.dumps(errors, indent=2)}")
377
- return False
378
- else:
379
- print(f"[ERROR] Unexpected Monday.com API response: {result}")
380
- return False
381
- else:
382
- error_data = response.text
383
- print(f"[ERROR] Failed to create Monday.com lead: {response.status_code} - {error_data}")
384
- return False
385
-
386
- except httpx.HTTPStatusError as e:
387
- print(f"[ERROR] Monday.com API HTTP error: {e.response.status_code} - {e.response.text}")
388
- return False
389
- except Exception as e:
390
- print(f"[ERROR] Failed to create Monday.com lead: {str(e)}")
391
- return False
392
-
393
- =======
394
  """
395
  Monday.com API service for creating leads with automatic field matching.
396
  Reference: https://developer.monday.com/api-reference/docs
@@ -781,5 +388,3 @@ async def create_monday_lead(
781
  except Exception as e:
782
  print(f"[ERROR] Failed to create Monday.com lead: {str(e)}")
783
  return False
784
-
785
- >>>>>>> daae7a900bd14d0802e4f04b99edb85493053f1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
  Monday.com API service for creating leads with automatic field matching.
3
  Reference: https://developer.monday.com/api-reference/docs
 
388
  except Exception as e:
389
  print(f"[ERROR] Failed to create Monday.com lead: {str(e)}")
390
  return False
 
 
backend/app/openrouter_client.py CHANGED
@@ -1,867 +1,3 @@
1
- <<<<<<< HEAD
2
- import os
3
- import base64
4
- import json
5
- import re
6
- import time
7
- import asyncio
8
- from io import BytesIO
9
- from typing import Any, Dict, List, Optional, Tuple
10
- import httpx
11
-
12
- try:
13
- import fitz # PyMuPDF
14
- from PIL import Image
15
- PDF_SUPPORT = True
16
- except ImportError as e:
17
- PDF_SUPPORT = False
18
- print(f"[WARNING] PDF support libraries not available: {e}. PDF conversion will not work.")
19
-
20
-
21
- # RunPod Serverless OCR Configuration
22
- RUNPOD_ENDPOINT = os.environ.get("RUNPOD_ENDPOINT", "https://api.runpod.ai/v2/j2jvf8t6n0rk5c/run")
23
- RUNPOD_API_KEY = os.environ.get("RUNPOD_API_KEY", "rpa_0UJOK33ZO7SID9B3ASFSKKPUHNPBQC5Z2128RB4O4qi9ts")
24
-
25
- # Extract endpoint ID from endpoint URL for status polling
26
- # URL format: https://api.runpod.ai/v2/{endpoint_id}/run
27
- _endpoint_id = RUNPOD_ENDPOINT.split("/v2/")[1].split("/")[0] if "/v2/" in RUNPOD_ENDPOINT else None
28
- RUNPOD_STATUS_ENDPOINT = f"https://api.runpod.ai/v2/{_endpoint_id}/status" if _endpoint_id else None
29
-
30
-
31
- def _pdf_to_images(pdf_bytes: bytes) -> List[bytes]:
32
- """
33
- Convert PDF pages to PNG images.
34
- Returns a list of PNG image bytes, one per page.
35
- """
36
- if not PDF_SUPPORT:
37
- raise RuntimeError("PyMuPDF not installed. Cannot convert PDF to images.")
38
-
39
- pdf_doc = fitz.open(stream=pdf_bytes, filetype="pdf")
40
- images = []
41
-
42
- print(f"[INFO] PDF has {len(pdf_doc)} page(s)")
43
-
44
- for page_num in range(len(pdf_doc)):
45
- page = pdf_doc[page_num]
46
- # Render page to image (zoom factor 2 for better quality)
47
- mat = fitz.Matrix(2.0, 2.0) # 2x zoom for better quality
48
- pix = page.get_pixmap(matrix=mat)
49
-
50
- # Convert to PIL Image
51
- img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
52
-
53
- # Resize if too large to avoid GPU memory issues (max 1920px on longest side)
54
- max_size = 1920
55
- w, h = img.size
56
- if w > max_size or h > max_size:
57
- if w > h:
58
- new_w = max_size
59
- new_h = int(h * (max_size / w))
60
- else:
61
- new_h = max_size
62
- new_w = int(w * (max_size / h))
63
- img = img.resize((new_w, new_h), Image.LANCZOS)
64
- print(f"[INFO] Resized page {page_num + 1} from {w}x{h} to {new_w}x{new_h}")
65
- else:
66
- print(f"[INFO] Converted page {page_num + 1} to image ({w}x{h})")
67
-
68
- # Convert to JPEG bytes (better compression)
69
- img_bytes = BytesIO()
70
- img.save(img_bytes, format="JPEG", quality=95)
71
- images.append(img_bytes.getvalue())
72
-
73
- pdf_doc.close()
74
- return images
75
-
76
-
77
- def _image_bytes_to_base64(image_bytes: bytes) -> str:
78
- """Convert image bytes to base64 data URL (JPEG format)."""
79
- b64 = base64.b64encode(image_bytes).decode("utf-8")
80
- data_url = f"data:image/jpeg;base64,{b64}"
81
- print(f"[DEBUG] Base64 encoded image: {len(image_bytes)} bytes -> {len(data_url)} chars")
82
- return data_url
83
-
84
-
85
- def _parse_markdown_table(text: str) -> Optional[Tuple[List[str], List[List[str]]]]:
86
- """
87
- Parse a markdown table from text.
88
- Returns (headers, rows) if table found, None otherwise.
89
- Handles various table formats including malformed ones.
90
- """
91
- lines = [line.strip() for line in text.split('\n')]
92
-
93
- # Find potential table start (line with multiple | and actual text content)
94
- table_start = None
95
- for i, line in enumerate(lines):
96
- if '|' in line and line.count('|') >= 2:
97
- # Skip separator lines (only |, -, :, spaces)
98
- if re.match(r'^[\s\|\-:]+$', line):
99
- continue
100
- # Check if line has meaningful text (not just | characters)
101
- cells = [cell.strip() for cell in line.split('|')]
102
- if cells and not cells[0]:
103
- cells = cells[1:]
104
- if cells and not cells[-1]:
105
- cells = cells[:-1]
106
- # Must have at least 2 columns with some text
107
- meaningful_cells = [c for c in cells if len(c) > 0]
108
- if len(meaningful_cells) >= 2:
109
- table_start = i
110
- break
111
-
112
- if table_start is None:
113
- return None
114
-
115
- # Find table end (first non-empty line without | after table start)
116
- table_end = None
117
- for i in range(table_start + 1, len(lines)):
118
- line = lines[i]
119
- if not line: # Empty line, continue
120
- continue
121
- if '|' not in line:
122
- # Non-empty line without | means table ended
123
- table_end = i
124
- break
125
-
126
- if table_end is None:
127
- table_end = len(lines)
128
-
129
- table_lines = lines[table_start:table_end]
130
-
131
- # Find the actual header row (should have meaningful text, not just | or separators)
132
- headers = None
133
- header_idx = None
134
-
135
- for i, line in enumerate(table_lines):
136
- if not line or '|' not in line:
137
- continue
138
-
139
- # Skip separator lines (lines with only |, -, :, spaces)
140
- if re.match(r'^[\s\|\-:]+$', line):
141
- continue
142
-
143
- # Check if this line has meaningful content (not just | characters)
144
- cells = [cell.strip() for cell in line.split('|')]
145
- # Remove empty cells at start/end
146
- if cells and not cells[0]:
147
- cells = cells[1:]
148
- if cells and not cells[-1]:
149
- cells = cells[:-1]
150
-
151
- # Header should have at least 3 columns and meaningful text
152
- if len(cells) >= 3:
153
- # Check if cells have actual text (not just empty or single char)
154
- meaningful_cells = [c for c in cells if len(c) > 1]
155
- if len(meaningful_cells) >= 3:
156
- headers = cells
157
- header_idx = i
158
- break
159
-
160
- if not headers or header_idx is None:
161
- return None
162
-
163
- # Parse data rows (skip separator line after header if present)
164
- rows = []
165
- num_columns = len(headers)
166
-
167
- for i in range(header_idx + 1, len(table_lines)):
168
- line = table_lines[i]
169
-
170
- if not line:
171
- continue
172
-
173
- # Skip separator lines
174
- if re.match(r'^[\s\|\-:]+$', line):
175
- continue
176
-
177
- if '|' not in line:
178
- # No more table rows
179
- break
180
-
181
- cells = [cell.strip() for cell in line.split('|')]
182
- # Remove empty cells at start/end
183
- if cells and not cells[0]:
184
- cells = cells[1:]
185
- if cells and not cells[-1]:
186
- cells = cells[:-1]
187
-
188
- # Only add rows that match header column count (allow some flexibility)
189
- if len(cells) == num_columns or (len(cells) >= num_columns - 1 and len(cells) <= num_columns + 1):
190
- # Pad or trim to match header count
191
- if len(cells) < num_columns:
192
- cells.extend([''] * (num_columns - len(cells)))
193
- elif len(cells) > num_columns:
194
- cells = cells[:num_columns]
195
-
196
- # Only add if row has at least one non-empty cell
197
- if any(cell for cell in cells):
198
- rows.append(cells)
199
-
200
- if not rows:
201
- return None
202
-
203
- return (headers, rows)
204
-
205
-
206
- def _extract_metadata(text: str) -> Dict[str, str]:
207
- """
208
- Extract metadata from document header text.
209
- Looks for title, office, notice number, and description.
210
- """
211
- metadata = {
212
- "title": "",
213
- "office": "",
214
- "notice_no": "",
215
- "description": ""
216
- }
217
-
218
- lines = [line.strip() for line in text.split('\n') if line.strip()]
219
-
220
- # Extract office (usually first non-empty line)
221
- if lines:
222
- metadata["office"] = lines[0]
223
-
224
- # Look for notice number pattern (like "पत्रक सं- 1239" or "सं- 1239")
225
- notice_pattern = r'(?:पत्रक\s+)?सं[-\s:]*(\d+)'
226
- for line in lines[:10]: # Check first 10 lines
227
- match = re.search(notice_pattern, line)
228
- if match:
229
- metadata["notice_no"] = match.group(1)
230
- break
231
-
232
- # Look for title - usually in quotes or contains specific keywords
233
- # Check for quoted text first
234
- quoted_title = re.search(r'["""]([^"""]+)["""]', text[:1000])
235
- if quoted_title:
236
- metadata["title"] = quoted_title.group(1).strip()
237
- else:
238
- # Look for title patterns
239
- title_keywords = ['सम्पत्ति', 'सूचना', 'विज्ञप्ति', 'नाम परिवर्तन']
240
- for line in lines[:5]:
241
- if any(keyword in line for keyword in title_keywords):
242
- # Extract the title phrase
243
- title_match = re.search(r'(सम्पत्ति[^।]*|सूचना[^।]*|विज्ञप्ति[^।]*)', line)
244
- if title_match:
245
- metadata["title"] = title_match.group(1).strip()
246
- break
247
-
248
- # Extract description (text before table, usually contains key phrases)
249
- description_keywords = ['नाम परिवर्तन', 'अधिनियम', 'धारा', 'प्रकाशन', 'आवेदन']
250
- description_parts = []
251
- for i, line in enumerate(lines[:15]): # Check first 15 lines
252
- if any(keyword in line for keyword in description_keywords):
253
- description_parts.append(line)
254
- # Get a few surrounding lines for context
255
- if i > 0:
256
- description_parts.insert(0, lines[i-1])
257
- if i < len(lines) - 1:
258
- description_parts.append(lines[i+1])
259
- break
260
-
261
- if description_parts:
262
- description = ' '.join(description_parts).strip()
263
- if len(description) > 30: # Only if substantial
264
- # Clean up and limit length
265
- description = re.sub(r'\s+', ' ', description)
266
- metadata["description"] = description[:300] # Limit length
267
-
268
- return metadata
269
-
270
-
271
- def _parse_model_response(response_text: str) -> Tuple[str, Dict[str, Any]]:
272
- """
273
- Parse model response to extract text and metadata.
274
- The model may return text and metadata in various formats.
275
- Returns: (extracted_text, metadata_dict)
276
- """
277
- metadata = {}
278
- text = response_text
279
-
280
- # Try to find JSON metadata section
281
- # Look for METADATA: or metadata: section
282
- metadata_patterns = [
283
- r'METADATA:\s*\n?\s*({.*?})(?:\n\n|\nTEXT|$)',
284
- r'metadata:\s*\n?\s*({.*?})(?:\n\n|\nTEXT|$)',
285
- r'METADATA:\s*\n?\s*```json\s*({.*?})\s*```',
286
- r'METADATA:\s*\n?\s*```\s*({.*?})\s*```',
287
- ]
288
-
289
- for pattern in metadata_patterns:
290
- match = re.search(pattern, response_text, re.DOTALL | re.IGNORECASE)
291
- if match:
292
- try:
293
- metadata_json = match.group(1).strip()
294
- metadata = json.loads(metadata_json)
295
- # Remove metadata section from text
296
- text = response_text[:match.start()] + response_text[match.end():]
297
- break
298
- except (json.JSONDecodeError, IndexError):
299
- continue
300
-
301
- # If no JSON found, try to extract metadata from structured text format
302
- if not metadata:
303
- # Look for key-value pairs in METADATA section
304
- metadata_section = re.search(r'METADATA:\s*\n(.*?)(?:\n\n|\nTEXT|$)', response_text, re.DOTALL | re.IGNORECASE)
305
- if metadata_section:
306
- metadata_text = metadata_section.group(1)
307
- # Parse key-value pairs
308
- for line in metadata_text.split('\n'):
309
- if ':' in line:
310
- parts = line.split(':', 1)
311
- if len(parts) == 2:
312
- key = parts[0].strip().lower().replace(' ', '_')
313
- value = parts[1].strip()
314
- if value:
315
- metadata[key] = value
316
-
317
- # Extract TEXT section if present
318
- text_match = re.search(r'TEXT:\s*\n(.*?)(?:\n\nMETADATA|$)', response_text, re.DOTALL | re.IGNORECASE)
319
- if text_match:
320
- text = text_match.group(1).strip()
321
- else:
322
- # If no TEXT section, remove METADATA section if found
323
- text = re.sub(r'METADATA:.*', '', response_text, flags=re.DOTALL | re.IGNORECASE).strip()
324
-
325
- # Clean up text
326
- text = text.strip()
327
-
328
- # Clean up metadata - remove empty values
329
- metadata = {k: v for k, v in metadata.items() if v and str(v).strip()}
330
-
331
- return text, metadata
332
-
333
-
334
- def _extract_footer_notes(text: str) -> List[str]:
335
- """
336
- Extract footer notes from document.
337
- Usually appears after the table.
338
- """
339
- notes = []
340
-
341
- # Find table end
342
- lines = text.split('\n')
343
- table_end_idx = len(lines)
344
-
345
- for i, line in enumerate(lines):
346
- if '|' in line:
347
- # Find last table line
348
- j = i + 1
349
- while j < len(lines) and ('|' in lines[j] or re.match(r'^[\s\|\-:]+$', lines[j])):
350
- j += 1
351
- table_end_idx = j
352
- break
353
-
354
- # Extract footer text (after table)
355
- footer_lines = lines[table_end_idx:]
356
- footer_text = '\n'.join(footer_lines).strip()
357
-
358
- # Split into sentences/notes
359
- # Look for sentences ending with period, exclamation, or specific keywords
360
- sentences = re.split(r'[।\.!]\s+', footer_text)
361
-
362
- for sentence in sentences:
363
- sentence = sentence.strip()
364
- if len(sentence) > 20: # Only substantial notes
365
- # Clean up
366
- sentence = re.sub(r'\s+', ' ', sentence)
367
- if sentence:
368
- notes.append(sentence)
369
-
370
- # Limit to most relevant notes (usually 2-4)
371
- return notes[:5]
372
-
373
-
374
- def _parse_text_with_tables(text: str, page_metadata: Dict[str, Any] = None) -> Dict[str, Any]:
375
- """
376
- Parse text and extract structured data including tables.
377
- Uses model-extracted metadata if provided, otherwise falls back to basic extraction.
378
- Returns structured JSON format with metadata, table, and footer_notes.
379
- """
380
- result = {
381
- "text": text, # Keep original text
382
- "metadata": page_metadata if page_metadata else {},
383
- "table": [],
384
- "footer_notes": []
385
- }
386
-
387
- # Check if text contains a table
388
- table_data = _parse_markdown_table(text)
389
-
390
- if table_data:
391
- headers, rows = table_data
392
- print(f"[INFO] Found table with {len(headers)} columns and {len(rows)} rows")
393
-
394
- # Use provided metadata or extract basic metadata as fallback
395
- if not result["metadata"]:
396
- result["metadata"] = _extract_metadata(text)
397
-
398
- # Map headers to field names using original header text
399
- # Keep original language, just make valid JSON keys and handle duplicates
400
- header_mapping = {}
401
- header_counts = {} # Track occurrences of each header
402
-
403
- for i, header in enumerate(headers):
404
- header_clean = header.strip()
405
-
406
- # Create a valid JSON key from the original header
407
- # Remove special characters that aren't valid in JSON keys, but keep the text
408
- # Replace spaces and special chars with underscores, but preserve the original text
409
- header_key = header_clean
410
-
411
- # Track how many times we've seen this exact header
412
- if header_key not in header_counts:
413
- header_counts[header_key] = 0
414
-
415
- header_counts[header_key] += 1
416
-
417
- # If this header appears multiple times, append a number
418
- if header_counts[header_key] > 1:
419
- header_key = f"{header_key}_{header_counts[header_key]}"
420
-
421
- # Clean the key to be valid for JSON (remove/replace problematic characters)
422
- # Keep the original text but make it JSON-safe
423
- header_key = re.sub(r'[^\w\s\u0900-\u097F]', '', header_key) # Keep Unicode Hindi chars
424
- header_key = re.sub(r'\s+', '_', header_key) # Replace spaces with underscores
425
-
426
- # If key is empty after cleaning, use column index
427
- if not header_key:
428
- header_key = f"column_{i+1}"
429
-
430
- header_mapping[i] = header_key
431
-
432
- # Parse table rows - each row becomes a separate section
433
- table_rows_dict = {}
434
- for idx, row in enumerate(rows, start=1):
435
- row_dict = {}
436
- for i, header_idx in header_mapping.items():
437
- if i < len(row):
438
- row_dict[header_idx] = row[i].strip()
439
-
440
- if row_dict:
441
- # Each row is a separate section: row_1, row_2, etc.
442
- table_rows_dict[f"row_{idx}"] = row_dict
443
-
444
- # Store rows as separate sections instead of array
445
- result["table"] = table_rows_dict
446
-
447
- # Extract footer notes
448
- result["footer_notes"] = _extract_footer_notes(text)
449
- else:
450
- # No table found, just extract basic metadata
451
- result["metadata"] = _extract_metadata(text)
452
- result["footer_notes"] = _extract_footer_notes(text)
453
-
454
- return result
455
-
456
-
457
- async def _poll_runpod_job(job_id: str, client: httpx.AsyncClient, max_wait_time: int = 300) -> Dict[str, Any]:
458
- """
459
- Poll RunPod job status until completion.
460
- Returns the final job result with output.
461
- """
462
- headers = {
463
- "Content-Type": "application/json",
464
- "Authorization": f"Bearer {RUNPOD_API_KEY}"
465
- }
466
-
467
- start_time = time.time()
468
- poll_interval = 2 # Poll every 2 seconds
469
-
470
- while True:
471
- # Check timeout
472
- elapsed = time.time() - start_time
473
- if elapsed > max_wait_time:
474
- raise RuntimeError(f"Job {job_id} timed out after {max_wait_time} seconds")
475
-
476
- # Poll job status
477
- status_url = f"{RUNPOD_STATUS_ENDPOINT}/{job_id}"
478
- response = await client.get(status_url, headers=headers)
479
- response.raise_for_status()
480
- status_result = response.json()
481
-
482
- status = status_result.get("status", "").upper()
483
-
484
- if status == "COMPLETED":
485
- print(f"[INFO] Job {job_id} completed successfully")
486
- return status_result
487
- elif status == "FAILED":
488
- error_msg = status_result.get("error", "Unknown error")
489
- raise RuntimeError(f"Job {job_id} failed: {error_msg}")
490
- elif status in ["IN_QUEUE", "IN_PROGRESS"]:
491
- print(f"[INFO] Job {job_id} status: {status}, waiting...")
492
- await asyncio.sleep(poll_interval)
493
- else:
494
- # Unknown status, wait and retry
495
- print(f"[INFO] Job {job_id} status: {status}, waiting...")
496
- await asyncio.sleep(poll_interval)
497
-
498
-
499
- async def _extract_text_with_ocr(image_bytes: bytes, page_num: int, total_pages: int, custom_prompt: str = None) -> Dict[str, Any]:
500
- """
501
- Extract text and metadata from a single page/image using the RunPod serverless OCR model.
502
- Uses model-driven extraction to identify and extract metadata fields dynamically.
503
- Returns text output in full_text field and extracted metadata.
504
-
505
- Args:
506
- image_bytes: Image bytes to process
507
- page_num: Page number
508
- total_pages: Total number of pages
509
- custom_prompt: Optional custom prompt for field extraction
510
- """
511
- # Convert image bytes to base64
512
- image_base64 = base64.b64encode(image_bytes).decode("utf-8")
513
-
514
- print(f"[INFO] OCR: Processing page {page_num}/{total_pages} with RunPod endpoint")
515
-
516
- try:
517
- # Use custom prompt if provided, otherwise use default
518
- if custom_prompt:
519
- metadata_prompt = custom_prompt
520
- else:
521
- # Default prompt for general text extraction
522
- metadata_prompt = """Extract all text from this image."""
523
-
524
- # Prepare request payload for RunPod
525
- # RunPod serverless endpoints expect image_base64, image_url, or image_path
526
- payload = {
527
- "input": {
528
- "prompt": metadata_prompt,
529
- "image_base64": image_base64 # Base64 encoded image
530
- }
531
- }
532
-
533
- # Make HTTP request to RunPod endpoint
534
- headers = {
535
- "Content-Type": "application/json",
536
- "Authorization": f"Bearer {RUNPOD_API_KEY}"
537
- }
538
-
539
- async with httpx.AsyncClient(timeout=300.0) as client:
540
- # Submit job
541
- response = await client.post(
542
- RUNPOD_ENDPOINT,
543
- headers=headers,
544
- json=payload
545
- )
546
- response.raise_for_status()
547
- result = response.json()
548
-
549
- # Check if this is an async job (has job ID and status)
550
- job_id = result.get("id")
551
- status = result.get("status", "").upper()
552
-
553
- if job_id and status in ["IN_QUEUE", "IN_PROGRESS"]:
554
- # This is an async job, need to poll for completion
555
- print(f"[INFO] Job submitted with ID: {job_id}, status: {status}")
556
- if not RUNPOD_STATUS_ENDPOINT:
557
- raise RuntimeError("RunPod status endpoint not configured. Cannot poll async job.")
558
-
559
- # Poll until completion
560
- result = await _poll_runpod_job(job_id, client)
561
-
562
- # Extract text from RunPod response
563
- # RunPod serverless typically returns: {"id": "...", "status": "...", "output": "..."}
564
- # The output might be a string or a dict depending on the model
565
- extracted_text = ""
566
-
567
- if "output" in result:
568
- output = result["output"]
569
- if isinstance(output, str):
570
- extracted_text = output
571
- elif isinstance(output, dict):
572
- # If output is a dict, try common fields
573
- extracted_text = output.get("text", output.get("result", output.get("content", "")))
574
- if not extracted_text and isinstance(output.get("text"), str):
575
- extracted_text = output["text"]
576
- elif isinstance(output, list) and len(output) > 0:
577
- # If output is a list, take the first element
578
- extracted_text = str(output[0])
579
- elif "result" in result:
580
- extracted_text = str(result["result"])
581
- elif "text" in result:
582
- extracted_text = str(result["text"])
583
- else:
584
- # Fallback: convert entire response to string
585
- extracted_text = str(result)
586
-
587
- if not extracted_text:
588
- extracted_text = ""
589
-
590
- print(f"[INFO] OCR: Extracted {len(extracted_text)} characters from page {page_num}")
591
-
592
- # Parse model response to extract text and metadata
593
- parsed_text, parsed_metadata = _parse_model_response(extracted_text)
594
-
595
- # Calculate confidence based on response quality
596
- # Create a mock response object for compatibility with confidence calculation
597
- mock_response = type('obj', (object,), {
598
- 'choices': [type('obj', (object,), {'finish_reason': 'stop'})()],
599
- 'usage': type('obj', (object,), {'completion_tokens': len(parsed_text.split())})()
600
- })()
601
- confidence = _calculate_ocr_confidence(mock_response, parsed_text)
602
-
603
- # Determine document type from metadata if available
604
- doc_type = parsed_metadata.get("document_type", "other")
605
- if doc_type == "other" and parsed_metadata.get("title"):
606
- # Try to infer from title
607
- title_lower = parsed_metadata.get("title", "").lower()
608
- if any(kw in title_lower for kw in ["tender", "bid", "quotation"]):
609
- doc_type = "tender"
610
- elif any(kw in title_lower for kw in ["recruitment", "appointment", "vacancy"]):
611
- doc_type = "recruitment"
612
- elif any(kw in title_lower for kw in ["notice", "notification", "circular"]):
613
- doc_type = "notice"
614
-
615
- # Return text and extracted metadata
616
- return {
617
- "doc_type": doc_type,
618
- "confidence": confidence,
619
- "full_text": parsed_text,
620
- "fields": parsed_metadata if parsed_metadata else {} # Model-extracted metadata
621
- }
622
-
623
- except httpx.HTTPStatusError as e:
624
- error_msg = f"HTTP {e.response.status_code}: {e.response.text}"
625
- print(f"[ERROR] OCR API HTTP error for page {page_num}: {error_msg}")
626
- raise RuntimeError(f"OCR API error for page {page_num}: {error_msg}")
627
- except Exception as e:
628
- error_msg = str(e)
629
- print(f"[ERROR] OCR API error for page {page_num}: {error_msg}")
630
- raise RuntimeError(f"OCR API error for page {page_num}: {error_msg}")
631
-
632
-
633
- def _calculate_ocr_confidence(response, extracted_text: str) -> float:
634
- """
635
- Calculate confidence score based on OCR response quality.
636
- Returns a score from 0-100, with higher scores for better extraction quality.
637
- """
638
- # Start with a higher base confidence for successful extractions
639
- base_confidence = 92.0
640
-
641
- # Adjust confidence based on text quality heuristics
642
- text_length = len(extracted_text.strip())
643
-
644
- if text_length == 0:
645
- return 0.0
646
- elif text_length < 10:
647
- # Very short text - might be error or empty
648
- return max(30.0, base_confidence - 40.0)
649
- elif text_length < 50:
650
- # Short text - might be incomplete
651
- return max(60.0, base_confidence - 20.0)
652
- elif text_length > 1000:
653
- # Long text - likely good extraction
654
- confidence = min(100.0, base_confidence + 5.0)
655
- elif text_length > 500:
656
- # Medium-long text - good extraction
657
- confidence = min(100.0, base_confidence + 3.0)
658
- else:
659
- confidence = base_confidence
660
-
661
- # Check for structured content (tables, etc.) - indicates good extraction
662
- if '|' in extracted_text and extracted_text.count('|') > 5:
663
- # Table detected - boost confidence significantly
664
- confidence = min(100.0, confidence + 6.0)
665
-
666
- # Check for meaningful content (non-whitespace ratio)
667
- non_whitespace = len([c for c in extracted_text if not c.isspace()])
668
- if text_length > 0:
669
- content_ratio = non_whitespace / text_length
670
- if content_ratio > 0.85:
671
- # Very high content ratio - excellent extraction
672
- confidence = min(100.0, confidence + 5.0)
673
- elif content_ratio > 0.75:
674
- # High content ratio - good extraction
675
- confidence = min(100.0, confidence + 3.0)
676
- elif content_ratio > 0.6:
677
- # Moderate content ratio - decent extraction
678
- confidence = min(100.0, confidence + 1.0)
679
- elif content_ratio < 0.3:
680
- # Low content ratio - mostly whitespace
681
- confidence = max(60.0, confidence - 15.0)
682
-
683
- # Check for common OCR quality indicators
684
- # Presence of numbers, dates, and structured patterns indicates good extraction
685
- has_numbers = any(c.isdigit() for c in extracted_text)
686
- has_letters = any(c.isalpha() for c in extracted_text)
687
- has_punctuation = any(c in '.,;:!?()[]{}' for c in extracted_text)
688
-
689
- if has_numbers and has_letters and has_punctuation:
690
- # Well-structured text with mixed content - high confidence
691
- confidence = min(100.0, confidence + 2.0)
692
-
693
- # Cap at 100% and ensure minimum quality threshold
694
- return round(min(100.0, max(0.0, confidence)), 1)
695
-
696
-
697
- async def extract_fields_from_document(
698
- file_bytes: bytes,
699
- content_type: str,
700
- filename: str,
701
- key_fields: str = None,
702
- ) -> Dict[str, Any]:
703
- """
704
- Extract text from document using OCR model.
705
- Processes pages separately for better reliability.
706
- Returns text output in full_text, keeps JSON/XML fields empty for now.
707
- """
708
- # Get raw image bytes for processing
709
- if content_type == "application/pdf" or content_type.endswith("/pdf"):
710
- if not PDF_SUPPORT:
711
- raise RuntimeError("PDF support requires PyMuPDF. Please install it.")
712
- # For PDFs, convert to images
713
- pdf_images = _pdf_to_images(file_bytes)
714
- image_bytes_list = pdf_images
715
- else:
716
- # For regular images, process the file bytes
717
- # Convert to JPEG for consistency
718
- try:
719
- img = Image.open(BytesIO(file_bytes))
720
- if img.mode != "RGB":
721
- img = img.convert("RGB")
722
-
723
- # Resize if too large (max 1920px on longest side)
724
- max_size = 1920
725
- w, h = img.size
726
- if w > max_size or h > max_size:
727
- if w > h:
728
- new_w = max_size
729
- new_h = int(h * (max_size / w))
730
- else:
731
- new_h = max_size
732
- new_w = int(w * (max_size / h))
733
- img = img.resize((new_w, new_h), Image.LANCZOS)
734
- print(f"[INFO] Resized image from {w}x{h} to {new_w}x{new_h}")
735
-
736
- # Convert to JPEG bytes
737
- img_bytes = BytesIO()
738
- img.save(img_bytes, format="JPEG", quality=95)
739
- image_bytes_list = [img_bytes.getvalue()]
740
- except Exception as e:
741
- # Fallback: use original file bytes
742
- print(f"[WARNING] Could not process image with PIL: {e}. Using original bytes.")
743
- image_bytes_list = [file_bytes]
744
-
745
- total_pages = len(image_bytes_list)
746
- print(f"[INFO] Processing {total_pages} page(s) with OCR model...")
747
-
748
- # Process each page separately
749
- page_results = []
750
- for page_num, img_bytes in enumerate(image_bytes_list):
751
- print(f"[INFO] Processing page {page_num + 1}/{total_pages}...")
752
- try:
753
- page_result = await _extract_text_with_ocr(img_bytes, page_num + 1, total_pages, None)
754
- page_results.append({
755
- "page_number": page_num + 1,
756
- "text": page_result.get("full_text", ""),
757
- "fields": page_result.get("fields", {}),
758
- "confidence": page_result.get("confidence", 0),
759
- "doc_type": page_result.get("doc_type", "other"),
760
- })
761
- print(f"[INFO] Page {page_num + 1} processed successfully")
762
- except Exception as e:
763
- print(f"[ERROR] Failed to process page {page_num + 1}: {e}")
764
- page_results.append({
765
- "page_number": page_num + 1,
766
- "text": "",
767
- "fields": {},
768
- "confidence": 0,
769
- "error": str(e)
770
- })
771
-
772
- # Combine results from all pages
773
- combined_full_text = "\n\n".join([f"=== PAGE {p['page_number']} ===\n\n{p['text']}" for p in page_results if p.get("text")])
774
-
775
- # Extract user-specified fields if key_fields provided
776
- extracted_fields = {}
777
- if key_fields and key_fields.strip():
778
- # Parse user input: "Invoice Number, Invoice Date, PO Number" -> ['Invoice Number', 'Invoice Date', 'PO Number']
779
- field_list = [f.strip() for f in key_fields.split(',') if f.strip()]
780
- if field_list:
781
- print(f"[INFO] Extracting user-specified fields: {field_list}")
782
-
783
- # Format fields as JSON array string for prompt
784
- fields_json = json.dumps(field_list)
785
- custom_prompt = f"Extract the following fields from this image and return as JSON: {fields_json}. Return only a valid JSON object with the field names as keys and their extracted values."
786
-
787
- # Run second OCR pass on first page (usually has most metadata) with custom prompt
788
- if image_bytes_list and len(image_bytes_list) > 0:
789
- try:
790
- print("[INFO] Running second OCR pass for field extraction...")
791
- field_result = await _extract_text_with_ocr(image_bytes_list[0], 1, 1, custom_prompt)
792
- field_text = field_result.get("full_text", "")
793
-
794
- # Try to parse JSON from the response
795
- try:
796
- # Look for JSON in the response
797
- json_match = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', field_text, re.DOTALL)
798
- if json_match:
799
- extracted_fields = json.loads(json_match.group(0))
800
- print(f"[INFO] Successfully extracted {len(extracted_fields)} fields from second OCR pass")
801
- else:
802
- # Try parsing the entire response as JSON
803
- extracted_fields = json.loads(field_text)
804
- print(f"[INFO] Successfully extracted {len(extracted_fields)} fields from second OCR pass")
805
- except json.JSONDecodeError:
806
- print(f"[WARNING] Could not parse JSON from field extraction response: {field_text[:200]}")
807
- extracted_fields = {}
808
- except Exception as e:
809
- print(f"[WARNING] Field extraction failed: {e}")
810
- extracted_fields = {}
811
-
812
- # Parse each page for tables and structure the output
813
- structured_pages = {}
814
- for page_result in page_results:
815
- if page_result.get("text"):
816
- page_num = page_result.get("page_number", 1)
817
- page_text = page_result.get("text", "")
818
-
819
- # Parse text for tables and structure
820
- parsed_data = _parse_text_with_tables(page_text, {})
821
-
822
- # Build structured page output (without Fields - moved to root level)
823
- page_key = f"page_{page_num}"
824
- structured_pages[page_key] = {
825
- "text": parsed_data["text"],
826
- "table": parsed_data["table"],
827
- "footer_notes": parsed_data["footer_notes"],
828
- "confidence": page_result.get("confidence", 0),
829
- "doc_type": page_result.get("doc_type", "other")
830
- }
831
-
832
- # If we have structured pages, use them; otherwise keep fields empty
833
- if structured_pages:
834
- # Always return pages with page_X keys (even for single page)
835
- combined_fields = structured_pages
836
- else:
837
- combined_fields = {}
838
-
839
- # Calculate average confidence
840
- confidences = [p.get("confidence", 0) for p in page_results if p.get("confidence", 0) > 0]
841
- avg_confidence = sum(confidences) / len(confidences) if confidences else 0
842
-
843
- # Determine doc_type from first successful page
844
- doc_type = "other"
845
- for page_result in page_results:
846
- if page_result.get("doc_type") and page_result["doc_type"] != "other":
847
- doc_type = page_result["doc_type"]
848
- break
849
-
850
- # Build return object - add Fields at root level only if extracted_fields is not empty
851
- return_obj = {
852
- "doc_type": doc_type,
853
- "confidence": avg_confidence,
854
- "full_text": combined_full_text,
855
- "fields": combined_fields, # Now contains structured data with tables
856
- "pages": page_results
857
- }
858
-
859
- # Add Fields at root level only if user provided key_fields and extraction succeeded
860
- if extracted_fields:
861
- return_obj["Fields"] = extracted_fields
862
-
863
- return return_obj
864
- =======
865
  import os
866
  import base64
867
  import json
@@ -1724,4 +860,3 @@ async def extract_fields_from_document(
1724
  return_obj["Fields"] = extracted_fields
1725
 
1726
  return return_obj
1727
- >>>>>>> daae7a900bd14d0802e4f04b99edb85493053f1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import base64
3
  import json
 
860
  return_obj["Fields"] = extracted_fields
861
 
862
  return return_obj
 
backend/app/otp_service.py CHANGED
@@ -1,202 +1,3 @@
1
- <<<<<<< HEAD
2
- """
3
- OTP (One-Time Password) service for email-based authentication.
4
- """
5
- import random
6
- import string
7
- from datetime import datetime, timedelta
8
- from typing import Dict, Optional
9
- from sqlalchemy.orm import Session
10
- from fastapi import HTTPException
11
- from .models import User
12
- from .brevo_service import send_otp_email
13
-
14
- # Store OTPs in memory (in production, use Redis or database)
15
- otp_store: Dict[str, dict] = {}
16
-
17
-
18
- def generate_otp(length: int = 6) -> str:
19
- """
20
- Generate a random OTP code.
21
-
22
- Args:
23
- length: Length of OTP (default: 6)
24
-
25
- Returns:
26
- Random OTP string
27
- """
28
- return ''.join(random.choices(string.digits, k=length))
29
-
30
-
31
- async def request_otp(email: str, db: Session) -> dict:
32
- """
33
- Generate and send OTP to email using Brevo.
34
-
35
- Args:
36
- email: Email address to send OTP to
37
- db: Database session
38
-
39
- Returns:
40
- Dictionary with success message
41
- """
42
- # Generate OTP
43
- otp = generate_otp()
44
- expires_at = datetime.utcnow() + timedelta(minutes=10)
45
-
46
- # Store OTP (in production, use Redis or database with TTL)
47
- otp_store[email.lower()] = {
48
- 'otp': otp,
49
- 'expires_at': expires_at,
50
- 'attempts': 0,
51
- 'max_attempts': 5
52
- }
53
-
54
- # Send OTP via Brevo
55
- try:
56
- await send_otp_email(email, otp)
57
- print(f"[INFO] OTP generated and sent to {email}")
58
- except Exception as e:
59
- # Remove OTP from store if email sending failed
60
- if email.lower() in otp_store:
61
- del otp_store[email.lower()]
62
- raise HTTPException(
63
- status_code=500,
64
- detail=f"Failed to send OTP email: {str(e)}"
65
- )
66
-
67
- return {
68
- "message": "OTP sent to your email address",
69
- "expires_in_minutes": 10
70
- }
71
-
72
-
73
- async def verify_otp(email: str, otp: str, db: Session) -> User:
74
- """
75
- Verify OTP and return/create user.
76
-
77
- Args:
78
- email: Email address
79
- otp: OTP code to verify
80
- db: Database session
81
-
82
- Returns:
83
- User object
84
-
85
- Raises:
86
- HTTPException: If OTP is invalid, expired, or max attempts exceeded
87
- """
88
- email_lower = email.lower()
89
- stored = otp_store.get(email_lower)
90
-
91
- if not stored:
92
- raise HTTPException(
93
- status_code=400,
94
- detail="OTP not found. Please request a new OTP."
95
- )
96
-
97
- # Check if expired
98
- if datetime.utcnow() > stored['expires_at']:
99
- del otp_store[email_lower]
100
- raise HTTPException(
101
- status_code=400,
102
- detail="OTP has expired. Please request a new OTP."
103
- )
104
-
105
- # Check max attempts
106
- if stored['attempts'] >= stored['max_attempts']:
107
- del otp_store[email_lower]
108
- raise HTTPException(
109
- status_code=400,
110
- detail="Maximum verification attempts exceeded. Please request a new OTP."
111
- )
112
-
113
- # Verify OTP
114
- if stored['otp'] != otp:
115
- stored['attempts'] += 1
116
- remaining_attempts = stored['max_attempts'] - stored['attempts']
117
- raise HTTPException(
118
- status_code=400,
119
- detail=f"Invalid OTP. {remaining_attempts} attempt(s) remaining."
120
- )
121
-
122
- # OTP verified successfully
123
- # Get or create user
124
- user = db.query(User).filter(User.email == email_lower).first()
125
-
126
- if not user:
127
- user = User(
128
- email=email_lower,
129
- auth_method='otp',
130
- email_verified=True
131
- )
132
- db.add(user)
133
- db.commit()
134
- db.refresh(user)
135
- print(f"[INFO] New user created via OTP: {email_lower}")
136
-
137
- # Enrich contact data from Apollo.io and update Brevo + Monday.com
138
- try:
139
- from .apollo_service import enrich_contact_by_email
140
- from .brevo_service import create_brevo_contact, BREVO_TRIAL_LIST_ID
141
- from .monday_service import create_monday_lead
142
-
143
- # Enrich contact data from Apollo.io
144
- enriched_data = await enrich_contact_by_email(email_lower)
145
-
146
- # Use enriched data if available
147
- first_name = enriched_data.get("first_name") if enriched_data else None
148
- last_name = enriched_data.get("last_name") if enriched_data else None
149
- org_name = enriched_data.get("organization_name") if enriched_data else None
150
-
151
- # Fallback to email domain if Apollo didn't provide organization
152
- if not org_name:
153
- org_domain = email_lower.split('@')[1] if '@' in email_lower else None
154
- org_name = org_domain.split('.')[0].capitalize() if org_domain else None
155
-
156
- # Update Brevo contact with enriched data
157
- await create_brevo_contact(
158
- email=email_lower,
159
- first_name=first_name,
160
- last_name=last_name,
161
- organization_name=org_name or (enriched_data.get("organization_name") if enriched_data else None),
162
- phone_number=enriched_data.get("phone_number") if enriched_data else None,
163
- linkedin_url=enriched_data.get("linkedin_url") if enriched_data else None,
164
- title=enriched_data.get("title") if enriched_data else None,
165
- headline=enriched_data.get("headline") if enriched_data else None,
166
- organization_website=enriched_data.get("organization_website") if enriched_data else None,
167
- organization_address=enriched_data.get("organization_address") if enriched_data else None,
168
- list_id=BREVO_TRIAL_LIST_ID
169
- )
170
-
171
- # Create lead in Monday.com
172
- await create_monday_lead(
173
- email=email_lower,
174
- first_name=first_name,
175
- last_name=last_name,
176
- phone_number=enriched_data.get("phone_number") if enriched_data else None,
177
- linkedin_url=enriched_data.get("linkedin_url") if enriched_data else None,
178
- title=enriched_data.get("title") if enriched_data else None,
179
- headline=enriched_data.get("headline") if enriched_data else None,
180
- organization_name=org_name or (enriched_data.get("organization_name") if enriched_data else None),
181
- organization_website=enriched_data.get("organization_website") if enriched_data else None,
182
- organization_address=enriched_data.get("organization_address") if enriched_data else None,
183
- )
184
- except Exception as e:
185
- # Don't fail user creation if integrations fail
186
- print(f"[WARNING] Failed to enrich/update contact for {email_lower}: {str(e)}")
187
- else:
188
- user.email_verified = True
189
- if user.auth_method != 'otp':
190
- user.auth_method = 'otp'
191
- db.commit()
192
- print(f"[INFO] User verified via OTP: {email_lower}")
193
-
194
- # Remove OTP from store after successful verification
195
- del otp_store[email_lower]
196
-
197
- return user
198
-
199
- =======
200
  """
201
  OTP (One-Time Password) service for email-based authentication.
202
  """
@@ -393,5 +194,3 @@ async def verify_otp(email: str, otp: str, db: Session) -> User:
393
  del otp_store[email_lower]
394
 
395
  return user
396
-
397
- >>>>>>> daae7a900bd14d0802e4f04b99edb85493053f1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
  OTP (One-Time Password) service for email-based authentication.
3
  """
 
194
  del otp_store[email_lower]
195
 
196
  return user
 
 
backend/app/schemas.py CHANGED
@@ -1,31 +1,3 @@
1
- <<<<<<< HEAD
2
- from pydantic import BaseModel
3
- from typing import Dict, Optional
4
- from datetime import datetime
5
-
6
-
7
- class ExtractionStage(BaseModel):
8
- time: int
9
- status: str
10
- variation: str
11
-
12
-
13
- class ExtractionRecordBase(BaseModel):
14
- id: int
15
- fileName: str
16
- fileType: str
17
- fileSize: str
18
- extractedAt: datetime
19
- status: str
20
- confidence: float
21
- fieldsExtracted: int
22
- totalTime: int
23
- stages: Dict[str, ExtractionStage]
24
- errorMessage: Optional[str] = None
25
-
26
- class Config:
27
- from_attributes = True
28
- =======
29
  from pydantic import BaseModel
30
  from typing import Dict, Optional
31
  from datetime import datetime
@@ -52,4 +24,3 @@ class ExtractionRecordBase(BaseModel):
52
 
53
  class Config:
54
  from_attributes = True
55
- >>>>>>> daae7a900bd14d0802e4f04b99edb85493053f1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from pydantic import BaseModel
2
  from typing import Dict, Optional
3
  from datetime import datetime
 
24
 
25
  class Config:
26
  from_attributes = True