MichaelWelsch committed on
Commit
0f792c2
·
verified ·
1 Parent(s): f73ccac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -53
app.py CHANGED
@@ -11,8 +11,10 @@ Gradio App – robuste Async-Ausführung via Status-Endpoints
11
  - FIX: exclude_hash richtig laden/berücksichtigen (paginierte Wholix-Suche)
12
  - FIX: pro Lead harte Fehlerisolierung -> kein Gradio-Absturz
13
  - NEU: Speichert IMMER in Wholix. Falls keine echte E-Mail vorhanden ist,
14
- wird eine stabile Platzhalter-Adresse generiert (no-email.invalid)
15
- und ein Tag "no-email" gesetzt.
 
 
16
  """
17
 
18
  import json
@@ -40,7 +42,7 @@ WHOLIX_BASE_URL = "https://api.wholix.ai"
40
 
41
  MAX_LEADS = 100 # Sicherheitskappe
42
  MAX_WORKERS = 16 # gleichzeitige Pipelines am Server
43
- JOB_TTL_SEC = 60 * 60 # wie lange fertige Jobs im Speicher bleiben (1h)
44
 
45
  # ============================== LOGGING ====================================
46
 
@@ -148,6 +150,46 @@ def req(
148
  return _safe_json(text)
149
  return text
150
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  # ========================= Async Start & Poll ==============================
152
 
153
  def _looks_like_ngrok_html(body: Any) -> bool:
@@ -410,6 +452,8 @@ def wholix_login(email: str, password: str) -> str:
410
 
411
  # ===================== Helfer für Platzhalter-E-Mail =======================
412
 
 
 
413
  def _slug(val: Any, maxlen: int = 24) -> str:
414
  s = "" if val is None else str(val)
415
  s = s.lower().strip()
@@ -419,25 +463,70 @@ def _slug(val: Any, maxlen: int = 24) -> str:
419
 
420
 
421
  def _make_placeholder_email(record: dict) -> str:
 
422
  fn = _slug(record.get("firstname"))
423
  ln = _slug(record.get("lastname"))
424
- co = _slug(record.get("company_name"))
425
  cid = _slug(record.get("exclude_hash") or uuid.uuid4().hex[:8], maxlen=16)
426
- local = "-".join([p for p in [fn, ln, co] if p]) or "lead"
427
- local = re.sub(r"\.+", ".", local.replace("-", "."))
428
- local = local.strip(".")[:48]
429
- return f"{local}.{cid}@no-email.invalid"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
430
 
431
 
432
  def wholix_store_contact(token: str, record: dict, module: str = "Contacts", allow_placeholder: bool = True) -> dict:
433
  """
434
  Sendet NUR erlaubte Felder an Wholix und saniert problematische Werte.
435
- NEU: Wenn keine E-Mail vorhanden ist und allow_placeholder=True,
436
- wird automatisch eine Platzhalter-Adresse erzeugt (no-email.invalid),
437
- damit der Datensatz *immer* gespeichert werden kann.
438
  """
439
- import urllib.parse as _urlparse
440
-
441
  if not isinstance(record, dict):
442
  raise ValueError("Wholix: record muss ein dict sein.")
443
 
@@ -449,14 +538,15 @@ def wholix_store_contact(token: str, record: dict, module: str = "Contacts", all
449
  return s if s else None
450
 
451
  email = _clean_str(record.get("email"))
452
- if not email and allow_placeholder:
453
- email = _make_placeholder_email(record)
454
- record["email"] = email
455
- # Kennzeichnungs-Tag ergänzen (wird später sauber normiert)
456
- if isinstance(record.get("tags"), dict):
457
- keys = list({*record["tags"].get("keys", []), "no-email"})
458
- vals = list({*record["tags"].get("values", []), "no-email"})
459
- record["tags"] = {"keys": keys, "values": vals}
 
460
 
461
  ALLOWED = {
462
  "firstname",
@@ -491,20 +581,6 @@ def wholix_store_contact(token: str, record: dict, module: str = "Contacts", all
491
  v = v[1:-1].strip().strip("'\"")
492
  return _clean_str(v)
493
 
494
def _normalize_url(u):
    """
    Return a cleaned URL with a scheme, or None when it is unusable.

    The value is cleaned with _clean_str; "https://" is prepended when no
    scheme is present. The result is only returned when it parses into a
    scheme and a network location.
    """
    candidate = _clean_str(u)
    if not candidate:
        return None
    has_scheme = re.match(r"^[a-zA-Z][a-zA-Z0-9+\-.]*://", candidate) is not None
    if not has_scheme:
        candidate = "https://" + candidate
    try:
        parsed = _urlparse.urlparse(candidate)
    except Exception:
        # Unparseable input counts as "no URL".
        return None
    return candidate if (parsed.scheme and parsed.netloc) else None
507
-
508
  out = {}
509
  for k in ALLOWED:
510
  if k not in record:
@@ -527,10 +603,8 @@ def wholix_store_contact(token: str, record: dict, module: str = "Contacts", all
527
  # Pflichtfeld sicher (jetzt inkl. Platzhalter möglich)
528
  out["email"] = _clean_str(email)
529
 
530
- url = f"{WHOLIX_BASE_URL}/api/v1/table-object-data/store-objects"
531
- headers = {"Authorization": f"Bearer {token}"}
532
- body = {"module": module, "action": "store", "data": [out]}
533
- return req(url, method="POST", headers=headers, json_body=body, timeout=(5.0, 30.0))
534
 
535
 
536
  # ======= NEW: Wholix-Excludes paginiert laden (wie in deinem JS) ===========
@@ -847,19 +921,7 @@ def run_pipeline_bg(job_id: str, curl_text: str, n_leads_ui: int):
847
  return s or None
848
 
849
  def norm_url(u: Any) -> Optional[str]:
850
- u = "" if u is None else str(u).strip()
851
- if not u:
852
- return None
853
- if not re.match(r"^[a-zA-Z][a-zA-Z0-9+\-.]*://", u):
854
- u = "https://" + u
855
- try:
856
- from urllib.parse import urlparse
857
- pr = urlparse(u)
858
- if pr.scheme and pr.netloc:
859
- return u
860
- except Exception:
861
- pass
862
- return None
863
 
864
  for i in range(1, n_leads + 1):
865
  try:
@@ -899,7 +961,7 @@ def run_pipeline_bg(job_id: str, curl_text: str, n_leads_ui: int):
899
  "Checkliste_Landingpage": checklist,
900
  "homepage_url": homepage_url, # optional
901
  "tags": raw_tag, # kommt als "[AI]" → später normiert
902
- "Touch_Point": "LinkedIn DM", # optional
903
  }
904
  try:
905
  draft = email_generate_async(token_id, variables, items)
 
11
  - FIX: exclude_hash richtig laden/berücksichtigen (paginierte Wholix-Suche)
12
  - FIX: pro Lead harte Fehlerisolierung -> kein Gradio-Absturz
13
  - NEU: Speichert IMMER in Wholix. Falls keine echte E-Mail vorhanden ist,
14
+ wird eine stabile Platzhalter-Adresse generiert (example.com) und
15
+ ein Tag "no-email" gesetzt.
16
+ - NEU: 422-Robustheit -> progressive Degradation & Minimal-Record-Fallback
17
+ - NEU: URL-Normalisierung (Umlaute/IDNA) für Pfad/Query/Fragment
18
  """
19
 
20
  import json
 
42
 
43
  MAX_LEADS = 100 # Sicherheitskappe
44
  MAX_WORKERS = 16 # gleichzeitige Pipelines am Server
45
+ JOB_TTL_SEC = 60 * 60 # wie lange fertige Jobs im Speicher bleiben (1h)
46
 
47
  # ============================== LOGGING ====================================
48
 
 
150
  return _safe_json(text)
151
  return text
152
 
153
+ # ========================= URL NORMALISIERUNG ==============================
154
+
155
+ from urllib.parse import urlsplit, urlunsplit, quote
156
+
157
+ def _normalize_url(u: Optional[str]) -> Optional[str]:
158
+ """
159
+ Normalisiert URLs für strenge Validatoren:
160
+ - Scheme ergänzen (https)
161
+ - Host -> IDNA (punycode)
162
+ - Path/Query/Fragment UTF-8 percent-encoden
163
+ """
164
+ if not u:
165
+ return None
166
+ u = u.strip()
167
+ if not re.match(r"^[a-zA-Z][a-zA-Z0-9+\-.]*://", u):
168
+ u = "https://" + u
169
+ try:
170
+ parts = urlsplit(u)
171
+ if not parts.scheme or not parts.netloc:
172
+ return None
173
+ # Host zu IDNA
174
+ try:
175
+ host = parts.hostname.encode("idna").decode("ascii") if parts.hostname else ""
176
+ except Exception:
177
+ return None
178
+ netloc = host
179
+ if parts.port:
180
+ netloc += f":{parts.port}"
181
+ if parts.username:
182
+ auth = parts.username
183
+ if parts.password:
184
+ auth += f":{parts.password}"
185
+ netloc = f"{auth}@{netloc}"
186
+ path = quote(parts.path or "", safe="/-._~")
187
+ query = quote(parts.query or "", safe="=&?/-._~")
188
+ fragment = quote(parts.fragment or "", safe="/-._~")
189
+ return urlunsplit((parts.scheme.lower(), netloc.lower(), path, query, fragment))
190
+ except Exception:
191
+ return None
192
+
193
  # ========================= Async Start & Poll ==============================
194
 
195
  def _looks_like_ngrok_html(body: Any) -> bool:
 
452
 
453
  # ===================== Helfer für Platzhalter-E-Mail =======================
454
 
455
+ EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
456
+
457
  def _slug(val: Any, maxlen: int = 24) -> str:
458
  s = "" if val is None else str(val)
459
  s = s.lower().strip()
 
463
 
464
 
465
  def _make_placeholder_email(record: dict) -> str:
466
+ # Stable, valide, kurz
467
  fn = _slug(record.get("firstname"))
468
  ln = _slug(record.get("lastname"))
469
+ base = (fn + "." + ln).strip(".") or "lead"
470
  cid = _slug(record.get("exclude_hash") or uuid.uuid4().hex[:8], maxlen=16)
471
+ local = f"{base}.{cid}"[:48]
472
+ return f"{local}@example.com" # RFC-Reservedomäne, von Validatoren akzeptiert
473
+
474
+ # ===================== Wholix Store + Fallbacks ============================
475
+
476
def _store_with_fallbacks(token: str, payload: dict, module: str) -> dict:
    """
    Store one record in Wholix, degrading the payload step by step on HTTP 422.

    Stages:
      1) the payload as given
      2) the payload with "linkedin_url" / "company_url" passed through
         _normalize_url, or removed when they cannot be normalized
      3) a minimal record (email, firstname, lastname, company_name,
         exclude_hash, tags)

    Stage 2 is skipped when it would resend exactly the payload that stage 1
    already had rejected.

    Args:
        token: Bearer token for the Wholix API.
        payload: Sanitized record to store. It is not modified.
        module: Wholix module name sent in the request body.

    Returns:
        The result of req() for the first POST that succeeds.

    Raises:
        HTTPError: for any status other than 422 in stages 1 and 2, and for
            any error raised by the final stage.
    """
    url = f"{WHOLIX_BASE_URL}/api/v1/table-object-data/store-objects"
    headers = {"Authorization": f"Bearer {token}"}

    def _post(item: dict):
        body = {"module": module, "action": "store", "data": [item]}
        return req(url, method="POST", headers=headers, json_body=body, timeout=(5.0, 30.0))

    # Stage 1: original payload.
    # NOTE(review): relies on HTTPError exposing `.status` - confirm against
    # the HTTPError this file imports/defines.
    try:
        return _post(payload)
    except HTTPError as e:
        if e.status != 422:
            raise

    # Stage 2: repair or drop the URL fields.
    p2 = dict(payload)
    for k in ("linkedin_url", "company_url"):
        if not p2.get(k):
            continue
        fixed = _normalize_url(p2[k])
        if fixed:
            p2[k] = fixed
        else:
            del p2[k]
    if p2 != payload:
        # Only retry when something actually changed; an identical request
        # would just be rejected with 422 again.
        try:
            return _post(p2)
        except HTTPError as e:
            if e.status != 422:
                raise

    # Stage 3: minimal record.
    # NOTE(review): a record with a real e-mail but no tags is tagged
    # "no-email" here - confirm that this is intended.
    minimal = {
        "email": p2.get("email") or _make_placeholder_email(p2),
        "firstname": p2.get("firstname") or "",
        "lastname": p2.get("lastname") or "",
        "company_name": p2.get("company_name") or "",
        "exclude_hash": p2.get("exclude_hash") or _slug(uuid.uuid4().hex[:8]),
        "tags": p2.get("tags") or {"keys": ["no-email"], "values": ["no-email"]},
    }
    return _post(minimal)
522
 
523
 
524
  def wholix_store_contact(token: str, record: dict, module: str = "Contacts", allow_placeholder: bool = True) -> dict:
525
  """
526
  Sendet NUR erlaubte Felder an Wholix und saniert problematische Werte.
527
+ IMMER speicherbar: erzeugt bei Bedarf Placeholder-Mail (example.com) + Tag.
528
+ Nutzt _store_with_fallbacks gegen 422.
 
529
  """
 
 
530
  if not isinstance(record, dict):
531
  raise ValueError("Wholix: record muss ein dict sein.")
532
 
 
538
  return s if s else None
539
 
540
  email = _clean_str(record.get("email"))
541
+ if not email or not EMAIL_RE.match(email):
542
+ if allow_placeholder:
543
+ email = _make_placeholder_email(record)
544
+ record["email"] = email
545
+ if isinstance(record.get("tags"), dict):
546
+ for k in ("keys", "values"):
547
+ record["tags"].setdefault(k, [])
548
+ if "no-email" not in record["tags"][k]:
549
+ record["tags"][k].append("no-email")
550
 
551
  ALLOWED = {
552
  "firstname",
 
581
  v = v[1:-1].strip().strip("'\"")
582
  return _clean_str(v)
583
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
584
  out = {}
585
  for k in ALLOWED:
586
  if k not in record:
 
603
  # Pflichtfeld sicher (jetzt inkl. Platzhalter möglich)
604
  out["email"] = _clean_str(email)
605
 
606
+ # POST mit Fallback-Logik
607
+ return _store_with_fallbacks(token, out, module)
 
 
608
 
609
 
610
  # ======= NEW: Wholix-Excludes paginiert laden (wie in deinem JS) ===========
 
921
  return s or None
922
 
923
  def norm_url(u: Any) -> Optional[str]:
924
+ return _normalize_url(u)
 
 
 
 
 
 
 
 
 
 
 
 
925
 
926
  for i in range(1, n_leads + 1):
927
  try:
 
961
  "Checkliste_Landingpage": checklist,
962
  "homepage_url": homepage_url, # optional
963
  "tags": raw_tag, # kommt als "[AI]" → später normiert
964
+ "Touch_Point": "LinkedIn DM", # optional
965
  }
966
  try:
967
  draft = email_generate_async(token_id, variables, items)