Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -11,8 +11,10 @@ Gradio App – robuste Async-Ausführung via Status-Endpoints
|
|
| 11 |
- FIX: exclude_hash richtig laden/berücksichtigen (paginierte Wholix-Suche)
|
| 12 |
- FIX: pro Lead harte Fehlerisolierung -> kein Gradio-Absturz
|
| 13 |
- NEU: Speichert IMMER in Wholix. Falls keine echte E-Mail vorhanden ist,
|
| 14 |
-
wird eine stabile Platzhalter-Adresse generiert (
|
| 15 |
-
|
|
|
|
|
|
|
| 16 |
"""
|
| 17 |
|
| 18 |
import json
|
|
@@ -40,7 +42,7 @@ WHOLIX_BASE_URL = "https://api.wholix.ai"
|
|
| 40 |
|
| 41 |
MAX_LEADS = 100 # Sicherheitskappe
|
| 42 |
MAX_WORKERS = 16 # gleichzeitige Pipelines am Server
|
| 43 |
-
JOB_TTL_SEC = 60 * 60
|
| 44 |
|
| 45 |
# ============================== LOGGING ====================================
|
| 46 |
|
|
@@ -148,6 +150,46 @@ def req(
|
|
| 148 |
return _safe_json(text)
|
| 149 |
return text
|
| 150 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
# ========================= Async Start & Poll ==============================
|
| 152 |
|
| 153 |
def _looks_like_ngrok_html(body: Any) -> bool:
|
|
@@ -410,6 +452,8 @@ def wholix_login(email: str, password: str) -> str:
|
|
| 410 |
|
| 411 |
# ===================== Helfer für Platzhalter-E-Mail =======================
|
| 412 |
|
|
|
|
|
|
|
| 413 |
def _slug(val: Any, maxlen: int = 24) -> str:
|
| 414 |
s = "" if val is None else str(val)
|
| 415 |
s = s.lower().strip()
|
|
@@ -419,25 +463,70 @@ def _slug(val: Any, maxlen: int = 24) -> str:
|
|
| 419 |
|
| 420 |
|
| 421 |
def _make_placeholder_email(record: dict) -> str:
|
|
|
|
| 422 |
fn = _slug(record.get("firstname"))
|
| 423 |
ln = _slug(record.get("lastname"))
|
| 424 |
-
|
| 425 |
cid = _slug(record.get("exclude_hash") or uuid.uuid4().hex[:8], maxlen=16)
|
| 426 |
-
local = "
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 430 |
|
| 431 |
|
| 432 |
def wholix_store_contact(token: str, record: dict, module: str = "Contacts", allow_placeholder: bool = True) -> dict:
|
| 433 |
"""
|
| 434 |
Sendet NUR erlaubte Felder an Wholix und saniert problematische Werte.
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
damit der Datensatz *immer* gespeichert werden kann.
|
| 438 |
"""
|
| 439 |
-
import urllib.parse as _urlparse
|
| 440 |
-
|
| 441 |
if not isinstance(record, dict):
|
| 442 |
raise ValueError("Wholix: record muss ein dict sein.")
|
| 443 |
|
|
@@ -449,14 +538,15 @@ def wholix_store_contact(token: str, record: dict, module: str = "Contacts", all
|
|
| 449 |
return s if s else None
|
| 450 |
|
| 451 |
email = _clean_str(record.get("email"))
|
| 452 |
-
if not email
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
|
|
|
| 460 |
|
| 461 |
ALLOWED = {
|
| 462 |
"firstname",
|
|
@@ -491,20 +581,6 @@ def wholix_store_contact(token: str, record: dict, module: str = "Contacts", all
|
|
| 491 |
v = v[1:-1].strip().strip("'\"")
|
| 492 |
return _clean_str(v)
|
| 493 |
|
| 494 |
-
def _normalize_url(u):
|
| 495 |
-
u = _clean_str(u)
|
| 496 |
-
if not u:
|
| 497 |
-
return None
|
| 498 |
-
if not re.match(r"^[a-zA-Z][a-zA-Z0-9+\-.]*://", u):
|
| 499 |
-
u = "https://" + u
|
| 500 |
-
try:
|
| 501 |
-
pr = _urlparse.urlparse(u)
|
| 502 |
-
if not pr.scheme or not pr.netloc:
|
| 503 |
-
return None
|
| 504 |
-
return u
|
| 505 |
-
except Exception:
|
| 506 |
-
return None
|
| 507 |
-
|
| 508 |
out = {}
|
| 509 |
for k in ALLOWED:
|
| 510 |
if k not in record:
|
|
@@ -527,10 +603,8 @@ def wholix_store_contact(token: str, record: dict, module: str = "Contacts", all
|
|
| 527 |
# Pflichtfeld sicher (jetzt inkl. Platzhalter möglich)
|
| 528 |
out["email"] = _clean_str(email)
|
| 529 |
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
body = {"module": module, "action": "store", "data": [out]}
|
| 533 |
-
return req(url, method="POST", headers=headers, json_body=body, timeout=(5.0, 30.0))
|
| 534 |
|
| 535 |
|
| 536 |
# ======= NEW: Wholix-Excludes paginiert laden (wie in deinem JS) ===========
|
|
@@ -847,19 +921,7 @@ def run_pipeline_bg(job_id: str, curl_text: str, n_leads_ui: int):
|
|
| 847 |
return s or None
|
| 848 |
|
| 849 |
def norm_url(u: Any) -> Optional[str]:
|
| 850 |
-
|
| 851 |
-
if not u:
|
| 852 |
-
return None
|
| 853 |
-
if not re.match(r"^[a-zA-Z][a-zA-Z0-9+\-.]*://", u):
|
| 854 |
-
u = "https://" + u
|
| 855 |
-
try:
|
| 856 |
-
from urllib.parse import urlparse
|
| 857 |
-
pr = urlparse(u)
|
| 858 |
-
if pr.scheme and pr.netloc:
|
| 859 |
-
return u
|
| 860 |
-
except Exception:
|
| 861 |
-
pass
|
| 862 |
-
return None
|
| 863 |
|
| 864 |
for i in range(1, n_leads + 1):
|
| 865 |
try:
|
|
@@ -899,7 +961,7 @@ def run_pipeline_bg(job_id: str, curl_text: str, n_leads_ui: int):
|
|
| 899 |
"Checkliste_Landingpage": checklist,
|
| 900 |
"homepage_url": homepage_url, # optional
|
| 901 |
"tags": raw_tag, # kommt als "[AI]" → später normiert
|
| 902 |
-
"Touch_Point": "LinkedIn DM",
|
| 903 |
}
|
| 904 |
try:
|
| 905 |
draft = email_generate_async(token_id, variables, items)
|
|
|
|
| 11 |
- FIX: exclude_hash richtig laden/berücksichtigen (paginierte Wholix-Suche)
|
| 12 |
- FIX: pro Lead harte Fehlerisolierung -> kein Gradio-Absturz
|
| 13 |
- NEU: Speichert IMMER in Wholix. Falls keine echte E-Mail vorhanden ist,
|
| 14 |
+
wird eine stabile Platzhalter-Adresse generiert (example.com) und
|
| 15 |
+
ein Tag "no-email" gesetzt.
|
| 16 |
+
- NEU: 422-Robustheit -> progressive Degradation & Minimal-Record-Fallback
|
| 17 |
+
- NEU: URL-Normalisierung (Umlaute/IDNA) für Pfad/Query/Fragment
|
| 18 |
"""
|
| 19 |
|
| 20 |
import json
|
|
|
|
| 42 |
|
| 43 |
MAX_LEADS = 100 # Sicherheitskappe
|
| 44 |
MAX_WORKERS = 16 # gleichzeitige Pipelines am Server
|
| 45 |
+
JOB_TTL_SEC = 60 * 60 # wie lange fertige Jobs im Speicher bleiben (1h)
|
| 46 |
|
| 47 |
# ============================== LOGGING ====================================
|
| 48 |
|
|
|
|
| 150 |
return _safe_json(text)
|
| 151 |
return text
|
| 152 |
|
| 153 |
+
# ========================= URL NORMALISIERUNG ==============================
|
| 154 |
+
|
| 155 |
+
from urllib.parse import urlsplit, urlunsplit, quote
|
| 156 |
+
|
| 157 |
+
def _normalize_url(u: Optional[str]) -> Optional[str]:
|
| 158 |
+
"""
|
| 159 |
+
Normalisiert URLs für strenge Validatoren:
|
| 160 |
+
- Scheme ergänzen (https)
|
| 161 |
+
- Host -> IDNA (punycode)
|
| 162 |
+
- Path/Query/Fragment UTF-8 percent-encoden
|
| 163 |
+
"""
|
| 164 |
+
if not u:
|
| 165 |
+
return None
|
| 166 |
+
u = u.strip()
|
| 167 |
+
if not re.match(r"^[a-zA-Z][a-zA-Z0-9+\-.]*://", u):
|
| 168 |
+
u = "https://" + u
|
| 169 |
+
try:
|
| 170 |
+
parts = urlsplit(u)
|
| 171 |
+
if not parts.scheme or not parts.netloc:
|
| 172 |
+
return None
|
| 173 |
+
# Host zu IDNA
|
| 174 |
+
try:
|
| 175 |
+
host = parts.hostname.encode("idna").decode("ascii") if parts.hostname else ""
|
| 176 |
+
except Exception:
|
| 177 |
+
return None
|
| 178 |
+
netloc = host
|
| 179 |
+
if parts.port:
|
| 180 |
+
netloc += f":{parts.port}"
|
| 181 |
+
if parts.username:
|
| 182 |
+
auth = parts.username
|
| 183 |
+
if parts.password:
|
| 184 |
+
auth += f":{parts.password}"
|
| 185 |
+
netloc = f"{auth}@{netloc}"
|
| 186 |
+
path = quote(parts.path or "", safe="/-._~")
|
| 187 |
+
query = quote(parts.query or "", safe="=&?/-._~")
|
| 188 |
+
fragment = quote(parts.fragment or "", safe="/-._~")
|
| 189 |
+
return urlunsplit((parts.scheme.lower(), netloc.lower(), path, query, fragment))
|
| 190 |
+
except Exception:
|
| 191 |
+
return None
|
| 192 |
+
|
| 193 |
# ========================= Async Start & Poll ==============================
|
| 194 |
|
| 195 |
def _looks_like_ngrok_html(body: Any) -> bool:
|
|
|
|
| 452 |
|
| 453 |
# ===================== Helfer für Platzhalter-E-Mail =======================
|
| 454 |
|
| 455 |
+
EMAIL_RE = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
|
| 456 |
+
|
| 457 |
def _slug(val: Any, maxlen: int = 24) -> str:
|
| 458 |
s = "" if val is None else str(val)
|
| 459 |
s = s.lower().strip()
|
|
|
|
| 463 |
|
| 464 |
|
| 465 |
def _make_placeholder_email(record: dict) -> str:
    """
    Build a short placeholder address under example.com for a record.

    The local part is "<firstname>.<lastname>.<id>" (or "lead.<id>" when no
    name is available), capped at 48 characters. The id is the slugged
    ``exclude_hash`` of the record; when that is missing or slugs to an empty
    string, 8 random hex characters are used instead, so the result is only
    repeatable for records that carry an ``exclude_hash``.
    """
    fn = _slug(record.get("firstname"))
    ln = _slug(record.get("lastname"))
    base = (fn + "." + ln).strip(".") or "lead"

    # NOTE(review): _slug presumably caps its output at maxlen; the slice
    # below enforces the 16-char limit regardless.
    cid = _slug(record.get("exclude_hash") or uuid.uuid4().hex[:8], maxlen=16)[:16]
    if not cid:
        # An id that slugs to nothing would leave a trailing "." in the local part.
        cid = uuid.uuid4().hex[:8]

    # Shorten the name part, never the id: cutting the joined string could
    # drop the id completely and make different leads collide.
    room = 48 - len(cid) - 1
    base = base[:room].rstrip(".") or "lead"
    return f"{base}.{cid}@example.com"  # example.com is reserved for documentation (RFC 2606)
|
| 473 |
+
|
| 474 |
+
# ===================== Wholix Store + Fallbacks ============================
|
| 475 |
+
|
| 476 |
+
def _store_with_fallbacks(token: str, payload: dict, module: str) -> dict:
    """
    Store one record in Wholix, degrading the payload step by step on HTTP 422.

    Attempts, in order:
      1) the payload exactly as given,
      2) a copy whose linkedin_url/company_url went through _normalize_url
         (a URL that cannot be normalized is dropped),
      3) a minimal record (email, firstname, lastname, company_name,
         exclude_hash, tags).

    An HTTPError whose status is not 422 is re-raised immediately during the
    first two attempts; whatever the third attempt raises propagates as is.
    Returns the result of the first successful req() call.
    """
    endpoint = f"{WHOLIX_BASE_URL}/api/v1/table-object-data/store-objects"
    auth_headers = {"Authorization": f"Bearer {token}"}

    def _send(rec):
        # Wrap a single record into the store envelope and POST it.
        envelope = {"module": module, "action": "store", "data": [rec]}
        return req(endpoint, method="POST", headers=auth_headers, json_body=envelope, timeout=(5.0, 30.0))

    # Attempt 1: unchanged payload.
    try:
        return _send(payload)
    except HTTPError as err:
        if err.status != 422:
            raise

    # Attempt 2: repair the URL fields, or remove them when beyond repair.
    repaired = dict(payload)
    for field in ("linkedin_url", "company_url"):
        raw = repaired.get(field)
        if not raw:
            continue
        normalized = _normalize_url(raw)
        if normalized:
            repaired[field] = normalized
        else:
            repaired.pop(field, None)
    try:
        return _send(repaired)
    except HTTPError as err:
        if err.status != 422:
            raise

    # Attempt 3: only the minimal set of fields, with defaults for gaps.
    def _text(key):
        return repaired.get(key) or ""

    minimal = {
        "email": repaired.get("email") or _make_placeholder_email(repaired),
        "firstname": _text("firstname"),
        "lastname": _text("lastname"),
        "company_name": _text("company_name"),
        "exclude_hash": repaired.get("exclude_hash") or _slug(uuid.uuid4().hex[:8]),
        "tags": repaired.get("tags") or {"keys": ["no-email"], "values": ["no-email"]},
    }
    return _send(minimal)
|
| 522 |
|
| 523 |
|
| 524 |
def wholix_store_contact(token: str, record: dict, module: str = "Contacts", allow_placeholder: bool = True) -> dict:
|
| 525 |
"""
|
| 526 |
Sendet NUR erlaubte Felder an Wholix und saniert problematische Werte.
|
| 527 |
+
IMMER speicherbar: erzeugt bei Bedarf Placeholder-Mail (example.com) + Tag.
|
| 528 |
+
Nutzt _store_with_fallbacks gegen 422.
|
|
|
|
| 529 |
"""
|
|
|
|
|
|
|
| 530 |
if not isinstance(record, dict):
|
| 531 |
raise ValueError("Wholix: record muss ein dict sein.")
|
| 532 |
|
|
|
|
| 538 |
return s if s else None
|
| 539 |
|
| 540 |
email = _clean_str(record.get("email"))
|
| 541 |
+
if not email or not EMAIL_RE.match(email):
|
| 542 |
+
if allow_placeholder:
|
| 543 |
+
email = _make_placeholder_email(record)
|
| 544 |
+
record["email"] = email
|
| 545 |
+
if isinstance(record.get("tags"), dict):
|
| 546 |
+
for k in ("keys", "values"):
|
| 547 |
+
record["tags"].setdefault(k, [])
|
| 548 |
+
if "no-email" not in record["tags"][k]:
|
| 549 |
+
record["tags"][k].append("no-email")
|
| 550 |
|
| 551 |
ALLOWED = {
|
| 552 |
"firstname",
|
|
|
|
| 581 |
v = v[1:-1].strip().strip("'\"")
|
| 582 |
return _clean_str(v)
|
| 583 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 584 |
out = {}
|
| 585 |
for k in ALLOWED:
|
| 586 |
if k not in record:
|
|
|
|
| 603 |
# Pflichtfeld sicher (jetzt inkl. Platzhalter möglich)
|
| 604 |
out["email"] = _clean_str(email)
|
| 605 |
|
| 606 |
+
# POST mit Fallback-Logik
|
| 607 |
+
return _store_with_fallbacks(token, out, module)
|
|
|
|
|
|
|
| 608 |
|
| 609 |
|
| 610 |
# ======= NEW: Wholix-Excludes paginiert laden (wie in deinem JS) ===========
|
|
|
|
| 921 |
return s or None
|
| 922 |
|
| 923 |
def norm_url(u: Any) -> Optional[str]:
    # Thin wrapper: the actual normalization is done by _normalize_url.
    normalized = _normalize_url(u)
    return normalized
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 925 |
|
| 926 |
for i in range(1, n_leads + 1):
|
| 927 |
try:
|
|
|
|
| 961 |
"Checkliste_Landingpage": checklist,
|
| 962 |
"homepage_url": homepage_url, # optional
|
| 963 |
"tags": raw_tag, # kommt als "[AI]" → später normiert
|
| 964 |
+
"Touch_Point": "LinkedIn DM", # optional
|
| 965 |
}
|
| 966 |
try:
|
| 967 |
draft = email_generate_async(token_id, variables, items)
|