Lukeetah committed on
Commit
db669b7
·
verified ·
1 Parent(s): fff1f95

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +146 -124
app.py CHANGED
@@ -6,11 +6,9 @@ import time
6
  import smtplib
7
  import random
8
  import asyncio
9
- import mimetypes
10
  from dataclasses import dataclass, asdict
11
  from typing import List, Optional, Dict, Any, Tuple
12
  from email.message import EmailMessage
13
- from pathlib import Path
14
  import urllib.parse as ul
15
 
16
  import httpx
@@ -26,15 +24,25 @@ import gradio as gr
26
  DEFAULT_MAX_USD = 90000
27
  DEFAULT_NEIGHBORHOODS = [
28
  "Saavedra", "Nuñez", "La Lucila", "Florida Oeste", "Munro", "Carapachay",
29
- "Olivos", "Villa Martelli"
30
  ]
31
  DEFAULT_TYPES = ["casa", "ph"] # "casa", "ph"
32
- DEFAULT_MIN_ROOMS = 3 # ambientes (asegura oficina)
33
  REQUIRE_BIDET = True
34
  REQUIRE_PET_FRIENDLY = True
35
- REQUIRE_OUTDOOR = True # patio o terraza
 
 
 
 
 
 
 
 
 
 
36
 
37
- # Microzonas residenciales priorizadas (heurística positiva)
38
  MICROZONAS_PRIORITARIAS = [
39
  "Parque Saavedra", "Parque Sarmiento", "Av. Balbín", "Ruiz Huidobro",
40
  "Lomas de Nuñez", "Cabildo", "Plaza Alberti",
@@ -43,7 +51,7 @@ MICROZONAS_PRIORITARIAS = [
43
  "Ugarte", "San Martín", "Panamericana", "Pelliza", "Melo",
44
  ]
45
 
46
- # Anti-scraping: headers, tiempos, rate limit por dominio
47
  USER_AGENT_POOL = [
48
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
49
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
@@ -54,9 +62,9 @@ REFERER_POOL = ["https://www.google.com/", "https://www.bing.com/", "https://duc
54
  TIMEOUT = httpx.Timeout(25.0, connect=12.0)
55
  RETRIES = 2
56
  BACKOFF_BASE = 0.9
57
- JITTER_RANGE = (0.12, 0.55) # segundos
58
 
59
- # Rate-limit suave por dominio (segundos min entre hits)
60
  DOMAIN_RATE_LIMIT = {
61
  "www.zonaprop.com.ar": 0.6,
62
  "www.argenprop.com": 0.6,
@@ -70,19 +78,16 @@ DOMAIN_RATE_LIMIT = {
70
  "www.buscatucasa.com.ar": 0.8,
71
  }
72
 
73
- # Proxy opcional (definí PROXY_URL en Secrets si tenés pool)
74
  PROXY_URL = os.getenv("PROXY_URL", "").strip()
75
 
76
- # =========================
77
- # Email (configurado vía Secrets)
78
- # =========================
79
  SMTP_HOST = os.getenv("SMTP_HOST", "").strip()
80
  SMTP_PORT = int(os.getenv("SMTP_PORT", "587"))
81
  SMTP_USER = os.getenv("SMTP_USER", "").strip()
82
  SMTP_PASS = os.getenv("SMTP_PASS", "").strip()
83
  SMTP_FROM = os.getenv("SMTP_FROM", SMTP_USER).strip()
84
  SMTP_USE_SSL = os.getenv("SMTP_USE_SSL", "false").lower() in ("1", "true", "yes")
85
-
86
  EMAIL_REGEX = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
87
 
88
  # =========================
@@ -121,7 +126,7 @@ def to_float_price(value: str) -> Optional[float]:
121
  return float(m.group(1)) if m else None
122
  return None
123
 
124
- def extract_int_from_text(text: str, pattern: str) -> Optional[int]:
125
  if not text:
126
  return None
127
  m = re.search(pattern, text)
@@ -158,13 +163,11 @@ def compute_score(lst: Listing, filters: Dict[str, Any]) -> float:
158
  if filters["require_outdoor"] and (lst.has_patio or lst.has_terrace):
159
  score += 1.0
160
  if filters["require_pet"]:
161
- if lst.pet_friendly:
162
- score += 0.6
163
  else:
164
  score += 0.2
165
  if filters["require_bidet"]:
166
- if lst.has_bidet:
167
- score += 0.6
168
  else:
169
  score += 0.2
170
  score += residential_score(lst.address or "", lst.neighborhood or "", lst.description or "")
@@ -195,7 +198,6 @@ async def domain_throttle(domain: str):
195
  if wait > 0:
196
  await asyncio.sleep(wait)
197
  _last_hit[domain] = time.time()
198
- # jitter suave
199
  await asyncio.sleep(random.uniform(*JITTER_RANGE))
200
 
201
  async def fetch(url: str) -> Optional[str]:
@@ -206,41 +208,36 @@ async def fetch(url: str) -> Optional[str]:
206
  try:
207
  async with httpx.AsyncClient(follow_redirects=True, http2=True, proxies=proxies, timeout=TIMEOUT) as client:
208
  r = await client.get(url, headers=make_headers())
209
- # algunos portales sirven HTML con 200 pero bloquean por JS -> intentamos igualmente parsear
210
- if r.status_code == 200 and r.text and len(r.text) > 1000:
211
  return r.text
212
- # backoff
213
  await asyncio.sleep(BACKOFF_BASE * (2 ** i) + random.uniform(0, 0.35))
214
  except Exception:
215
  await asyncio.sleep(BACKOFF_BASE * (2 ** i) + random.uniform(0, 0.35))
216
  return None
217
 
218
  # =========================
219
- # Portal adapters
220
  # =========================
221
 
222
  class Portal:
223
- def __init__(self, domain: str, search_builder, card_hint: Optional[str] = None):
224
  self.domain = domain
225
- self.search_builder = search_builder # fn(neighs, max_usd, types) -> [urls]
226
- self.card_hint = card_hint # texto para filtrar anchors
227
 
228
  def sb_qparam(base: str, param: str = "q"):
229
  def _builder(neighs: List[str], max_usd: int, types: List[str]) -> List[str]:
230
  urls = []
231
- # permutamos consultas para reducir cache y mejorar recall
232
- syn_outdoor = ["patio", "terraza", "exterior", "pulmón"]
233
- syn_pets = ["mascotas", "pet friendly", "apta mascotas"]
234
- syn_rooms = ["3 ambientes", "tres ambientes", ">=3 ambientes"]
235
  for n in neighs:
236
- for o in syn_outdoor[:2]:
237
- for p in syn_pets[:2]:
238
- q = f"{' o '.join(types)} venta {n} hasta {max_usd} dolares {random.choice(syn_rooms)} {o} {p} bidet"
239
- urls.append(f"{base}?{param}={ul.quote(q)}")
240
  return urls
241
  return _builder
242
 
243
- # Portales contemplados (agregar más es trivial)
244
  PORTALS: List[Portal] = [
245
  Portal("www.zonaprop.com.ar", sb_qparam("https://www.zonaprop.com.ar/propiedades.html", "q")),
246
  Portal("www.argenprop.com", sb_qparam("https://www.argenprop.com/propiedades", "text")),
@@ -254,12 +251,12 @@ PORTALS: List[Portal] = [
254
  Portal("www.buscatucasa.com.ar", sb_qparam("https://www.buscatucasa.com.ar/buscar", "q")),
255
  ]
256
 
 
 
 
 
 
257
  def generic_card_extractor(soup: BeautifulSoup, domain: str) -> List[Dict[str, Any]]:
258
- """
259
- Heurística universal:
260
- - Encuentra <a href> del mismo dominio que parezcan links a avisos.
261
- - Extrae texto cercano para precio y zona.
262
- """
263
  anchors = soup.select("a[href]")
264
  seen = set()
265
  cards = []
@@ -267,44 +264,46 @@ def generic_card_extractor(soup: BeautifulSoup, domain: str) -> List[Dict[str, A
267
  href = a.get("href", "")
268
  if not href:
269
  continue
270
- # normalizar a absoluto
271
  if href.startswith("//"):
272
  href = "https:" + href
273
  elif href.startswith("/"):
274
  href = f"https://{domain}{href}"
 
275
  if domain not in href:
276
  continue
 
277
  if any(x in href for x in ["/login", "/perfil", "/ayuda", "/faq", "/favorito", "/mi-cuenta"]):
278
  continue
 
 
 
 
279
  if href in seen:
280
  continue
281
  seen.add(href)
282
 
283
  title = clean_text(a.get_text(" ", strip=True))
 
 
 
 
 
 
284
  parent = a.find_parent()
285
  block_text = clean_text(parent.get_text(" ", strip=True)) if parent else ""
286
- # precio en USD
287
  m = re.search(r"(U\$S|USD|US\$|D[oó]lares?)\s*([\d\.\,]+)", block_text, re.IGNORECASE)
288
  price_text = m.group(0) if m else ""
289
- # address o barrio clave
290
- addr_m = re.search(r"(Saavedra|Nu[eñ]ez|La Lucila|Florida(?: Oeste)?|Munro|Carapachay|Olivos|Martelli)[^|,]*", block_text, re.IGNORECASE)
291
  addr_text = addr_m.group(0) if addr_m else ""
292
 
293
- # Filtros mínimos para evitar ruido
294
- if len(title) < 8:
295
- continue
296
- if not any(x in href.lower() for x in ["propiedad", "inmueble", "inmuebles", "departamento", "casa", "ph", "detalle", "item", "listing", "publicacion", "aviso", "id"]):
297
- # permitir de todos modos: muchos sitios usan slugs
298
- pass
299
-
300
  cards.append({
301
- "title": title,
302
  "link": href,
303
  "price_text": price_text,
304
  "addr_text": addr_text
305
  })
306
- # quedarnos con primeras N tarjetas decentes
307
- return cards[:40]
308
 
309
  async def scrape_search_page(url: str, domain: str) -> List[Listing]:
310
  html = await fetch(url)
@@ -334,8 +333,7 @@ async def scrape_search_page(url: str, domain: str) -> List[Listing]:
334
  async def scrape_portal(portal: Portal, neighborhoods: List[str], max_usd: int, types: List[str]) -> List[Listing]:
335
  urls = portal.search_builder(neighborhoods, max_usd, types)
336
  results: List[Listing] = []
337
- # tomar un subconjunto para diversidad sin abusar
338
- for u in urls[:6]:
339
  try:
340
  res = await scrape_search_page(u, portal.domain)
341
  results.extend(res)
@@ -343,37 +341,37 @@ async def scrape_portal(portal: Portal, neighborhoods: List[str], max_usd: int,
343
  pass
344
  return results
345
 
346
- async def fetch_detail_and_enrich(lst: Listing) -> Listing:
347
  html = await fetch(lst.link)
348
  if not html:
349
  return lst
350
  soup = BeautifulSoup(html, "lxml")
351
 
352
  # Descripción
353
- desc_el = soup.find(["div", "section"], attrs={"class": re.compile(r"(description|descripcion|post|body|texto)")}) or soup.find("p")
354
- if desc_el:
355
- desc = clean_text(desc_el.get_text(" ", strip=True))
356
- else:
357
- desc = clean_text(" ".join(x.get_text(" ", strip=True) for x in soup.find_all(["p", "li"])[:50]))
358
 
359
  # Inferencias
360
  patio, terraza, mascotas, bidet = feature_guess(desc)
361
 
362
  # Características
363
- features_text = " ".join(
364
  el.get_text(" ", strip=True) for el in soup.find_all(["li", "span", "div"])
365
  if el and el.get_text() and any(x in el.get_text().lower() for x in ["ambiente", "dorm", "bañ"])
366
  ).lower()
367
- rooms = extract_int_from_text(features_text, r"(\d+)\s*ambiente")
368
- bathrooms = extract_int_from_text(features_text, r"(\d+)\s*bañ")
369
- bedrooms = extract_int_from_text(features_text, r"(\d+)\s*dorm")
 
 
370
 
371
- # Dirección si aparece
372
  addr_guess = soup.find(attrs={"class": re.compile(r"(address|ubicacion|ubicación|location|inmo-location)")})
373
  if addr_guess and not lst.address:
374
  lst.address = clean_text(addr_guess.get_text(" ", strip=True))[:200]
375
 
376
  lst.description = desc or lst.description
 
377
  lst.has_patio = lst.has_patio if lst.has_patio is not None else patio
378
  lst.has_terrace = lst.has_terrace if lst.has_terrace is not None else terraza
379
  lst.pet_friendly = lst.pet_friendly if lst.pet_friendly is not None else mascotas
@@ -387,15 +385,18 @@ async def fetch_detail_and_enrich(lst: Listing) -> Listing:
387
  # Orquestación
388
  # =========================
389
 
390
- async def run_agent(
391
- neighborhoods: List[str],
392
- max_price_usd: int,
393
- types: List[str],
394
- min_rooms: int,
395
- require_outdoor: bool,
396
- require_bidet: bool,
397
- require_pet: bool
398
- ) -> List[Listing]:
 
 
 
399
  filters = dict(
400
  max_price_usd=max_price_usd,
401
  min_rooms=min_rooms,
@@ -403,24 +404,12 @@ async def run_agent(
403
  require_bidet=require_bidet,
404
  require_pet=require_pet,
405
  )
406
-
407
- # 1) Scrapeo base multi-portal
408
  tasks = [scrape_portal(p, neighborhoods, max_price_usd, types) for p in PORTALS]
409
  batch = await asyncio.gather(*tasks)
410
  listings = [l for sub in batch for l in sub]
411
 
412
- # 2) Deduplicación por link canónico
413
- def canon(url: str) -> str:
414
- # quitar parámetros de tracking
415
- try:
416
- parsed = ul.urlparse(url)
417
- q = ul.parse_qsl(parsed.query)
418
- q = [(k, v) for (k, v) in q if k.lower() not in {"utm_source", "utm_medium", "utm_campaign", "gclid", "s"}]
419
- new_q = ul.urlencode(q, doseq=True)
420
- return ul.urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", new_q, ""))
421
- except Exception:
422
- return url
423
-
424
  seen = set()
425
  unique: List[Listing] = []
426
  for l in listings:
@@ -431,17 +420,16 @@ async def run_agent(
431
  l.link = key
432
  unique.append(l)
433
 
434
- # 3) Enriquecer en paralelo con control de concurrencia
435
  sem = asyncio.Semaphore(8)
436
- async def enrich_guarded(item: Listing):
437
  async with sem:
438
- enriched = await fetch_detail_and_enrich(item)
439
- # pausa suave entre detalles
440
  await asyncio.sleep(random.uniform(*JITTER_RANGE))
441
  return enriched
442
- enriched = await asyncio.gather(*[enrich_guarded(l) for l in unique])
443
 
444
- # 4) Filtros duros
445
  def passes(l: Listing) -> bool:
446
  if l.price_usd is None or l.price_usd > max_price_usd:
447
  return False
@@ -461,11 +449,43 @@ async def run_agent(
461
 
462
  filtered = [l for l in enriched if passes(l)]
463
 
464
- # 5) Scoring y orden
465
  for l in filtered:
466
  l.score = compute_score(l, filters)
467
  filtered.sort(key=lambda x: (-x.score, x.price_usd or 1e9))
468
- return filtered
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
469
 
470
  def listings_to_df(listings: List[Listing]) -> pd.DataFrame:
471
  rows = []
@@ -500,7 +520,7 @@ def build_email(subject: str, sender: str, to_addr: str, body_html: str, attachm
500
  msg["Subject"] = subject
501
  msg["From"] = sender
502
  msg["To"] = to_addr
503
- msg.set_content("Este mensaje tiene una versión HTML y adjuntos.")
504
  msg.add_alternative(body_html, subtype="html")
505
  for filename, content, mimetype in attachments:
506
  maintype, subtype = (mimetype.split("/", 1) if "/" in mimetype else ("application", "octet-stream"))
@@ -521,9 +541,7 @@ def send_email(to_addr: str, subject: str, html_body: str, attachments: List[Tup
521
  server.send_message(msg)
522
  else:
523
  with smtplib.SMTP(SMTP_HOST, SMTP_PORT) as server:
524
- server.ehlo()
525
- server.starttls()
526
- server.ehlo()
527
  server.login(SMTP_USER, SMTP_PASS)
528
  server.send_message(msg)
529
  return "OK"
@@ -536,11 +554,12 @@ def df_to_csv_bytes(df: pd.DataFrame) -> bytes:
536
  def json_to_bytes(obj: Any) -> bytes:
537
  return json.dumps(obj, ensure_ascii=False, indent=2).encode("utf-8")
538
 
539
- def render_summary_html(df: pd.DataFrame, neighborhoods: List[str], max_usd: int, min_rooms: int) -> str:
540
  count = len(df)
541
- head = f"<h2>Resultados de tu búsqueda</h2><p><b>Zonas:</b> {', '.join(neighborhoods)}<br><b>Precio máx.:</b> USD {max_usd}<br><b>Ambientes mín.:</b> {min_rooms}<br><b>Total:</b> {count}</p>"
 
542
  if count == 0:
543
- return head + "<p>No se encontraron resultados con los filtros actuales.</p>"
544
  top_rows = df.sort_values(by=['Score','Precio USD'], ascending=[False, True]).head(12)
545
  items = []
546
  for _, r in top_rows.iterrows():
@@ -548,30 +567,29 @@ def render_summary_html(df: pd.DataFrame, neighborhoods: List[str], max_usd: int
548
  price = f"USD {int(r['Precio USD'])}" if pd.notna(r['Precio USD']) else "USD —"
549
  addr = r.get("Dirección/Área") or ""
550
  items.append(f"<li><b>{r['Título']}</b> — {price} — {addr} — {flags} — <a href='{r['Link']}'>Abrir</a></li>")
551
- return head + "<ol>" + "\n".join(items) + "</ol>"
552
 
553
  # =========================
554
  # UI (Gradio)
555
  # =========================
556
 
557
  DESCRIPTION = """
558
- Meta-buscador multi-portales para casas/PH entre Saavedra y La Lucila y alrededores.
559
- Filtra: USD ≤ 90k, ≥ 3 ambientes (para oficina), patio/terraza, mascotas, bidet (si figura en descripción).
560
- Al terminar, te puede enviar el resumen a tu email con CSV y JSON adjuntos.
 
561
  """
562
 
563
- async def run_and_present(neighs, max_usd, types, min_rooms, req_outdoor, req_bidet, req_pet, email_to, send_email_flag):
564
  neighs_list = [n.strip() for n in str(neighs).split(",") if n.strip()]
565
  types_list = [t.strip().lower() for t in str(types).split(",") if t.strip()]
 
 
566
 
567
- results = await run_agent(
568
- neighborhoods=neighs_list,
569
- max_price_usd=int(max_usd),
570
- types=types_list,
571
- min_rooms=int(min_rooms),
572
- require_outdoor=bool(req_outdoor),
573
- require_bidet=bool(req_bidet),
574
- require_pet=bool(req_pet)
575
  )
576
  df = listings_to_df(results)
577
  json_blob = [asdict(l) for l in results]
@@ -581,7 +599,7 @@ async def run_and_present(neighs, max_usd, types, min_rooms, req_outdoor, req_bi
581
  if not EMAIL_REGEX.match(email_to or ""):
582
  email_status = "Error: email destino inválido."
583
  else:
584
- html = render_summary_html(df, neighs_list, int(max_usd), int(min_rooms))
585
  attachments: List[Tuple[str, bytes, str]] = []
586
  if not df.empty:
587
  attachments.append(("resultados.csv", df_to_csv_bytes(df), "text/csv"))
@@ -594,7 +612,8 @@ async def run_and_present(neighs, max_usd, types, min_rooms, req_outdoor, req_bi
594
  )
595
  email_status = "Enviado" if status == "OK" else status
596
 
597
- return df, json.dumps(json_blob, ensure_ascii=False, indent=2), email_status
 
598
 
599
  with gr.Blocks(title="Meta-buscador Inmuebles Norte BA (≤ USD 90k)") as demo:
600
  gr.Markdown("# Meta-buscador de casas/PH norte BA (≤ 90 000 USD)")
@@ -609,6 +628,7 @@ with gr.Blocks(title="Meta-buscador Inmuebles Norte BA (≤ USD 90k)") as demo:
609
  req_outdoor = gr.Checkbox(label="Requerir patio o terraza", value=REQUIRE_OUTDOOR)
610
  req_bidet = gr.Checkbox(label="Requerir bidet (si aparece en descripción)", value=REQUIRE_BIDET)
611
  req_pet = gr.Checkbox(label="Requerir pet-friendly (si aparece en descripción)", value=REQUIRE_PET_FRIENDLY)
 
612
 
613
  gr.Markdown("### Envío por email al finalizar (opcional)")
614
  with gr.Row():
@@ -618,16 +638,18 @@ with gr.Blocks(title="Meta-buscador Inmuebles Norte BA (≤ USD 90k)") as demo:
618
  btn = gr.Button("Buscar ahora", variant="primary")
619
  with gr.Tabs():
620
  with gr.Tab("Resultados"):
621
- table = gr.Dataframe(interactive=False)
622
  with gr.Tab("JSON"):
623
  j = gr.Code(language="json")
 
 
624
  with gr.Tab("Estado de email"):
625
  status = gr.Markdown("—")
626
 
627
  btn.click(
628
  run_and_present,
629
- inputs=[neighs, max_usd, types, min_rooms, req_outdoor, req_bidet, req_pet, email_to, send_email_flag],
630
- outputs=[table, j, status]
631
  )
632
 
633
  if __name__ == "__main__":
 
6
  import smtplib
7
  import random
8
  import asyncio
 
9
  from dataclasses import dataclass, asdict
10
  from typing import List, Optional, Dict, Any, Tuple
11
  from email.message import EmailMessage
 
12
  import urllib.parse as ul
13
 
14
  import httpx
 
24
  DEFAULT_MAX_USD = 90000
25
  DEFAULT_NEIGHBORHOODS = [
26
  "Saavedra", "Nuñez", "La Lucila", "Florida Oeste", "Munro", "Carapachay",
27
+ "Olivos", "Villa Martelli", "Florida", "Vicente López"
28
  ]
29
  DEFAULT_TYPES = ["casa", "ph"] # "casa", "ph"
30
+ DEFAULT_MIN_ROOMS = 3
31
  REQUIRE_BIDET = True
32
  REQUIRE_PET_FRIENDLY = True
33
+ REQUIRE_OUTDOOR = True
34
+
35
+ # Auto-relajación si no hay resultados (escalonada)
36
+ AUTO_RELAX_ENABLED = True
37
+ RELAX_STEPS = [
38
+ {"require_bidet": False}, # 1) liberar bidet
39
+ {"require_pet": False}, # 2) liberar mascotas
40
+ {"min_rooms": 2}, # 3) bajar ambientes a 2
41
+ {"require_outdoor": False}, # 4) exterior opcional
42
+ {"max_price_usd_delta": 10000}, # 5) subir precio máx. +10k
43
+ ]
44
 
45
+ # Microzonas (boost de score)
46
  MICROZONAS_PRIORITARIAS = [
47
  "Parque Saavedra", "Parque Sarmiento", "Av. Balbín", "Ruiz Huidobro",
48
  "Lomas de Nuñez", "Cabildo", "Plaza Alberti",
 
51
  "Ugarte", "San Martín", "Panamericana", "Pelliza", "Melo",
52
  ]
53
 
54
+ # Anti-scraping
55
  USER_AGENT_POOL = [
56
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
57
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
 
62
  TIMEOUT = httpx.Timeout(25.0, connect=12.0)
63
  RETRIES = 2
64
  BACKOFF_BASE = 0.9
65
+ JITTER_RANGE = (0.13, 0.55) # s
66
 
67
+ # Rate-limit por dominio
68
  DOMAIN_RATE_LIMIT = {
69
  "www.zonaprop.com.ar": 0.6,
70
  "www.argenprop.com": 0.6,
 
78
  "www.buscatucasa.com.ar": 0.8,
79
  }
80
 
81
+ # Proxy opcional (configurable en Secrets)
82
  PROXY_URL = os.getenv("PROXY_URL", "").strip()
83
 
84
+ # Email (configurable en Secrets)
 
 
85
  SMTP_HOST = os.getenv("SMTP_HOST", "").strip()
86
  SMTP_PORT = int(os.getenv("SMTP_PORT", "587"))
87
  SMTP_USER = os.getenv("SMTP_USER", "").strip()
88
  SMTP_PASS = os.getenv("SMTP_PASS", "").strip()
89
  SMTP_FROM = os.getenv("SMTP_FROM", SMTP_USER).strip()
90
  SMTP_USE_SSL = os.getenv("SMTP_USE_SSL", "false").lower() in ("1", "true", "yes")
 
91
  EMAIL_REGEX = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
92
 
93
  # =========================
 
126
  return float(m.group(1)) if m else None
127
  return None
128
 
129
+ def extract_int_from(text: str, pattern: str) -> Optional[int]:
130
  if not text:
131
  return None
132
  m = re.search(pattern, text)
 
163
  if filters["require_outdoor"] and (lst.has_patio or lst.has_terrace):
164
  score += 1.0
165
  if filters["require_pet"]:
166
+ score += 0.6 if lst.pet_friendly else 0.0
 
167
  else:
168
  score += 0.2
169
  if filters["require_bidet"]:
170
+ score += 0.6 if lst.has_bidet else 0.0
 
171
  else:
172
  score += 0.2
173
  score += residential_score(lst.address or "", lst.neighborhood or "", lst.description or "")
 
198
  if wait > 0:
199
  await asyncio.sleep(wait)
200
  _last_hit[domain] = time.time()
 
201
  await asyncio.sleep(random.uniform(*JITTER_RANGE))
202
 
203
  async def fetch(url: str) -> Optional[str]:
 
208
  try:
209
  async with httpx.AsyncClient(follow_redirects=True, http2=True, proxies=proxies, timeout=TIMEOUT) as client:
210
  r = await client.get(url, headers=make_headers())
211
+ # aceptamos HTML corto; algunos portales entregan SSR mínimo
212
+ if r.status_code == 200 and r.text:
213
  return r.text
 
214
  await asyncio.sleep(BACKOFF_BASE * (2 ** i) + random.uniform(0, 0.35))
215
  except Exception:
216
  await asyncio.sleep(BACKOFF_BASE * (2 ** i) + random.uniform(0, 0.35))
217
  return None
218
 
219
  # =========================
220
+ # Portales
221
  # =========================
222
 
223
  class Portal:
224
+ def __init__(self, domain: str, search_builder):
225
  self.domain = domain
226
+ self.search_builder = search_builder # fn(neighs, max_usd, types)->[urls]
 
227
 
228
  def sb_qparam(base: str, param: str = "q"):
229
  def _builder(neighs: List[str], max_usd: int, types: List[str]) -> List[str]:
230
  urls = []
231
+ syn_outdoor = ["patio", "terraza", "exterior"]
232
+ syn_pets = ["mascotas", "pet friendly"]
233
+ rooms_variants = ["3 ambientes", "tres ambientes"]
 
234
  for n in neighs:
235
+ for o in syn_outdoor:
236
+ q = f"{' o '.join(types)} venta {n} hasta {max_usd} dolares {random.choice(rooms_variants)} {o} {random.choice(syn_pets)} bidet"
237
+ urls.append(f"{base}?{param}={ul.quote(q)}")
 
238
  return urls
239
  return _builder
240
 
 
241
  PORTALS: List[Portal] = [
242
  Portal("www.zonaprop.com.ar", sb_qparam("https://www.zonaprop.com.ar/propiedades.html", "q")),
243
  Portal("www.argenprop.com", sb_qparam("https://www.argenprop.com/propiedades", "text")),
 
251
  Portal("www.buscatucasa.com.ar", sb_qparam("https://www.buscatucasa.com.ar/buscar", "q")),
252
  ]
253
 
254
+ ANCHOR_TOKENS = [
255
+ "propiedad", "inmueble", "inmuebles", "departamento", "casa", "ph",
256
+ "detalle", "item", "listing", "publicacion", "aviso", "MLA-"
257
+ ]
258
+
259
  def generic_card_extractor(soup: BeautifulSoup, domain: str) -> List[Dict[str, Any]]:
 
 
 
 
 
260
  anchors = soup.select("a[href]")
261
  seen = set()
262
  cards = []
 
264
  href = a.get("href", "")
265
  if not href:
266
  continue
267
+ # normalizar absoluto
268
  if href.startswith("//"):
269
  href = "https:" + href
270
  elif href.startswith("/"):
271
  href = f"https://{domain}{href}"
272
+ # solo mismo dominio
273
  if domain not in href:
274
  continue
275
+ # filtrar rutas no relevantes
276
  if any(x in href for x in ["/login", "/perfil", "/ayuda", "/faq", "/favorito", "/mi-cuenta"]):
277
  continue
278
+ # heurística de “parece aviso”
279
+ if not any(tok in href.lower() for tok in [t.lower() for t in ANCHOR_TOKENS]):
280
+ continue
281
+ # no duplicados
282
  if href in seen:
283
  continue
284
  seen.add(href)
285
 
286
  title = clean_text(a.get_text(" ", strip=True))
287
+ if len(title) < 8:
288
+ # algunos sitios tienen título en contenedor padre
289
+ parent = a.find_parent()
290
+ if parent:
291
+ title = clean_text(parent.get_text(" ", strip=True))[:160]
292
+ # texto de bloque cercano
293
  parent = a.find_parent()
294
  block_text = clean_text(parent.get_text(" ", strip=True)) if parent else ""
 
295
  m = re.search(r"(U\$S|USD|US\$|D[oó]lares?)\s*([\d\.\,]+)", block_text, re.IGNORECASE)
296
  price_text = m.group(0) if m else ""
297
+ addr_m = re.search(r"(Saavedra|Nu[eñ]ez|La Lucila|Florida(?: Oeste)?|Munro|Carapachay|Olivos|Martelli|Vicente L[oó]pez)[^|,]*", block_text, re.IGNORECASE)
 
298
  addr_text = addr_m.group(0) if addr_m else ""
299
 
 
 
 
 
 
 
 
300
  cards.append({
301
+ "title": title[:160],
302
  "link": href,
303
  "price_text": price_text,
304
  "addr_text": addr_text
305
  })
306
+ return cards[:50]
 
307
 
308
  async def scrape_search_page(url: str, domain: str) -> List[Listing]:
309
  html = await fetch(url)
 
333
  async def scrape_portal(portal: Portal, neighborhoods: List[str], max_usd: int, types: List[str]) -> List[Listing]:
334
  urls = portal.search_builder(neighborhoods, max_usd, types)
335
  results: List[Listing] = []
336
+ for u in urls[:6]: # primeras 6 queries permutadas
 
337
  try:
338
  res = await scrape_search_page(u, portal.domain)
339
  results.extend(res)
 
341
  pass
342
  return results
343
 
344
+ async def enrich_listing(lst: Listing) -> Listing:
345
  html = await fetch(lst.link)
346
  if not html:
347
  return lst
348
  soup = BeautifulSoup(html, "lxml")
349
 
350
  # Descripción
351
+ desc_el = soup.find(["div", "section"], attrs={"class": re.compile(r"(description|descripcion|post|body|texto|descripcion-larga)")}) or soup.find("p")
352
+ desc = clean_text(desc_el.get_text(" ", strip=True)) if desc_el else clean_text(" ".join(x.get_text(" ", strip=True) for x in soup.find_all(["p", "li"])[:60]))
 
 
 
353
 
354
  # Inferencias
355
  patio, terraza, mascotas, bidet = feature_guess(desc)
356
 
357
  # Características
358
+ feat_text = " ".join(
359
  el.get_text(" ", strip=True) for el in soup.find_all(["li", "span", "div"])
360
  if el and el.get_text() and any(x in el.get_text().lower() for x in ["ambiente", "dorm", "bañ"])
361
  ).lower()
362
+ # también mirar el título
363
+ coarse = (lst.title + " " + desc).lower()
364
+ rooms = extract_int_from(feat_text, r"(\d+)\s*ambiente") or extract_int_from(coarse, r"(\d+)\s*amb")
365
+ bathrooms = extract_int_from(feat_text, r"(\d+)\s*bañ") or extract_int_from(coarse, r"(\d+)\s*bañ")
366
+ bedrooms = extract_int_from(feat_text, r"(\d+)\s*dorm") or extract_int_from(coarse, r"(\d+)\s*dormi")
367
 
368
+ # Dirección
369
  addr_guess = soup.find(attrs={"class": re.compile(r"(address|ubicacion|ubicación|location|inmo-location)")})
370
  if addr_guess and not lst.address:
371
  lst.address = clean_text(addr_guess.get_text(" ", strip=True))[:200]
372
 
373
  lst.description = desc or lst.description
374
+ lst.has_patio = lst.has_patro if hasattr(lst, "has_patro") else lst.has_patio # guard
375
  lst.has_patio = lst.has_patio if lst.has_patio is not None else patio
376
  lst.has_terrace = lst.has_terrace if lst.has_terrace is not None else terraza
377
  lst.pet_friendly = lst.pet_friendly if lst.pet_friendly is not None else mascotas
 
385
  # Orquestación
386
  # =========================
387
 
388
def canon(url: str) -> str:
    """Return a canonical form of ``url`` suitable as a de-duplication key.

    Tracking query parameters are removed, the fragment and path parameters
    are dropped, and the host is lower-cased (hosts are case-insensitive), so
    the same page reached through different links maps to one key.

    Args:
        url: Absolute or relative URL.

    Returns:
        The canonical URL, or ``url`` unchanged if it cannot be parsed.
    """
    tracking = {"utm_source", "utm_medium", "utm_campaign", "gclid", "s", "utm_term", "utm_content"}
    try:
        parsed = ul.urlparse(url)
        kept = [(k, v) for k, v in ul.parse_qsl(parsed.query) if k.lower() not in tracking]
        # Lower-case only the host[:port]; any userinfo before '@' is case-sensitive.
        userinfo, at, hostport = parsed.netloc.rpartition("@")
        netloc = f"{userinfo}{at}{hostport.lower()}"
        return ul.urlunparse((parsed.scheme, netloc, parsed.path, "", ul.urlencode(kept, doseq=True), ""))
    except Exception:
        # Deliberate best effort: an unparseable link still serves as its own key.
        return url
397
+
398
+ async def run_agent_once(neighborhoods: List[str], max_price_usd: int, types: List[str],
399
+ min_rooms: int, require_outdoor: bool, require_bidet: bool, require_pet: bool) -> Tuple[List[Listing], str]:
400
  filters = dict(
401
  max_price_usd=max_price_usd,
402
  min_rooms=min_rooms,
 
404
  require_bidet=require_bidet,
405
  require_pet=require_pet,
406
  )
407
+ # 1) Multi-portal
 
408
  tasks = [scrape_portal(p, neighborhoods, max_price_usd, types) for p in PORTALS]
409
  batch = await asyncio.gather(*tasks)
410
  listings = [l for sub in batch for l in sub]
411
 
412
+ # 2) Dedup
 
 
 
 
 
 
 
 
 
 
 
413
  seen = set()
414
  unique: List[Listing] = []
415
  for l in listings:
 
420
  l.link = key
421
  unique.append(l)
422
 
423
+ # 3) Enriquecer
424
  sem = asyncio.Semaphore(8)
425
+ async def guard(item: Listing):
426
  async with sem:
427
+ enriched = await enrich_listing(item)
 
428
  await asyncio.sleep(random.uniform(*JITTER_RANGE))
429
  return enriched
430
+ enriched = await asyncio.gather(*[guard(l) for l in unique])
431
 
432
+ # 4) Filtrar (tolerante: None no bloquea salvo que se exija explícito)
433
  def passes(l: Listing) -> bool:
434
  if l.price_usd is None or l.price_usd > max_price_usd:
435
  return False
 
449
 
450
  filtered = [l for l in enriched if passes(l)]
451
 
452
+ # 5) Score + Orden
453
  for l in filtered:
454
  l.score = compute_score(l, filters)
455
  filtered.sort(key=lambda x: (-x.score, x.price_usd or 1e9))
456
+
457
+ # Trace
458
+ trace = f"Portales: {len(PORTALS)} | Crudos: {len(listings)} | Únicos: {len(unique)} | Enriquecidos: {len(enriched)} | Final: {len(filtered)}"
459
+ return filtered, trace
460
+
461
async def run_agent_with_relax(neighborhoods: List[str], max_price_usd: int, types: List[str],
                               min_rooms: int, require_outdoor: bool, require_bidet: bool, require_pet: bool,
                               auto_relax: bool = True) -> Tuple[List[Listing], List[str]]:
    """Run the search and, if it yields nothing, retry with progressively looser filters.

    The base search runs first. When it returns no listings and ``auto_relax``
    is true, each entry of ``RELAX_STEPS`` is applied in order on top of the
    previous ones (cumulatively) and the search is re-run, stopping at the
    first step that yields results.

    Args:
        neighborhoods: Neighbourhood names to search.
        max_price_usd: Price ceiling for the base search.
        types: Property types to search.
        min_rooms: Minimum room count for the base search.
        require_outdoor: Whether the base search requires patio/terrace.
        require_bidet: Whether the base search requires a bidet.
        require_pet: Whether the base search requires pet-friendliness.
        auto_relax: When false, only the base search is executed.

    Returns:
        ``(results, log)`` where ``log`` has one trace line for the base run and
        two lines (applied filters, trace) per relaxation step attempted.
    """
    log: List[str] = []
    results, trace = await run_agent_once(
        neighborhoods, max_price_usd, types, min_rooms, require_outdoor, require_bidet, require_pet
    )
    log.append(f"[Base] {trace}")
    if results or not auto_relax:
        return results, log

    # Relaxed state carried from step to step, so an earlier relaxation is not
    # silently re-imposed by a later step.
    mr, ro, rb, rp = min_rooms, require_outdoor, require_bidet, require_pet
    price = max_price_usd
    for i, step in enumerate(RELAX_STEPS, 1):
        if "min_rooms" in step:
            # A relaxation must never tighten a value the user already set lower.
            mr = min(mr, step["min_rooms"])
        ro = step.get("require_outdoor", ro)
        rb = step.get("require_bidet", rb)
        rp = step.get("require_pet", rp)
        if "max_price_usd_delta" in step:
            price += step["max_price_usd_delta"]
        log.append(f"[Relax {i}] rooms={mr} outdoor={ro} bidet={rb} pet={rp} price_max=USD {price}")
        results, trace = await run_agent_once(neighborhoods, price, types, mr, ro, rb, rp)
        log.append(f"[Relax {i}] {trace}")
        if results:
            break
    return results, log
489
 
490
  def listings_to_df(listings: List[Listing]) -> pd.DataFrame:
491
  rows = []
 
520
  msg["Subject"] = subject
521
  msg["From"] = sender
522
  msg["To"] = to_addr
523
+ msg.set_content("Este mensaje tiene versión HTML y adjuntos.")
524
  msg.add_alternative(body_html, subtype="html")
525
  for filename, content, mimetype in attachments:
526
  maintype, subtype = (mimetype.split("/", 1) if "/" in mimetype else ("application", "octet-stream"))
 
541
  server.send_message(msg)
542
  else:
543
  with smtplib.SMTP(SMTP_HOST, SMTP_PORT) as server:
544
+ server.ehlo(); server.starttls(); server.ehlo()
 
 
545
  server.login(SMTP_USER, SMTP_PASS)
546
  server.send_message(msg)
547
  return "OK"
 
554
  def json_to_bytes(obj: Any) -> bytes:
555
  return json.dumps(obj, ensure_ascii=False, indent=2).encode("utf-8")
556
 
557
+ def render_summary_html(df: pd.DataFrame, neighborhoods: List[str], max_usd: int, min_rooms: int, relax_log: List[str]) -> str:
558
  count = len(df)
559
+ head = f"<h2>Resultados</h2><p><b>Zonas:</b> {', '.join(neighborhoods)}<br><b>Precio máx.:</b> USD {max_usd}<br><b>Ambientes mín.:</b> {min_rooms}<br><b>Total:</b> {count}</p>"
560
+ trace = "<pre style='white-space:pre-wrap;font-size:12px;opacity:.85;'>" + "\n".join(relax_log) + "</pre>"
561
  if count == 0:
562
+ return head + "<p>No se encontraron resultados con los filtros actuales.</p>" + trace
563
  top_rows = df.sort_values(by=['Score','Precio USD'], ascending=[False, True]).head(12)
564
  items = []
565
  for _, r in top_rows.iterrows():
 
567
  price = f"USD {int(r['Precio USD'])}" if pd.notna(r['Precio USD']) else "USD —"
568
  addr = r.get("Dirección/Área") or ""
569
  items.append(f"<li><b>{r['Título']}</b> — {price} — {addr} — {flags} — <a href='{r['Link']}'>Abrir</a></li>")
570
+ return head + "<ol>" + "\n".join(items) + "</ol>" + trace
571
 
572
  # =========================
573
  # UI (Gradio)
574
  # =========================
575
 
576
  DESCRIPTION = """
577
+ Meta-buscador multi-portales para casas/PH entre Saavedra y La Lucila y alrededores.
578
+ • Filtros: USD ≤ 90k, ≥ 3 ambientes, patio/terraza, mascotas, bidet (si figura en descripción).
579
+ Anti-scraping: headers rotativos, referers, HTTP/2, rate limit con jitter, reintentos con backoff.
580
+ • Si no hay resultados, activa auto-relajación escalonada (configurable) y documenta los pasos.
581
  """
582
 
583
+ async def run_and_present(neighs, max_usd, types, min_rooms, req_outdoor, req_bidet, req_pet, auto_relax, email_to, send_email_flag):
584
  neighs_list = [n.strip() for n in str(neighs).split(",") if n.strip()]
585
  types_list = [t.strip().lower() for t in str(types).split(",") if t.strip()]
586
+ max_usd = int(max_usd); min_rooms = int(min_rooms)
587
+ req_outdoor = bool(req_outdoor); req_bidet = bool(req_bidet); req_pet = bool(req_pet); auto_relax = bool(auto_relax)
588
 
589
+ results, relax_log = await run_agent_with_relax(
590
+ neighborhoods=neighs_list, max_price_usd=max_usd, types=types_list,
591
+ min_rooms=min_rooms, require_outdoor=req_outdoor, require_bidet=req_bidet, require_pet=req_pet,
592
+ auto_relax=auto_relax
 
 
 
 
593
  )
594
  df = listings_to_df(results)
595
  json_blob = [asdict(l) for l in results]
 
599
  if not EMAIL_REGEX.match(email_to or ""):
600
  email_status = "Error: email destino inválido."
601
  else:
602
+ html = render_summary_html(df, neighs_list, max_usd, min_rooms, relax_log)
603
  attachments: List[Tuple[str, bytes, str]] = []
604
  if not df.empty:
605
  attachments.append(("resultados.csv", df_to_csv_bytes(df), "text/csv"))
 
612
  )
613
  email_status = "Enviado" if status == "OK" else status
614
 
615
+ # Mostrar log en la pestaña de estado
616
+ return df, json.dumps(json_blob, ensure_ascii=False, indent=2), " | ".join(relax_log), email_status
617
 
618
  with gr.Blocks(title="Meta-buscador Inmuebles Norte BA (≤ USD 90k)") as demo:
619
  gr.Markdown("# Meta-buscador de casas/PH norte BA (≤ 90 000 USD)")
 
628
  req_outdoor = gr.Checkbox(label="Requerir patio o terraza", value=REQUIRE_OUTDOOR)
629
  req_bidet = gr.Checkbox(label="Requerir bidet (si aparece en descripción)", value=REQUIRE_BIDET)
630
  req_pet = gr.Checkbox(label="Requerir pet-friendly (si aparece en descripción)", value=REQUIRE_PET_FRIENDLY)
631
+ auto_relax = gr.Checkbox(label="Auto-relajar si no hay resultados", value=AUTO_RELAX_ENABLED)
632
 
633
  gr.Markdown("### Envío por email al finalizar (opcional)")
634
  with gr.Row():
 
638
  btn = gr.Button("Buscar ahora", variant="primary")
639
  with gr.Tabs():
640
  with gr.Tab("Resultados"):
641
+ table = gr.Dataframe(interactive=False) # sin args raros
642
  with gr.Tab("JSON"):
643
  j = gr.Code(language="json")
644
+ with gr.Tab("Estado"):
645
+ trace = gr.Markdown("—")
646
  with gr.Tab("Estado de email"):
647
  status = gr.Markdown("—")
648
 
649
  btn.click(
650
  run_and_present,
651
+ inputs=[neighs, max_usd, types, min_rooms, req_outdoor, req_bidet, req_pet, auto_relax, email_to, send_email_flag],
652
+ outputs=[table, j, trace, status]
653
  )
654
 
655
  if __name__ == "__main__":