Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -6,11 +6,9 @@ import time
|
|
| 6 |
import smtplib
|
| 7 |
import random
|
| 8 |
import asyncio
|
| 9 |
-
import mimetypes
|
| 10 |
from dataclasses import dataclass, asdict
|
| 11 |
from typing import List, Optional, Dict, Any, Tuple
|
| 12 |
from email.message import EmailMessage
|
| 13 |
-
from pathlib import Path
|
| 14 |
import urllib.parse as ul
|
| 15 |
|
| 16 |
import httpx
|
|
@@ -26,15 +24,25 @@ import gradio as gr
|
|
| 26 |
DEFAULT_MAX_USD = 90000
|
| 27 |
DEFAULT_NEIGHBORHOODS = [
|
| 28 |
"Saavedra", "Nuñez", "La Lucila", "Florida Oeste", "Munro", "Carapachay",
|
| 29 |
-
"Olivos", "Villa Martelli"
|
| 30 |
]
|
| 31 |
DEFAULT_TYPES = ["casa", "ph"] # "casa", "ph"
|
| 32 |
-
DEFAULT_MIN_ROOMS = 3
|
| 33 |
REQUIRE_BIDET = True
|
| 34 |
REQUIRE_PET_FRIENDLY = True
|
| 35 |
-
REQUIRE_OUTDOOR = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
-
# Microzonas
|
| 38 |
MICROZONAS_PRIORITARIAS = [
|
| 39 |
"Parque Saavedra", "Parque Sarmiento", "Av. Balbín", "Ruiz Huidobro",
|
| 40 |
"Lomas de Nuñez", "Cabildo", "Plaza Alberti",
|
|
@@ -43,7 +51,7 @@ MICROZONAS_PRIORITARIAS = [
|
|
| 43 |
"Ugarte", "San Martín", "Panamericana", "Pelliza", "Melo",
|
| 44 |
]
|
| 45 |
|
| 46 |
-
# Anti-scraping
|
| 47 |
USER_AGENT_POOL = [
|
| 48 |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
|
| 49 |
"Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
|
|
@@ -54,9 +62,9 @@ REFERER_POOL = ["https://www.google.com/", "https://www.bing.com/", "https://duc
|
|
| 54 |
TIMEOUT = httpx.Timeout(25.0, connect=12.0)
|
| 55 |
RETRIES = 2
|
| 56 |
BACKOFF_BASE = 0.9
|
| 57 |
-
JITTER_RANGE = (0.
|
| 58 |
|
| 59 |
-
# Rate-limit
|
| 60 |
DOMAIN_RATE_LIMIT = {
|
| 61 |
"www.zonaprop.com.ar": 0.6,
|
| 62 |
"www.argenprop.com": 0.6,
|
|
@@ -70,19 +78,16 @@ DOMAIN_RATE_LIMIT = {
|
|
| 70 |
"www.buscatucasa.com.ar": 0.8,
|
| 71 |
}
|
| 72 |
|
| 73 |
-
# Proxy opcional (
|
| 74 |
PROXY_URL = os.getenv("PROXY_URL", "").strip()
|
| 75 |
|
| 76 |
-
#
|
| 77 |
-
# Email (configurado vía Secrets)
|
| 78 |
-
# =========================
|
| 79 |
SMTP_HOST = os.getenv("SMTP_HOST", "").strip()
|
| 80 |
SMTP_PORT = int(os.getenv("SMTP_PORT", "587"))
|
| 81 |
SMTP_USER = os.getenv("SMTP_USER", "").strip()
|
| 82 |
SMTP_PASS = os.getenv("SMTP_PASS", "").strip()
|
| 83 |
SMTP_FROM = os.getenv("SMTP_FROM", SMTP_USER).strip()
|
| 84 |
SMTP_USE_SSL = os.getenv("SMTP_USE_SSL", "false").lower() in ("1", "true", "yes")
|
| 85 |
-
|
| 86 |
EMAIL_REGEX = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
|
| 87 |
|
| 88 |
# =========================
|
|
@@ -121,7 +126,7 @@ def to_float_price(value: str) -> Optional[float]:
|
|
| 121 |
return float(m.group(1)) if m else None
|
| 122 |
return None
|
| 123 |
|
| 124 |
-
def
|
| 125 |
if not text:
|
| 126 |
return None
|
| 127 |
m = re.search(pattern, text)
|
|
@@ -158,13 +163,11 @@ def compute_score(lst: Listing, filters: Dict[str, Any]) -> float:
|
|
| 158 |
if filters["require_outdoor"] and (lst.has_patio or lst.has_terrace):
|
| 159 |
score += 1.0
|
| 160 |
if filters["require_pet"]:
|
| 161 |
-
if lst.pet_friendly
|
| 162 |
-
score += 0.6
|
| 163 |
else:
|
| 164 |
score += 0.2
|
| 165 |
if filters["require_bidet"]:
|
| 166 |
-
if lst.has_bidet
|
| 167 |
-
score += 0.6
|
| 168 |
else:
|
| 169 |
score += 0.2
|
| 170 |
score += residential_score(lst.address or "", lst.neighborhood or "", lst.description or "")
|
|
@@ -195,7 +198,6 @@ async def domain_throttle(domain: str):
|
|
| 195 |
if wait > 0:
|
| 196 |
await asyncio.sleep(wait)
|
| 197 |
_last_hit[domain] = time.time()
|
| 198 |
-
# jitter suave
|
| 199 |
await asyncio.sleep(random.uniform(*JITTER_RANGE))
|
| 200 |
|
| 201 |
async def fetch(url: str) -> Optional[str]:
|
|
@@ -206,41 +208,36 @@ async def fetch(url: str) -> Optional[str]:
|
|
| 206 |
try:
|
| 207 |
async with httpx.AsyncClient(follow_redirects=True, http2=True, proxies=proxies, timeout=TIMEOUT) as client:
|
| 208 |
r = await client.get(url, headers=make_headers())
|
| 209 |
-
#
|
| 210 |
-
if r.status_code == 200 and r.text
|
| 211 |
return r.text
|
| 212 |
-
# backoff
|
| 213 |
await asyncio.sleep(BACKOFF_BASE * (2 ** i) + random.uniform(0, 0.35))
|
| 214 |
except Exception:
|
| 215 |
await asyncio.sleep(BACKOFF_BASE * (2 ** i) + random.uniform(0, 0.35))
|
| 216 |
return None
|
| 217 |
|
| 218 |
# =========================
|
| 219 |
-
#
|
| 220 |
# =========================
|
| 221 |
|
| 222 |
class Portal:
|
| 223 |
-
def __init__(self, domain: str, search_builder
|
| 224 |
self.domain = domain
|
| 225 |
-
self.search_builder = search_builder # fn(neighs, max_usd, types)
|
| 226 |
-
self.card_hint = card_hint # texto para filtrar anchors
|
| 227 |
|
| 228 |
def sb_qparam(base: str, param: str = "q"):
|
| 229 |
def _builder(neighs: List[str], max_usd: int, types: List[str]) -> List[str]:
|
| 230 |
urls = []
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
syn_rooms = ["3 ambientes", "tres ambientes", ">=3 ambientes"]
|
| 235 |
for n in neighs:
|
| 236 |
-
for o in syn_outdoor
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
urls.append(f"{base}?{param}={ul.quote(q)}")
|
| 240 |
return urls
|
| 241 |
return _builder
|
| 242 |
|
| 243 |
-
# Portales contemplados (agregar más es trivial)
|
| 244 |
PORTALS: List[Portal] = [
|
| 245 |
Portal("www.zonaprop.com.ar", sb_qparam("https://www.zonaprop.com.ar/propiedades.html", "q")),
|
| 246 |
Portal("www.argenprop.com", sb_qparam("https://www.argenprop.com/propiedades", "text")),
|
|
@@ -254,12 +251,12 @@ PORTALS: List[Portal] = [
|
|
| 254 |
Portal("www.buscatucasa.com.ar", sb_qparam("https://www.buscatucasa.com.ar/buscar", "q")),
|
| 255 |
]
|
| 256 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
def generic_card_extractor(soup: BeautifulSoup, domain: str) -> List[Dict[str, Any]]:
|
| 258 |
-
"""
|
| 259 |
-
Heurística universal:
|
| 260 |
-
- Encuentra <a href> del mismo dominio que parezcan links a avisos.
|
| 261 |
-
- Extrae texto cercano para precio y zona.
|
| 262 |
-
"""
|
| 263 |
anchors = soup.select("a[href]")
|
| 264 |
seen = set()
|
| 265 |
cards = []
|
|
@@ -267,44 +264,46 @@ def generic_card_extractor(soup: BeautifulSoup, domain: str) -> List[Dict[str, A
|
|
| 267 |
href = a.get("href", "")
|
| 268 |
if not href:
|
| 269 |
continue
|
| 270 |
-
# normalizar
|
| 271 |
if href.startswith("//"):
|
| 272 |
href = "https:" + href
|
| 273 |
elif href.startswith("/"):
|
| 274 |
href = f"https://{domain}{href}"
|
|
|
|
| 275 |
if domain not in href:
|
| 276 |
continue
|
|
|
|
| 277 |
if any(x in href for x in ["/login", "/perfil", "/ayuda", "/faq", "/favorito", "/mi-cuenta"]):
|
| 278 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
if href in seen:
|
| 280 |
continue
|
| 281 |
seen.add(href)
|
| 282 |
|
| 283 |
title = clean_text(a.get_text(" ", strip=True))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
parent = a.find_parent()
|
| 285 |
block_text = clean_text(parent.get_text(" ", strip=True)) if parent else ""
|
| 286 |
-
# precio en USD
|
| 287 |
m = re.search(r"(U\$S|USD|US\$|D[oó]lares?)\s*([\d\.\,]+)", block_text, re.IGNORECASE)
|
| 288 |
price_text = m.group(0) if m else ""
|
| 289 |
-
|
| 290 |
-
addr_m = re.search(r"(Saavedra|Nu[eñ]ez|La Lucila|Florida(?: Oeste)?|Munro|Carapachay|Olivos|Martelli)[^|,]*", block_text, re.IGNORECASE)
|
| 291 |
addr_text = addr_m.group(0) if addr_m else ""
|
| 292 |
|
| 293 |
-
# Filtros mínimos para evitar ruido
|
| 294 |
-
if len(title) < 8:
|
| 295 |
-
continue
|
| 296 |
-
if not any(x in href.lower() for x in ["propiedad", "inmueble", "inmuebles", "departamento", "casa", "ph", "detalle", "item", "listing", "publicacion", "aviso", "id"]):
|
| 297 |
-
# permitir de todos modos: muchos sitios usan slugs
|
| 298 |
-
pass
|
| 299 |
-
|
| 300 |
cards.append({
|
| 301 |
-
"title": title,
|
| 302 |
"link": href,
|
| 303 |
"price_text": price_text,
|
| 304 |
"addr_text": addr_text
|
| 305 |
})
|
| 306 |
-
|
| 307 |
-
return cards[:40]
|
| 308 |
|
| 309 |
async def scrape_search_page(url: str, domain: str) -> List[Listing]:
|
| 310 |
html = await fetch(url)
|
|
@@ -334,8 +333,7 @@ async def scrape_search_page(url: str, domain: str) -> List[Listing]:
|
|
| 334 |
async def scrape_portal(portal: Portal, neighborhoods: List[str], max_usd: int, types: List[str]) -> List[Listing]:
|
| 335 |
urls = portal.search_builder(neighborhoods, max_usd, types)
|
| 336 |
results: List[Listing] = []
|
| 337 |
-
|
| 338 |
-
for u in urls[:6]:
|
| 339 |
try:
|
| 340 |
res = await scrape_search_page(u, portal.domain)
|
| 341 |
results.extend(res)
|
|
@@ -343,37 +341,37 @@ async def scrape_portal(portal: Portal, neighborhoods: List[str], max_usd: int,
|
|
| 343 |
pass
|
| 344 |
return results
|
| 345 |
|
| 346 |
-
async def
|
| 347 |
html = await fetch(lst.link)
|
| 348 |
if not html:
|
| 349 |
return lst
|
| 350 |
soup = BeautifulSoup(html, "lxml")
|
| 351 |
|
| 352 |
# Descripción
|
| 353 |
-
desc_el = soup.find(["div", "section"], attrs={"class": re.compile(r"(description|descripcion|post|body|texto)")}) or soup.find("p")
|
| 354 |
-
if desc_el:
|
| 355 |
-
desc = clean_text(desc_el.get_text(" ", strip=True))
|
| 356 |
-
else:
|
| 357 |
-
desc = clean_text(" ".join(x.get_text(" ", strip=True) for x in soup.find_all(["p", "li"])[:50]))
|
| 358 |
|
| 359 |
# Inferencias
|
| 360 |
patio, terraza, mascotas, bidet = feature_guess(desc)
|
| 361 |
|
| 362 |
# Características
|
| 363 |
-
|
| 364 |
el.get_text(" ", strip=True) for el in soup.find_all(["li", "span", "div"])
|
| 365 |
if el and el.get_text() and any(x in el.get_text().lower() for x in ["ambiente", "dorm", "bañ"])
|
| 366 |
).lower()
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
|
|
|
|
|
|
|
| 370 |
|
| 371 |
-
# Dirección
|
| 372 |
addr_guess = soup.find(attrs={"class": re.compile(r"(address|ubicacion|ubicación|location|inmo-location)")})
|
| 373 |
if addr_guess and not lst.address:
|
| 374 |
lst.address = clean_text(addr_guess.get_text(" ", strip=True))[:200]
|
| 375 |
|
| 376 |
lst.description = desc or lst.description
|
|
|
|
| 377 |
lst.has_patio = lst.has_patio if lst.has_patio is not None else patio
|
| 378 |
lst.has_terrace = lst.has_terrace if lst.has_terrace is not None else terraza
|
| 379 |
lst.pet_friendly = lst.pet_friendly if lst.pet_friendly is not None else mascotas
|
|
@@ -387,15 +385,18 @@ async def fetch_detail_and_enrich(lst: Listing) -> Listing:
|
|
| 387 |
# Orquestación
|
| 388 |
# =========================
|
| 389 |
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
|
|
|
|
|
|
|
|
|
| 399 |
filters = dict(
|
| 400 |
max_price_usd=max_price_usd,
|
| 401 |
min_rooms=min_rooms,
|
|
@@ -403,24 +404,12 @@ async def run_agent(
|
|
| 403 |
require_bidet=require_bidet,
|
| 404 |
require_pet=require_pet,
|
| 405 |
)
|
| 406 |
-
|
| 407 |
-
# 1) Scrapeo base multi-portal
|
| 408 |
tasks = [scrape_portal(p, neighborhoods, max_price_usd, types) for p in PORTALS]
|
| 409 |
batch = await asyncio.gather(*tasks)
|
| 410 |
listings = [l for sub in batch for l in sub]
|
| 411 |
|
| 412 |
-
# 2)
|
| 413 |
-
def canon(url: str) -> str:
|
| 414 |
-
# quitar parámetros de tracking
|
| 415 |
-
try:
|
| 416 |
-
parsed = ul.urlparse(url)
|
| 417 |
-
q = ul.parse_qsl(parsed.query)
|
| 418 |
-
q = [(k, v) for (k, v) in q if k.lower() not in {"utm_source", "utm_medium", "utm_campaign", "gclid", "s"}]
|
| 419 |
-
new_q = ul.urlencode(q, doseq=True)
|
| 420 |
-
return ul.urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", new_q, ""))
|
| 421 |
-
except Exception:
|
| 422 |
-
return url
|
| 423 |
-
|
| 424 |
seen = set()
|
| 425 |
unique: List[Listing] = []
|
| 426 |
for l in listings:
|
|
@@ -431,17 +420,16 @@ async def run_agent(
|
|
| 431 |
l.link = key
|
| 432 |
unique.append(l)
|
| 433 |
|
| 434 |
-
# 3) Enriquecer
|
| 435 |
sem = asyncio.Semaphore(8)
|
| 436 |
-
async def
|
| 437 |
async with sem:
|
| 438 |
-
enriched = await
|
| 439 |
-
# pausa suave entre detalles
|
| 440 |
await asyncio.sleep(random.uniform(*JITTER_RANGE))
|
| 441 |
return enriched
|
| 442 |
-
enriched = await asyncio.gather(*[
|
| 443 |
|
| 444 |
-
# 4)
|
| 445 |
def passes(l: Listing) -> bool:
|
| 446 |
if l.price_usd is None or l.price_usd > max_price_usd:
|
| 447 |
return False
|
|
@@ -461,11 +449,43 @@ async def run_agent(
|
|
| 461 |
|
| 462 |
filtered = [l for l in enriched if passes(l)]
|
| 463 |
|
| 464 |
-
# 5)
|
| 465 |
for l in filtered:
|
| 466 |
l.score = compute_score(l, filters)
|
| 467 |
filtered.sort(key=lambda x: (-x.score, x.price_usd or 1e9))
|
| 468 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 469 |
|
| 470 |
def listings_to_df(listings: List[Listing]) -> pd.DataFrame:
|
| 471 |
rows = []
|
|
@@ -500,7 +520,7 @@ def build_email(subject: str, sender: str, to_addr: str, body_html: str, attachm
|
|
| 500 |
msg["Subject"] = subject
|
| 501 |
msg["From"] = sender
|
| 502 |
msg["To"] = to_addr
|
| 503 |
-
msg.set_content("Este mensaje tiene
|
| 504 |
msg.add_alternative(body_html, subtype="html")
|
| 505 |
for filename, content, mimetype in attachments:
|
| 506 |
maintype, subtype = (mimetype.split("/", 1) if "/" in mimetype else ("application", "octet-stream"))
|
|
@@ -521,9 +541,7 @@ def send_email(to_addr: str, subject: str, html_body: str, attachments: List[Tup
|
|
| 521 |
server.send_message(msg)
|
| 522 |
else:
|
| 523 |
with smtplib.SMTP(SMTP_HOST, SMTP_PORT) as server:
|
| 524 |
-
server.ehlo()
|
| 525 |
-
server.starttls()
|
| 526 |
-
server.ehlo()
|
| 527 |
server.login(SMTP_USER, SMTP_PASS)
|
| 528 |
server.send_message(msg)
|
| 529 |
return "OK"
|
|
@@ -536,11 +554,12 @@ def df_to_csv_bytes(df: pd.DataFrame) -> bytes:
|
|
| 536 |
def json_to_bytes(obj: Any) -> bytes:
|
| 537 |
return json.dumps(obj, ensure_ascii=False, indent=2).encode("utf-8")
|
| 538 |
|
| 539 |
-
def render_summary_html(df: pd.DataFrame, neighborhoods: List[str], max_usd: int, min_rooms: int) -> str:
|
| 540 |
count = len(df)
|
| 541 |
-
head = f"<h2>Resultados
|
|
|
|
| 542 |
if count == 0:
|
| 543 |
-
return head + "<p>No se encontraron resultados con los filtros actuales.</p>"
|
| 544 |
top_rows = df.sort_values(by=['Score','Precio USD'], ascending=[False, True]).head(12)
|
| 545 |
items = []
|
| 546 |
for _, r in top_rows.iterrows():
|
|
@@ -548,30 +567,29 @@ def render_summary_html(df: pd.DataFrame, neighborhoods: List[str], max_usd: int
|
|
| 548 |
price = f"USD {int(r['Precio USD'])}" if pd.notna(r['Precio USD']) else "USD —"
|
| 549 |
addr = r.get("Dirección/Área") or ""
|
| 550 |
items.append(f"<li><b>{r['Título']}</b> — {price} — {addr} — {flags} — <a href='{r['Link']}'>Abrir</a></li>")
|
| 551 |
-
return head + "<ol>" + "\n".join(items) + "</ol>"
|
| 552 |
|
| 553 |
# =========================
|
| 554 |
# UI (Gradio)
|
| 555 |
# =========================
|
| 556 |
|
| 557 |
DESCRIPTION = """
|
| 558 |
-
Meta-buscador multi-portales para casas/PH entre Saavedra y La Lucila y alrededores.
|
| 559 |
-
|
| 560 |
-
|
|
|
|
| 561 |
"""
|
| 562 |
|
| 563 |
-
async def run_and_present(neighs, max_usd, types, min_rooms, req_outdoor, req_bidet, req_pet, email_to, send_email_flag):
|
| 564 |
neighs_list = [n.strip() for n in str(neighs).split(",") if n.strip()]
|
| 565 |
types_list = [t.strip().lower() for t in str(types).split(",") if t.strip()]
|
|
|
|
|
|
|
| 566 |
|
| 567 |
-
results = await
|
| 568 |
-
neighborhoods=neighs_list,
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
min_rooms=int(min_rooms),
|
| 572 |
-
require_outdoor=bool(req_outdoor),
|
| 573 |
-
require_bidet=bool(req_bidet),
|
| 574 |
-
require_pet=bool(req_pet)
|
| 575 |
)
|
| 576 |
df = listings_to_df(results)
|
| 577 |
json_blob = [asdict(l) for l in results]
|
|
@@ -581,7 +599,7 @@ async def run_and_present(neighs, max_usd, types, min_rooms, req_outdoor, req_bi
|
|
| 581 |
if not EMAIL_REGEX.match(email_to or ""):
|
| 582 |
email_status = "Error: email destino inválido."
|
| 583 |
else:
|
| 584 |
-
html = render_summary_html(df, neighs_list,
|
| 585 |
attachments: List[Tuple[str, bytes, str]] = []
|
| 586 |
if not df.empty:
|
| 587 |
attachments.append(("resultados.csv", df_to_csv_bytes(df), "text/csv"))
|
|
@@ -594,7 +612,8 @@ async def run_and_present(neighs, max_usd, types, min_rooms, req_outdoor, req_bi
|
|
| 594 |
)
|
| 595 |
email_status = "Enviado" if status == "OK" else status
|
| 596 |
|
| 597 |
-
|
|
|
|
| 598 |
|
| 599 |
with gr.Blocks(title="Meta-buscador Inmuebles Norte BA (≤ USD 90k)") as demo:
|
| 600 |
gr.Markdown("# Meta-buscador de casas/PH norte BA (≤ 90 000 USD)")
|
|
@@ -609,6 +628,7 @@ with gr.Blocks(title="Meta-buscador Inmuebles Norte BA (≤ USD 90k)") as demo:
|
|
| 609 |
req_outdoor = gr.Checkbox(label="Requerir patio o terraza", value=REQUIRE_OUTDOOR)
|
| 610 |
req_bidet = gr.Checkbox(label="Requerir bidet (si aparece en descripción)", value=REQUIRE_BIDET)
|
| 611 |
req_pet = gr.Checkbox(label="Requerir pet-friendly (si aparece en descripción)", value=REQUIRE_PET_FRIENDLY)
|
|
|
|
| 612 |
|
| 613 |
gr.Markdown("### Envío por email al finalizar (opcional)")
|
| 614 |
with gr.Row():
|
|
@@ -618,16 +638,18 @@ with gr.Blocks(title="Meta-buscador Inmuebles Norte BA (≤ USD 90k)") as demo:
|
|
| 618 |
btn = gr.Button("Buscar ahora", variant="primary")
|
| 619 |
with gr.Tabs():
|
| 620 |
with gr.Tab("Resultados"):
|
| 621 |
-
table = gr.Dataframe(interactive=False)
|
| 622 |
with gr.Tab("JSON"):
|
| 623 |
j = gr.Code(language="json")
|
|
|
|
|
|
|
| 624 |
with gr.Tab("Estado de email"):
|
| 625 |
status = gr.Markdown("—")
|
| 626 |
|
| 627 |
btn.click(
|
| 628 |
run_and_present,
|
| 629 |
-
inputs=[neighs, max_usd, types, min_rooms, req_outdoor, req_bidet, req_pet, email_to, send_email_flag],
|
| 630 |
-
outputs=[table, j, status]
|
| 631 |
)
|
| 632 |
|
| 633 |
if __name__ == "__main__":
|
|
|
|
| 6 |
import smtplib
|
| 7 |
import random
|
| 8 |
import asyncio
|
|
|
|
| 9 |
from dataclasses import dataclass, asdict
|
| 10 |
from typing import List, Optional, Dict, Any, Tuple
|
| 11 |
from email.message import EmailMessage
|
|
|
|
| 12 |
import urllib.parse as ul
|
| 13 |
|
| 14 |
import httpx
|
|
|
|
| 24 |
DEFAULT_MAX_USD = 90000
|
| 25 |
DEFAULT_NEIGHBORHOODS = [
|
| 26 |
"Saavedra", "Nuñez", "La Lucila", "Florida Oeste", "Munro", "Carapachay",
|
| 27 |
+
"Olivos", "Villa Martelli", "Florida", "Vicente López"
|
| 28 |
]
|
| 29 |
DEFAULT_TYPES = ["casa", "ph"] # "casa", "ph"
|
| 30 |
+
DEFAULT_MIN_ROOMS = 3
|
| 31 |
REQUIRE_BIDET = True
|
| 32 |
REQUIRE_PET_FRIENDLY = True
|
| 33 |
+
REQUIRE_OUTDOOR = True
|
| 34 |
+
|
| 35 |
+
# Auto-relajación si no hay resultados (escalonada)
|
| 36 |
+
AUTO_RELAX_ENABLED = True
|
| 37 |
+
RELAX_STEPS = [
|
| 38 |
+
{"require_bidet": False}, # 1) liberar bidet
|
| 39 |
+
{"require_pet": False}, # 2) liberar mascotas
|
| 40 |
+
{"min_rooms": 2}, # 3) bajar ambientes a 2
|
| 41 |
+
{"require_outdoor": False}, # 4) exterior opcional
|
| 42 |
+
{"max_price_usd_delta": 10000}, # 5) subir precio máx. +10k
|
| 43 |
+
]
|
| 44 |
|
| 45 |
+
# Microzonas (boost de score)
|
| 46 |
MICROZONAS_PRIORITARIAS = [
|
| 47 |
"Parque Saavedra", "Parque Sarmiento", "Av. Balbín", "Ruiz Huidobro",
|
| 48 |
"Lomas de Nuñez", "Cabildo", "Plaza Alberti",
|
|
|
|
| 51 |
"Ugarte", "San Martín", "Panamericana", "Pelliza", "Melo",
|
| 52 |
]
|
| 53 |
|
| 54 |
+
# Anti-scraping
|
| 55 |
USER_AGENT_POOL = [
|
| 56 |
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
|
| 57 |
"Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
|
|
|
|
| 62 |
TIMEOUT = httpx.Timeout(25.0, connect=12.0)
|
| 63 |
RETRIES = 2
|
| 64 |
BACKOFF_BASE = 0.9
|
| 65 |
+
JITTER_RANGE = (0.13, 0.55) # s
|
| 66 |
|
| 67 |
+
# Rate-limit por dominio
|
| 68 |
DOMAIN_RATE_LIMIT = {
|
| 69 |
"www.zonaprop.com.ar": 0.6,
|
| 70 |
"www.argenprop.com": 0.6,
|
|
|
|
| 78 |
"www.buscatucasa.com.ar": 0.8,
|
| 79 |
}
|
| 80 |
|
| 81 |
+
# Proxy opcional (configurable en Secrets)
|
| 82 |
PROXY_URL = os.getenv("PROXY_URL", "").strip()
|
| 83 |
|
| 84 |
+
# Email (configurable en Secrets)
|
|
|
|
|
|
|
| 85 |
SMTP_HOST = os.getenv("SMTP_HOST", "").strip()
|
| 86 |
SMTP_PORT = int(os.getenv("SMTP_PORT", "587"))
|
| 87 |
SMTP_USER = os.getenv("SMTP_USER", "").strip()
|
| 88 |
SMTP_PASS = os.getenv("SMTP_PASS", "").strip()
|
| 89 |
SMTP_FROM = os.getenv("SMTP_FROM", SMTP_USER).strip()
|
| 90 |
SMTP_USE_SSL = os.getenv("SMTP_USE_SSL", "false").lower() in ("1", "true", "yes")
|
|
|
|
| 91 |
EMAIL_REGEX = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
|
| 92 |
|
| 93 |
# =========================
|
|
|
|
| 126 |
return float(m.group(1)) if m else None
|
| 127 |
return None
|
| 128 |
|
| 129 |
+
def extract_int_from(text: str, pattern: str) -> Optional[int]:
|
| 130 |
if not text:
|
| 131 |
return None
|
| 132 |
m = re.search(pattern, text)
|
|
|
|
| 163 |
if filters["require_outdoor"] and (lst.has_patio or lst.has_terrace):
|
| 164 |
score += 1.0
|
| 165 |
if filters["require_pet"]:
|
| 166 |
+
score += 0.6 if lst.pet_friendly else 0.0
|
|
|
|
| 167 |
else:
|
| 168 |
score += 0.2
|
| 169 |
if filters["require_bidet"]:
|
| 170 |
+
score += 0.6 if lst.has_bidet else 0.0
|
|
|
|
| 171 |
else:
|
| 172 |
score += 0.2
|
| 173 |
score += residential_score(lst.address or "", lst.neighborhood or "", lst.description or "")
|
|
|
|
| 198 |
if wait > 0:
|
| 199 |
await asyncio.sleep(wait)
|
| 200 |
_last_hit[domain] = time.time()
|
|
|
|
| 201 |
await asyncio.sleep(random.uniform(*JITTER_RANGE))
|
| 202 |
|
| 203 |
async def fetch(url: str) -> Optional[str]:
|
|
|
|
| 208 |
try:
|
| 209 |
async with httpx.AsyncClient(follow_redirects=True, http2=True, proxies=proxies, timeout=TIMEOUT) as client:
|
| 210 |
r = await client.get(url, headers=make_headers())
|
| 211 |
+
# aceptamos HTML corto; algunos portales entregan SSR mínimo
|
| 212 |
+
if r.status_code == 200 and r.text:
|
| 213 |
return r.text
|
|
|
|
| 214 |
await asyncio.sleep(BACKOFF_BASE * (2 ** i) + random.uniform(0, 0.35))
|
| 215 |
except Exception:
|
| 216 |
await asyncio.sleep(BACKOFF_BASE * (2 ** i) + random.uniform(0, 0.35))
|
| 217 |
return None
|
| 218 |
|
| 219 |
# =========================
|
| 220 |
+
# Portales
|
| 221 |
# =========================
|
| 222 |
|
| 223 |
class Portal:
|
| 224 |
+
def __init__(self, domain: str, search_builder):
|
| 225 |
self.domain = domain
|
| 226 |
+
self.search_builder = search_builder # fn(neighs, max_usd, types)->[urls]
|
|
|
|
| 227 |
|
| 228 |
def sb_qparam(base: str, param: str = "q"):
|
| 229 |
def _builder(neighs: List[str], max_usd: int, types: List[str]) -> List[str]:
|
| 230 |
urls = []
|
| 231 |
+
syn_outdoor = ["patio", "terraza", "exterior"]
|
| 232 |
+
syn_pets = ["mascotas", "pet friendly"]
|
| 233 |
+
rooms_variants = ["3 ambientes", "tres ambientes"]
|
|
|
|
| 234 |
for n in neighs:
|
| 235 |
+
for o in syn_outdoor:
|
| 236 |
+
q = f"{' o '.join(types)} venta {n} hasta {max_usd} dolares {random.choice(rooms_variants)} {o} {random.choice(syn_pets)} bidet"
|
| 237 |
+
urls.append(f"{base}?{param}={ul.quote(q)}")
|
|
|
|
| 238 |
return urls
|
| 239 |
return _builder
|
| 240 |
|
|
|
|
| 241 |
PORTALS: List[Portal] = [
|
| 242 |
Portal("www.zonaprop.com.ar", sb_qparam("https://www.zonaprop.com.ar/propiedades.html", "q")),
|
| 243 |
Portal("www.argenprop.com", sb_qparam("https://www.argenprop.com/propiedades", "text")),
|
|
|
|
| 251 |
Portal("www.buscatucasa.com.ar", sb_qparam("https://www.buscatucasa.com.ar/buscar", "q")),
|
| 252 |
]
|
| 253 |
|
| 254 |
+
ANCHOR_TOKENS = [
|
| 255 |
+
"propiedad", "inmueble", "inmuebles", "departamento", "casa", "ph",
|
| 256 |
+
"detalle", "item", "listing", "publicacion", "aviso", "MLA-"
|
| 257 |
+
]
|
| 258 |
+
|
| 259 |
def generic_card_extractor(soup: BeautifulSoup, domain: str) -> List[Dict[str, Any]]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
anchors = soup.select("a[href]")
|
| 261 |
seen = set()
|
| 262 |
cards = []
|
|
|
|
| 264 |
href = a.get("href", "")
|
| 265 |
if not href:
|
| 266 |
continue
|
| 267 |
+
# normalizar absoluto
|
| 268 |
if href.startswith("//"):
|
| 269 |
href = "https:" + href
|
| 270 |
elif href.startswith("/"):
|
| 271 |
href = f"https://{domain}{href}"
|
| 272 |
+
# solo mismo dominio
|
| 273 |
if domain not in href:
|
| 274 |
continue
|
| 275 |
+
# filtrar rutas no relevantes
|
| 276 |
if any(x in href for x in ["/login", "/perfil", "/ayuda", "/faq", "/favorito", "/mi-cuenta"]):
|
| 277 |
continue
|
| 278 |
+
# heurística de “parece aviso”
|
| 279 |
+
if not any(tok in href.lower() for tok in [t.lower() for t in ANCHOR_TOKENS]):
|
| 280 |
+
continue
|
| 281 |
+
# no duplicados
|
| 282 |
if href in seen:
|
| 283 |
continue
|
| 284 |
seen.add(href)
|
| 285 |
|
| 286 |
title = clean_text(a.get_text(" ", strip=True))
|
| 287 |
+
if len(title) < 8:
|
| 288 |
+
# algunos sitios tienen título en contenedor padre
|
| 289 |
+
parent = a.find_parent()
|
| 290 |
+
if parent:
|
| 291 |
+
title = clean_text(parent.get_text(" ", strip=True))[:160]
|
| 292 |
+
# texto de bloque cercano
|
| 293 |
parent = a.find_parent()
|
| 294 |
block_text = clean_text(parent.get_text(" ", strip=True)) if parent else ""
|
|
|
|
| 295 |
m = re.search(r"(U\$S|USD|US\$|D[oó]lares?)\s*([\d\.\,]+)", block_text, re.IGNORECASE)
|
| 296 |
price_text = m.group(0) if m else ""
|
| 297 |
+
addr_m = re.search(r"(Saavedra|Nu[eñ]ez|La Lucila|Florida(?: Oeste)?|Munro|Carapachay|Olivos|Martelli|Vicente L[oó]pez)[^|,]*", block_text, re.IGNORECASE)
|
|
|
|
| 298 |
addr_text = addr_m.group(0) if addr_m else ""
|
| 299 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
cards.append({
|
| 301 |
+
"title": title[:160],
|
| 302 |
"link": href,
|
| 303 |
"price_text": price_text,
|
| 304 |
"addr_text": addr_text
|
| 305 |
})
|
| 306 |
+
return cards[:50]
|
|
|
|
| 307 |
|
| 308 |
async def scrape_search_page(url: str, domain: str) -> List[Listing]:
|
| 309 |
html = await fetch(url)
|
|
|
|
| 333 |
async def scrape_portal(portal: Portal, neighborhoods: List[str], max_usd: int, types: List[str]) -> List[Listing]:
|
| 334 |
urls = portal.search_builder(neighborhoods, max_usd, types)
|
| 335 |
results: List[Listing] = []
|
| 336 |
+
for u in urls[:6]: # primeras 6 queries permutadas
|
|
|
|
| 337 |
try:
|
| 338 |
res = await scrape_search_page(u, portal.domain)
|
| 339 |
results.extend(res)
|
|
|
|
| 341 |
pass
|
| 342 |
return results
|
| 343 |
|
| 344 |
+
async def enrich_listing(lst: Listing) -> Listing:
|
| 345 |
html = await fetch(lst.link)
|
| 346 |
if not html:
|
| 347 |
return lst
|
| 348 |
soup = BeautifulSoup(html, "lxml")
|
| 349 |
|
| 350 |
# Descripción
|
| 351 |
+
desc_el = soup.find(["div", "section"], attrs={"class": re.compile(r"(description|descripcion|post|body|texto|descripcion-larga)")}) or soup.find("p")
|
| 352 |
+
desc = clean_text(desc_el.get_text(" ", strip=True)) if desc_el else clean_text(" ".join(x.get_text(" ", strip=True) for x in soup.find_all(["p", "li"])[:60]))
|
|
|
|
|
|
|
|
|
|
| 353 |
|
| 354 |
# Inferencias
|
| 355 |
patio, terraza, mascotas, bidet = feature_guess(desc)
|
| 356 |
|
| 357 |
# Características
|
| 358 |
+
feat_text = " ".join(
|
| 359 |
el.get_text(" ", strip=True) for el in soup.find_all(["li", "span", "div"])
|
| 360 |
if el and el.get_text() and any(x in el.get_text().lower() for x in ["ambiente", "dorm", "bañ"])
|
| 361 |
).lower()
|
| 362 |
+
# también mirar el título
|
| 363 |
+
coarse = (lst.title + " " + desc).lower()
|
| 364 |
+
rooms = extract_int_from(feat_text, r"(\d+)\s*ambiente") or extract_int_from(coarse, r"(\d+)\s*amb")
|
| 365 |
+
bathrooms = extract_int_from(feat_text, r"(\d+)\s*bañ") or extract_int_from(coarse, r"(\d+)\s*bañ")
|
| 366 |
+
bedrooms = extract_int_from(feat_text, r"(\d+)\s*dorm") or extract_int_from(coarse, r"(\d+)\s*dormi")
|
| 367 |
|
| 368 |
+
# Dirección
|
| 369 |
addr_guess = soup.find(attrs={"class": re.compile(r"(address|ubicacion|ubicación|location|inmo-location)")})
|
| 370 |
if addr_guess and not lst.address:
|
| 371 |
lst.address = clean_text(addr_guess.get_text(" ", strip=True))[:200]
|
| 372 |
|
| 373 |
lst.description = desc or lst.description
|
| 374 |
+
lst.has_patio = lst.has_patro if hasattr(lst, "has_patro") else lst.has_patio # guard
|
| 375 |
lst.has_patio = lst.has_patio if lst.has_patio is not None else patio
|
| 376 |
lst.has_terrace = lst.has_terrace if lst.has_terrace is not None else terraza
|
| 377 |
lst.pet_friendly = lst.pet_friendly if lst.pet_friendly is not None else mascotas
|
|
|
|
| 385 |
# Orquestación
|
| 386 |
# =========================
|
| 387 |
|
| 388 |
+
def canon(url: str) -> str:
|
| 389 |
+
try:
|
| 390 |
+
parsed = ul.urlparse(url)
|
| 391 |
+
q = ul.parse_qsl(parsed.query)
|
| 392 |
+
q = [(k, v) for (k, v) in q if k.lower() not in {"utm_source", "utm_medium", "utm_campaign", "gclid", "s", "utm_term", "utm_content"}]
|
| 393 |
+
new_q = ul.urlencode(q, doseq=True)
|
| 394 |
+
return ul.urlunparse((parsed.scheme, parsed.netloc, parsed.path, "", new_q, ""))
|
| 395 |
+
except Exception:
|
| 396 |
+
return url
|
| 397 |
+
|
| 398 |
+
async def run_agent_once(neighborhoods: List[str], max_price_usd: int, types: List[str],
|
| 399 |
+
min_rooms: int, require_outdoor: bool, require_bidet: bool, require_pet: bool) -> Tuple[List[Listing], str]:
|
| 400 |
filters = dict(
|
| 401 |
max_price_usd=max_price_usd,
|
| 402 |
min_rooms=min_rooms,
|
|
|
|
| 404 |
require_bidet=require_bidet,
|
| 405 |
require_pet=require_pet,
|
| 406 |
)
|
| 407 |
+
# 1) Multi-portal
|
|
|
|
| 408 |
tasks = [scrape_portal(p, neighborhoods, max_price_usd, types) for p in PORTALS]
|
| 409 |
batch = await asyncio.gather(*tasks)
|
| 410 |
listings = [l for sub in batch for l in sub]
|
| 411 |
|
| 412 |
+
# 2) Dedup
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 413 |
seen = set()
|
| 414 |
unique: List[Listing] = []
|
| 415 |
for l in listings:
|
|
|
|
| 420 |
l.link = key
|
| 421 |
unique.append(l)
|
| 422 |
|
| 423 |
+
# 3) Enriquecer
|
| 424 |
sem = asyncio.Semaphore(8)
|
| 425 |
+
async def guard(item: Listing):
|
| 426 |
async with sem:
|
| 427 |
+
enriched = await enrich_listing(item)
|
|
|
|
| 428 |
await asyncio.sleep(random.uniform(*JITTER_RANGE))
|
| 429 |
return enriched
|
| 430 |
+
enriched = await asyncio.gather(*[guard(l) for l in unique])
|
| 431 |
|
| 432 |
+
# 4) Filtrar (tolerante: None no bloquea salvo que se exija explícito)
|
| 433 |
def passes(l: Listing) -> bool:
|
| 434 |
if l.price_usd is None or l.price_usd > max_price_usd:
|
| 435 |
return False
|
|
|
|
| 449 |
|
| 450 |
filtered = [l for l in enriched if passes(l)]
|
| 451 |
|
| 452 |
+
# 5) Score + Orden
|
| 453 |
for l in filtered:
|
| 454 |
l.score = compute_score(l, filters)
|
| 455 |
filtered.sort(key=lambda x: (-x.score, x.price_usd or 1e9))
|
| 456 |
+
|
| 457 |
+
# Trace
|
| 458 |
+
trace = f"Portales: {len(PORTALS)} | Crudos: {len(listings)} | Únicos: {len(unique)} | Enriquecidos: {len(enriched)} | Final: {len(filtered)}"
|
| 459 |
+
return filtered, trace
|
| 460 |
+
|
| 461 |
+
async def run_agent_with_relax(neighborhoods: List[str], max_price_usd: int, types: List[str],
                               min_rooms: int, require_outdoor: bool, require_bidet: bool, require_pet: bool,
                               auto_relax: bool = True) -> Tuple[List[Listing], List[str]]:
    """Run the search once and, when it comes back empty, retry with progressively
    relaxed filters taken from RELAX_STEPS.

    Returns the first non-empty result set found (or the last, empty one)
    together with a human-readable log of every attempt.
    """
    log: List[str] = []
    results, trace = await run_agent_once(neighborhoods, max_price_usd, types,
                                          min_rooms, require_outdoor, require_bidet, require_pet)
    log.append(f"[Base] {trace}")
    if results or not auto_relax:
        return results, log

    # No hits with the strict filters: walk the relaxation ladder step by step.
    price = max_price_usd
    for i, step in enumerate(RELAX_STEPS, 1):
        # Each step may override any subset of the filters; missing keys fall
        # back to the caller-supplied values.
        mr = step.get("min_rooms", min_rooms)
        ro = step.get("require_outdoor", require_outdoor)
        rb = step.get("require_bidet", require_bidet)
        rp = step.get("require_pet", require_pet)
        # A price delta, once applied, stays in effect for the remaining steps.
        if "max_price_usd_delta" in step:
            price = max_price_usd + step["max_price_usd_delta"]
        log.append(f"[Relax {i}] rooms={mr} outdoor={ro} bidet={rb} pet={rp} price_max=USD {price}")
        results, trace = await run_agent_once(neighborhoods, price, types, mr, ro, rb, rp)
        log.append(f"[Relax {i}] {trace}")
        if results:
            break
    return results, log
|
| 489 |
|
| 490 |
def listings_to_df(listings: List[Listing]) -> pd.DataFrame:
|
| 491 |
rows = []
|
|
|
|
| 520 |
msg["Subject"] = subject
|
| 521 |
msg["From"] = sender
|
| 522 |
msg["To"] = to_addr
|
| 523 |
+
msg.set_content("Este mensaje tiene versión HTML y adjuntos.")
|
| 524 |
msg.add_alternative(body_html, subtype="html")
|
| 525 |
for filename, content, mimetype in attachments:
|
| 526 |
maintype, subtype = (mimetype.split("/", 1) if "/" in mimetype else ("application", "octet-stream"))
|
|
|
|
| 541 |
server.send_message(msg)
|
| 542 |
else:
|
| 543 |
with smtplib.SMTP(SMTP_HOST, SMTP_PORT) as server:
|
| 544 |
+
server.ehlo(); server.starttls(); server.ehlo()
|
|
|
|
|
|
|
| 545 |
server.login(SMTP_USER, SMTP_PASS)
|
| 546 |
server.send_message(msg)
|
| 547 |
return "OK"
|
|
|
|
| 554 |
def json_to_bytes(obj: Any) -> bytes:
    """Serialize *obj* as pretty-printed JSON and return it as UTF-8 bytes.

    Non-ASCII characters are emitted verbatim (ensure_ascii=False) so accented
    Spanish text survives round-tripping.
    """
    text = json.dumps(obj, ensure_ascii=False, indent=2)
    return text.encode("utf-8")
|
| 556 |
|
| 557 |
+
def render_summary_html(df: pd.DataFrame, neighborhoods: List[str], max_usd: int, min_rooms: int, relax_log: List[str]) -> str:
|
| 558 |
count = len(df)
|
| 559 |
+
head = f"<h2>Resultados</h2><p><b>Zonas:</b> {', '.join(neighborhoods)}<br><b>Precio máx.:</b> USD {max_usd}<br><b>Ambientes mín.:</b> {min_rooms}<br><b>Total:</b> {count}</p>"
|
| 560 |
+
trace = "<pre style='white-space:pre-wrap;font-size:12px;opacity:.85;'>" + "\n".join(relax_log) + "</pre>"
|
| 561 |
if count == 0:
|
| 562 |
+
return head + "<p>No se encontraron resultados con los filtros actuales.</p>" + trace
|
| 563 |
top_rows = df.sort_values(by=['Score','Precio USD'], ascending=[False, True]).head(12)
|
| 564 |
items = []
|
| 565 |
for _, r in top_rows.iterrows():
|
|
|
|
| 567 |
price = f"USD {int(r['Precio USD'])}" if pd.notna(r['Precio USD']) else "USD —"
|
| 568 |
addr = r.get("Dirección/Área") or ""
|
| 569 |
items.append(f"<li><b>{r['Título']}</b> — {price} — {addr} — {flags} — <a href='{r['Link']}'>Abrir</a></li>")
|
| 570 |
+
return head + "<ol>" + "\n".join(items) + "</ol>" + trace
|
| 571 |
|
| 572 |
# =========================
|
| 573 |
# UI (Gradio)
|
| 574 |
# =========================
|
| 575 |
|
| 576 |
DESCRIPTION = """
|
| 577 |
+
Meta-buscador multi-portales para casas/PH entre Saavedra y La Lucila y alrededores.
|
| 578 |
+
• Filtros: USD ≤ 90k, ≥ 3 ambientes, patio/terraza, mascotas, bidet (si figura en descripción).
|
| 579 |
+
• Anti-scraping: headers rotativos, referers, HTTP/2, rate limit con jitter, reintentos con backoff.
|
| 580 |
+
• Si no hay resultados, activa auto-relajación escalonada (configurable) y documenta los pasos.
|
| 581 |
"""
|
| 582 |
|
| 583 |
+
async def run_and_present(neighs, max_usd, types, min_rooms, req_outdoor, req_bidet, req_pet, auto_relax, email_to, send_email_flag):
|
| 584 |
neighs_list = [n.strip() for n in str(neighs).split(",") if n.strip()]
|
| 585 |
types_list = [t.strip().lower() for t in str(types).split(",") if t.strip()]
|
| 586 |
+
max_usd = int(max_usd); min_rooms = int(min_rooms)
|
| 587 |
+
req_outdoor = bool(req_outdoor); req_bidet = bool(req_bidet); req_pet = bool(req_pet); auto_relax = bool(auto_relax)
|
| 588 |
|
| 589 |
+
results, relax_log = await run_agent_with_relax(
|
| 590 |
+
neighborhoods=neighs_list, max_price_usd=max_usd, types=types_list,
|
| 591 |
+
min_rooms=min_rooms, require_outdoor=req_outdoor, require_bidet=req_bidet, require_pet=req_pet,
|
| 592 |
+
auto_relax=auto_relax
|
|
|
|
|
|
|
|
|
|
|
|
|
| 593 |
)
|
| 594 |
df = listings_to_df(results)
|
| 595 |
json_blob = [asdict(l) for l in results]
|
|
|
|
| 599 |
if not EMAIL_REGEX.match(email_to or ""):
|
| 600 |
email_status = "Error: email destino inválido."
|
| 601 |
else:
|
| 602 |
+
html = render_summary_html(df, neighs_list, max_usd, min_rooms, relax_log)
|
| 603 |
attachments: List[Tuple[str, bytes, str]] = []
|
| 604 |
if not df.empty:
|
| 605 |
attachments.append(("resultados.csv", df_to_csv_bytes(df), "text/csv"))
|
|
|
|
| 612 |
)
|
| 613 |
email_status = "Enviado" if status == "OK" else status
|
| 614 |
|
| 615 |
+
# Mostrar log en la pestaña de estado
|
| 616 |
+
return df, json.dumps(json_blob, ensure_ascii=False, indent=2), " | ".join(relax_log), email_status
|
| 617 |
|
| 618 |
with gr.Blocks(title="Meta-buscador Inmuebles Norte BA (≤ USD 90k)") as demo:
|
| 619 |
gr.Markdown("# Meta-buscador de casas/PH norte BA (≤ 90 000 USD)")
|
|
|
|
| 628 |
req_outdoor = gr.Checkbox(label="Requerir patio o terraza", value=REQUIRE_OUTDOOR)
|
| 629 |
req_bidet = gr.Checkbox(label="Requerir bidet (si aparece en descripción)", value=REQUIRE_BIDET)
|
| 630 |
req_pet = gr.Checkbox(label="Requerir pet-friendly (si aparece en descripción)", value=REQUIRE_PET_FRIENDLY)
|
| 631 |
+
auto_relax = gr.Checkbox(label="Auto-relajar si no hay resultados", value=AUTO_RELAX_ENABLED)
|
| 632 |
|
| 633 |
gr.Markdown("### Envío por email al finalizar (opcional)")
|
| 634 |
with gr.Row():
|
|
|
|
| 638 |
btn = gr.Button("Buscar ahora", variant="primary")
|
| 639 |
with gr.Tabs():
|
| 640 |
with gr.Tab("Resultados"):
|
| 641 |
+
table = gr.Dataframe(interactive=False) # sin args raros
|
| 642 |
with gr.Tab("JSON"):
|
| 643 |
j = gr.Code(language="json")
|
| 644 |
+
with gr.Tab("Estado"):
|
| 645 |
+
trace = gr.Markdown("—")
|
| 646 |
with gr.Tab("Estado de email"):
|
| 647 |
status = gr.Markdown("—")
|
| 648 |
|
| 649 |
btn.click(
|
| 650 |
run_and_present,
|
| 651 |
+
inputs=[neighs, max_usd, types, min_rooms, req_outdoor, req_bidet, req_pet, auto_relax, email_to, send_email_flag],
|
| 652 |
+
outputs=[table, j, trace, status]
|
| 653 |
)
|
| 654 |
|
| 655 |
if __name__ == "__main__":
|