Lukeetah committed on
Commit
3d30d20
·
verified ·
1 Parent(s): 699ed82

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +262 -279
app.py CHANGED
@@ -1,20 +1,23 @@
1
  import os
2
  import re
3
- import time
4
- import math
5
  import json
 
 
 
6
  import asyncio
7
  import random
 
 
8
  from dataclasses import dataclass, asdict
9
  from typing import List, Optional, Dict, Any, Tuple
10
  import urllib.parse as ul
11
- from pathlib import Path
12
 
13
  import httpx
14
  from bs4 import BeautifulSoup
15
  from rapidfuzz import fuzz
16
  import pandas as pd
17
  import gradio as gr
 
18
 
19
  # =========================
20
  # Configuración principal
@@ -23,6 +26,7 @@ import gradio as gr
23
  DEFAULT_MAX_USD = 90000
24
  DEFAULT_NEIGHBORHOODS = [
25
  "Saavedra", "Nuñez", "La Lucila", "Florida Oeste", "Munro", "Carapachay",
 
26
  "Olivos", "Villa Martelli"
27
  ]
28
  DEFAULT_TYPES = ["casa", "ph"] # casa / ph
@@ -31,34 +35,46 @@ REQUIRE_BIDET = True
31
  REQUIRE_PET_FRIENDLY = True
32
  REQUIRE_OUTDOOR = True # patio o terraza
33
 
34
- # Alertas por Telegram
35
- TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "").strip()
36
- TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "").strip()
37
- DEFAULT_MIN_SCORE_ALERT = 2.2
38
- DEFAULT_MONITOR_INTERVAL_MIN = 60
39
-
40
- # Persistencia ligera (para no reenviar duplicados)
41
- CACHE_PATH = Path("cache_listings.json")
42
 
 
43
  USER_AGENT_POOL = [
44
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
45
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
46
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
47
  ]
48
-
 
 
 
 
49
  TIMEOUT = httpx.Timeout(20.0, connect=10.0)
50
  MAX_CONCURRENCY = 6
51
  RETRIES = 2
52
- BACKOFF_BASE = 0.8
 
53
 
54
- # Microzonas residenciales priorizadas (heurística positiva)
55
- MICROZONAS_PRIORITARIAS = [
56
- "Parque Saavedra", "Parque Sarmiento", "Av. Balbín", "Ruiz Huidobro",
57
- "Lomas de Nuñez", "Cabildo", "Plaza Alberti",
58
- "Estación La Lucila", "Rawson", "Paraná", "Maipú",
59
- "Estación Florida", "Estación Carapachay", "Estación Munro",
60
- "Ugarte", "San Martín", "Panamericana", "Pelliza", "Melo",
61
- ]
 
 
 
 
 
 
62
 
63
  # =========================
64
  # Modelos y utilidades
@@ -84,14 +100,17 @@ class Listing:
84
  description: Optional[str]
85
  score: float
86
 
 
 
 
87
  def to_float_price(value: str) -> Optional[float]:
88
  if not value:
89
  return None
90
  txt = value.replace(".", "").replace(",", ".").upper()
91
- if "USD" in txt or "U$S" in txt or "U$D" in txt or "DOLAR" in txt or "US$" in txt:
92
  m = re.search(r"(\d+(?:\.\d+)?)", txt)
93
  return float(m.group(1)) if m else None
94
- return None
95
 
96
  def extract_int(text: str) -> Optional[int]:
97
  if not text:
@@ -99,15 +118,6 @@ def extract_int(text: str) -> Optional[int]:
99
  m = re.search(r"(\d+)", text)
100
  return int(m.group(1)) if m else None
101
 
102
- def clean_text(s: str) -> str:
103
- return re.sub(r"\s+", " ", (s or "").strip())
104
-
105
- def text_has_any(text: str, keywords: List[str]) -> bool:
106
- if not text:
107
- return False
108
- t = text.lower()
109
- return any(kw.lower() in t for kw in keywords)
110
-
111
  def fuzzy_any(text: str, keywords: List[str], thresh: int = 80) -> bool:
112
  if not text:
113
  return False
@@ -135,69 +145,89 @@ def compute_score(lst: Listing, filters: Dict[str, Any]) -> float:
135
  score += (filters["max_price_usd"] - lst.price_usd) / max(filters["max_price_usd"], 1) * 1.0
136
  if lst.rooms and lst.rooms >= filters["min_rooms"]:
137
  score += 1.0
138
- if filters["require_outdoor"]:
139
- if (lst.has_patio or lst.has_terrace):
140
- score += 1.0
141
- if not filters["require_pet"]:
142
- score += 0.2
143
- else:
144
  if lst.pet_friendly:
145
  score += 0.6
146
- if not filters["require_bidet"]:
147
- score += 0.2
148
  else:
 
 
149
  if lst.has_bidet:
150
  score += 0.6
 
 
151
  score += residential_score(lst.address or "", lst.neighborhood or "", lst.description or "")
152
  return round(score, 3)
153
 
154
- def headers():
155
- return {"User-Agent": random.choice(USER_AGENT_POOL)}
 
 
 
 
 
 
 
156
 
157
- async def fetch(client: httpx.AsyncClient, url: str) -> Optional[str]:
 
 
 
 
 
158
  for i in range(RETRIES + 1):
159
  try:
160
- r = await client.get(url, headers=headers(), timeout=TIMEOUT)
161
- if r.status_code == 200 and r.text:
162
- return r.text
163
- await asyncio.sleep(BACKOFF_BASE * (2 ** i))
 
 
164
  except Exception:
165
- await asyncio.sleep(BACKOFF_BASE * (2 ** i))
166
  return None
167
 
168
- async def fetch_detail_and_enrich(client: httpx.AsyncClient, lst: Listing) -> Listing:
169
- html = await fetch(client, lst.link)
170
  if not html:
171
  return lst
172
  soup = BeautifulSoup(html, "lxml")
173
 
 
174
  desc_el = soup.find(["div", "section"], attrs={"class": re.compile(r"(description|Description|post|body)")}) or soup.find("p")
175
  if desc_el:
176
  desc = clean_text(desc_el.get_text(" ", strip=True))
177
  else:
178
- desc = clean_text(" ".join(t.get_text(" ", strip=True) for t in soup.find_all(["p", "li"])[:30]))
179
 
180
  patio, terraza, mascotas, bidet = feature_guess(desc)
181
 
 
182
  features_text = " ".join(
183
  el.get_text(" ", strip=True)
184
  for el in soup.find_all(["li", "span", "div"])
185
  if el and el.get_text() and any(x in el.get_text().lower() for x in ["ambiente", "dorm", "bañ"])
186
- )
187
- ft = features_text.lower()
188
- rooms = extract_int(re.search(r"(\d+)\s*ambiente", ft).group(1)) if re.search(r"(\d+)\s*ambiente", ft) else lst.rooms
189
- bathrooms = extract_int(re.search(r"(\d+)\s*bañ", ft).group(1)) if re.search(r"(\d+)\s*bañ", ft) else lst.bathrooms
190
- bedrooms = extract_int(re.search(r"(\d+)\s*dorm", ft).group(1)) if re.search(r"(\d+)\s*dorm", ft) else lst.bedrooms
 
 
 
 
 
191
 
192
  addr_guess = soup.find(attrs={"class": re.compile(r"(address|ubicacion|location|inmo-location)")})
193
  if addr_guess and not lst.address:
194
  lst.address = clean_text(addr_guess.get_text(" ", strip=True))[:200]
195
 
196
  lst.description = desc or lst.description
197
- lst.has_patio = lst.has_patio if lst.has_patio is not None else patio
198
- lst.has_terrace = lst.has_terrace if lst.has_terrace is not None else terraza
199
- lst.pet_friendly = lst.pet_friendly if lst.pet_friendly is not None else mascotas
200
- lst.has_bidet = lst.has_bidet if lst.has_bidet is not None else bidet
201
  lst.rooms = rooms
202
  lst.bathrooms = bathrooms
203
  lst.bedrooms = bedrooms
@@ -252,23 +282,27 @@ def generic_card_extractor(soup: BeautifulSoup, domain: str) -> List[Dict[str, A
252
  price_text = (m.group(0) if m else "")
253
  addr_m = re.search(r"(Saavedra|Nu[eñ]ez|La Lucila|Florida|Munro|Carapachay|Olivos|Martelli)[^|,]*", block_text, re.IGNORECASE)
254
  address_text = addr_m.group(0) if addr_m else ""
 
 
255
  cards.append({
256
  "title": title or "",
257
- "link": href if href.startswith("http") else f"https://{domain}{href}",
258
  "price_text": price_text,
259
  "addr_text": address_text
260
  })
 
261
  filtered = []
262
  for c in cards:
263
  if len(c["title"]) < 8:
264
  continue
265
- if any(tok in c["link"] for tok in ["/perfil/", "/inmobiliaria/", "/ayuda", "/faq", "/login", "/like"]):
266
  continue
267
  filtered.append(c)
268
  return filtered
269
 
270
- async def scrape_search_page(client: httpx.AsyncClient, url: str, domain: str) -> List[Listing]:
271
- html = await fetch(client, url)
 
272
  if not html:
273
  return []
274
  soup = BeautifulSoup(html, "lxml")
@@ -290,17 +324,18 @@ async def scrape_search_page(client: httpx.AsyncClient, url: str, domain: str) -
290
  description=None,
291
  score=0.0
292
  ))
 
293
  return listings[:25]
294
 
295
- async def scrape_portal(client: httpx.AsyncClient, portal: str, urls: List[str]) -> List[Listing]:
296
  out: List[Listing] = []
 
297
  for u in urls[:4]:
298
  try:
299
- res = await scrape_search_page(client, u, portal)
300
  out.extend(res)
301
- await asyncio.sleep(0.5)
302
  except Exception:
303
- continue
304
  return out
305
 
306
  # =========================
@@ -324,57 +359,63 @@ async def run_agent(
324
  require_pet=require_pet,
325
  )
326
 
327
- async with httpx.AsyncClient(follow_redirects=True) as client:
328
- z_urls = zonaprop_search_urls(neighborhoods, max_price_usd, types)
329
- a_urls = argenprop_search_urls(neighborhoods, max_price_usd, types)
330
- p_urls = properati_search_urls(neighborhoods, max_price_usd, types)
331
-
332
- tasks = [
333
- scrape_portal(client, "www.zonaprop.com.ar", z_urls),
334
- scrape_portal(client, "www.argenprop.com", a_urls),
335
- scrape_portal(client, "www.properati.com.ar", p_urls),
336
- ]
337
- batch_lists = await asyncio.gather(*tasks)
338
- listings = [l for batch in batch_lists for l in batch]
339
-
340
- seen = set()
341
- unique: List[Listing] = []
342
- for l in listings:
343
- if l.link in seen:
344
- continue
345
- seen.add(l.link)
346
- unique.append(l)
347
-
348
- sem = asyncio.Semaphore(MAX_CONCURRENCY)
349
- async def enrich_guarded(l: Listing):
350
- async with sem:
351
- return await fetch_detail_and_enrich(client, l)
352
-
353
- enriched = await asyncio.gather(*[enrich_guarded(l) for l in unique])
354
-
355
- def passes(l: Listing) -> bool:
356
- if l.price_usd is None or l.price_usd > max_price_usd:
357
- return False
358
- if l.rooms is not None and l.rooms < min_rooms:
359
- return False
360
- if require_outdoor and not ((l.has_patio is True) or (l.has_terrace is True)):
361
- return False
362
- if require_bidet and l.has_bidet is not True:
363
- return False
364
- if require_pet and l.pet_friendly is not True:
365
- return False
366
- type_hit = any(t in (l.title.lower() + " " + (l.description or "").lower()) for t in types)
367
- if not type_hit:
368
- type_hit = True
369
- return type_hit
370
-
371
- filtered = [l for l in enriched if passes(l)]
372
-
373
- for l in filtered:
374
- l.score = compute_score(l, filters)
375
-
376
- filtered.sort(key=lambda x: (-x.score, x.price_usd or 1e9))
377
- return filtered
 
 
 
 
 
 
378
 
379
  def listings_to_df(listings: List[Listing]) -> pd.DataFrame:
380
  rows = []
@@ -401,118 +442,63 @@ def listings_to_df(listings: List[Listing]) -> pd.DataFrame:
401
  return df
402
 
403
  # =========================
404
- # Cache + Telegram
405
- # =========================
406
-
407
- def load_cache() -> Dict[str, Any]:
408
- if CACHE_PATH.exists():
409
- try:
410
- return json.loads(CACHE_PATH.read_text(encoding="utf-8"))
411
- except Exception:
412
- return {"sent_links": []}
413
- return {"sent_links": []}
414
-
415
- def save_cache(cache: Dict[str, Any]) -> None:
416
- try:
417
- CACHE_PATH.write_text(json.dumps(cache, ensure_ascii=False, indent=2), encoding="utf-8")
418
- except Exception:
419
- pass
420
-
421
- async def telegram_send_message(text: str, disable_web_page_preview: bool = False) -> bool:
422
- if not TELEGRAM_BOT_TOKEN or not TELEGRAM_CHAT_ID:
423
- return False
424
- api = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
425
- payload = {
426
- "chat_id": TELEGRAM_CHAT_ID,
427
- "text": text,
428
- "parse_mode": "HTML",
429
- "disable_web_page_preview": disable_web_page_preview
430
- }
431
- try:
432
- async with httpx.AsyncClient() as client:
433
- r = await client.post(api, data=payload, timeout=TIMEOUT)
434
- return r.status_code == 200
435
- except Exception:
436
- return False
437
-
438
- def fmt_listing_msg(l: Listing) -> str:
439
- price = f"USD {int(l.price_usd)}" if l.price_usd else "USD -"
440
- flags = []
441
- if l.has_patio: flags.append("Patio")
442
- if l.has_terrace: flags.append("Terraza")
443
- if l.pet_friendly: flags.append("Mascotas")
444
- if l.has_bidet: flags.append("Bidet")
445
- flags_txt = " · ".join(flags) if flags else "—"
446
- addr = l.address or "Zona: —"
447
- return (
448
- f"🏡 <b>{l.title[:70]}</b>\n"
449
- f"{addr}\n"
450
- f"💰 {price} · ⭐ {l.score}\n"
451
- f"🔖 {l.rooms or '-'} amb · {l.bedrooms or '-'} dorm · {l.bathrooms or '-'} baños\n"
452
- f"✅ {flags_txt}\n"
453
- f"🔗 <a href=\"{l.link}\">Ver aviso</a> · {l.source.replace('www.', '')}"
454
- )
455
-
456
- # =========================
457
- # Monitor en background
458
  # =========================
459
 
460
- monitor_task: Optional[asyncio.Task] = None
461
- monitor_stop_event = asyncio.Event()
462
- monitor_running = False
463
-
464
- async def monitor_loop(
465
- neighs: List[str],
466
- max_usd: int,
467
- types: List[str],
468
- min_rooms: int,
469
- req_outdoor: bool,
470
- req_bidet: bool,
471
- req_pet: bool,
472
- min_score_alert: float,
473
- interval_min: int,
474
- max_alerts_per_run: int = 5
475
- ):
476
- global monitor_running
477
- cache = load_cache()
478
- sent_links = set(cache.get("sent_links", []))
479
- monitor_running = True
480
- await telegram_send_message("✅ Monitor de avisos iniciado. Te aviso lo que valga la pena. 🐶🏡", True)
481
  try:
482
- while not monitor_stop_event.is_set():
483
- try:
484
- results = await run_agent(
485
- neighborhoods=neighs,
486
- max_price_usd=max_usd,
487
- types=types,
488
- min_rooms=min_rooms,
489
- require_outdoor=req_outdoor,
490
- require_bidet=req_bidet,
491
- require_pet=req_pet
492
- )
493
- # Filtrar nuevos con buen score
494
- new_hits = [l for l in results if l.score >= min_score_alert and l.link not in sent_links]
495
- if new_hits:
496
- for l in new_hits[:max_alerts_per_run]:
497
- ok = await telegram_send_message(fmt_listing_msg(l))
498
- if ok:
499
- sent_links.add(l.link)
500
- cache["sent_links"] = list(sent_links)
501
- save_cache(cache)
502
- else:
503
- # ping silencioso cada tanto para saber que sigue vivo (opcional)
504
- pass
505
- except Exception:
506
- # Evita caída total del loop
507
- await asyncio.sleep(3)
508
- # Espera
509
- await asyncio.wait_for(monitor_stop_event.wait(), timeout=interval_min * 60)
510
- except asyncio.TimeoutError:
511
- # Timeout esperado por wait_for; continúa loop
512
- pass
513
- finally:
514
- monitor_running = False
515
- await telegram_send_message("⏹️ Monitor de avisos detenido.", True)
516
 
517
  # =========================
518
  # UI (Gradio)
@@ -521,47 +507,45 @@ async def monitor_loop(
521
  DESCRIPTION = """
522
  Agente agregador de avisos (Zonaprop, Argenprop, Properati) para Saavedra → La Lucila y alrededores.
523
  Filtra: USD ≤ 90k, ≥ 3 ambientes (para oficina), patio/terraza, mascotas, bidet (si figura en descripción).
524
-
525
- Alertas por Telegram: configurá TELEGRAM_BOT_TOKEN y TELEGRAM_CHAT_ID en los Secrets del Space. Luego, iniciá el monitor.
526
  """
527
 
528
- async def run_and_present(neighs, max_usd, types, min_rooms, req_outdoor, req_bidet, req_pet):
529
- neighs = [n.strip() for n in neighs.split(",") if n.strip()]
530
- types = [t.strip().lower() for t in types.split(",") if t.strip()]
 
531
  results = await run_agent(
532
- neighborhoods=neighs,
533
- max_price_usd=max_usd,
534
- types=types,
535
- min_rooms=min_rooms,
536
- require_outdoor=req_outdoor,
537
- require_bidet=req_bidet,
538
- require_pet=req_pet
539
  )
540
  df = listings_to_df(results)
541
- json_blob = json.dumps([asdict(l) for l in results], ensure_ascii=False, indent=2)
542
- return df, json_blob
543
-
544
- async def start_monitor(neighs, max_usd, types, min_rooms, req_outdoor, req_bidet, req_pet, min_score_alert, interval_min):
545
- global monitor_task
546
- if monitor_task and not monitor_task.done():
547
- return "El monitor ya está corriendo."
548
- if not TELEGRAM_BOT_TOKEN or not TELEGRAM_CHAT_ID:
549
- return "Faltan TELEGRAM_BOT_TOKEN o TELEGRAM_CHAT_ID en los Secrets del Space."
550
- monitor_stop_event.clear()
551
- neighs_l = [n.strip() for n in neighs.split(",") if n.strip()]
552
- types_l = [t.strip().lower() for t in types.split(",") if t.strip()]
553
- monitor_task = asyncio.create_task(monitor_loop(
554
- neighs_l, int(max_usd), types_l, int(min_rooms),
555
- bool(req_outdoor), bool(req_bidet), bool(req_pet),
556
- float(min_score_alert), int(interval_min)
557
- ))
558
- return "Monitor iniciado. Te aviso por Telegram."
559
-
560
- async def stop_monitor():
561
- if monitor_task and not monitor_task.done():
562
- monitor_stop_event.set()
563
- return "Solicitada detención. Se detendrá en el próximo ciclo."
564
- return "El monitor no estaba corriendo."
565
 
566
  with gr.Blocks(title="Agente Inmuebles Norte BA (≤ USD 90k)") as demo:
567
  gr.Markdown("# Agente de casas/PH norte BA (≤ 90 000 USD)")
@@ -576,27 +560,26 @@ with gr.Blocks(title="Agente Inmuebles Norte BA (≤ USD 90k)") as demo:
576
  req_outdoor = gr.Checkbox(label="Requerir patio o terraza", value=REQUIRE_OUTDOOR)
577
  req_bidet = gr.Checkbox(label="Requerir bidet (solo si aparece en descripción)", value=REQUIRE_BIDET)
578
  req_pet = gr.Checkbox(label="Requerir pet-friendly (si aparece en descripción)", value=REQUIRE_PET_FRIENDLY)
 
 
 
 
579
 
580
  btn = gr.Button("Buscar ahora", variant="primary")
 
581
  with gr.Tabs():
582
  with gr.Tab("Resultados"):
583
- table = gr.Dataframe(interactive=False, wrap=True, overflow_row_behaviour="paginate", max_rows=300)
584
  with gr.Tab("JSON"):
585
  j = gr.Code(language="json")
 
 
586
 
587
- gr.Markdown("---")
588
- gr.Markdown("## Alertas por Telegram")
589
- with gr.Row():
590
- min_score_alert = gr.Slider(label="Score mínimo para alertar", minimum=1.0, maximum=4.0, step=0.1, value=DEFAULT_MIN_SCORE_ALERT)
591
- interval_min = gr.Slider(label="Intervalo de monitoreo (minutos)", minimum=10, maximum=240, step=5, value=DEFAULT_MONITOR_INTERVAL_MIN)
592
- with gr.Row():
593
- start_btn = gr.Button("Iniciar monitor", variant="primary")
594
- stop_btn = gr.Button("Detener monitor")
595
- status = gr.Markdown("Estado: —")
596
-
597
- btn.click(run_and_present, inputs=[neighs, max_usd, types, min_rooms, req_outdoor, req_bidet, req_pet], outputs=[table, j])
598
- start_btn.click(start_monitor, inputs=[neighs, max_usd, types, min_rooms, req_outdoor, req_bidet, req_pet, min_score_alert, interval_min], outputs=[status])
599
- stop_btn.click(stop_monitor, outputs=[status])
600
 
601
  if __name__ == "__main__":
602
- demo.launch()
 
1
  import os
2
  import re
 
 
3
  import json
4
+ import time
5
+ import ssl
6
+ import smtplib
7
  import asyncio
8
  import random
9
+ import mimetypes
10
+ from pathlib import Path
11
  from dataclasses import dataclass, asdict
12
  from typing import List, Optional, Dict, Any, Tuple
13
  import urllib.parse as ul
 
14
 
15
  import httpx
16
  from bs4 import BeautifulSoup
17
  from rapidfuzz import fuzz
18
  import pandas as pd
19
  import gradio as gr
20
+ from email.message import EmailMessage
21
 
22
  # =========================
23
  # Configuración principal
 
26
  DEFAULT_MAX_USD = 90000
27
  DEFAULT_NEIGHBORHOODS = [
28
  "Saavedra", "Nuñez", "La Lucila", "Florida Oeste", "Munro", "Carapachay",
29
+ # Cercanos útiles
30
  "Olivos", "Villa Martelli"
31
  ]
32
  DEFAULT_TYPES = ["casa", "ph"] # casa / ph
 
35
  REQUIRE_PET_FRIENDLY = True
36
  REQUIRE_OUTDOOR = True # patio o terraza
37
 
38
+ # Microzonas residenciales priorizadas (heurística positiva)
39
+ MICROZONAS_PRIORITARIAS = [
40
+ "Parque Saavedra", "Parque Sarmiento", "Av. Balbín", "Ruiz Huidobro",
41
+ "Lomas de Nuñez", "Cabildo", "Plaza Alberti",
42
+ "Estación La Lucila", "Rawson", "Paraná", "Maipú",
43
+ "Estación Florida", "Estación Carapachay", "Estación Munro",
44
+ "Ugarte", "San Martín", "Panamericana", "Pelliza", "Melo",
45
+ ]
46
 
47
+ # Anti-scraping: headers y tiempos
48
  USER_AGENT_POOL = [
49
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
50
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.4 Safari/605.1.15",
51
  "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
52
  ]
53
+ REFERER_POOL = [
54
+ "https://www.google.com/",
55
+ "https://www.bing.com/",
56
+ "https://duckduckgo.com/",
57
+ ]
58
  TIMEOUT = httpx.Timeout(20.0, connect=10.0)
59
  MAX_CONCURRENCY = 6
60
  RETRIES = 2
61
+ BACKOFF_BASE = 0.9
62
+ JITTER_RANGE = (0.15, 0.6) # segundos
63
 
64
+ # Proxy opcional (si definís en Secrets)
65
+ # Ejemplos: http://user:pass@host:port
66
+ PROXY_URL = os.getenv("PROXY_URL", "").strip() # se aplica a todo el cliente si está presente
67
+
68
+ # =========================
69
+ # Email (usa tu SMTP)
70
+ # =========================
71
+ # Configuralo en Settings → Secrets del Space
72
+ SMTP_HOST = os.getenv("SMTP_HOST", "").strip() # ej: smtp.gmail.com
73
+ SMTP_PORT = int(os.getenv("SMTP_PORT", "587")) # 587 (STARTTLS) o 465 (SSL)
74
+ SMTP_USER = os.getenv("SMTP_USER", "").strip() # tu usuario/alias
75
+ SMTP_PASS = os.getenv("SMTP_PASS", "").strip() # password o app password
76
+ SMTP_FROM = os.getenv("SMTP_FROM", SMTP_USER).strip()
77
+ SMTP_USE_SSL = os.getenv("SMTP_USE_SSL", "false").lower() in ("1", "true", "yes")
78
 
79
  # =========================
80
  # Modelos y utilidades
 
100
  description: Optional[str]
101
  score: float
102
 
103
def clean_text(s: str) -> str:
    """Normalize whitespace: trim the ends and collapse internal runs to one space."""
    trimmed = (s or "").strip()
    return re.sub(r"\s+", " ", trimmed)
105
+
106
def to_float_price(value: str) -> Optional[float]:
    """Parse a USD price string into a float; return None for ARS/unknown currency.

    NOTE(review): the normalization assumes Argentine formatting ("." as
    thousands separator, "," as decimal) — US-style "90,000" would misparse.
    Confirm against the portals' actual price strings.
    """
    if not value:
        return None
    # Drop thousands dots, turn decimal commas into dots, uppercase for tag match.
    normalized = value.replace(".", "").replace(",", ".").upper()
    is_usd = any(tag in normalized for tag in ("USD", "U$S", "U$D", "DOLAR", "US$"))
    if not is_usd:
        return None  # ARS (or unrecognized) prices are skipped
    match = re.search(r"(\d+(?:\.\d+)?)", normalized)
    return float(match.group(1)) if match else None
114
 
115
  def extract_int(text: str) -> Optional[int]:
116
  if not text:
 
118
  m = re.search(r"(\d+)", text)
119
  return int(m.group(1)) if m else None
120
 
 
 
 
 
 
 
 
 
 
121
  def fuzzy_any(text: str, keywords: List[str], thresh: int = 80) -> bool:
122
  if not text:
123
  return False
 
145
  score += (filters["max_price_usd"] - lst.price_usd) / max(filters["max_price_usd"], 1) * 1.0
146
  if lst.rooms and lst.rooms >= filters["min_rooms"]:
147
  score += 1.0
148
+ if filters["require_outdoor"] and (lst.has_patio or lst.has_terrace):
149
+ score += 1.0
150
+ if filters["require_pet"]:
 
 
 
151
  if lst.pet_friendly:
152
  score += 0.6
 
 
153
  else:
154
+ score += 0.2
155
+ if filters["require_bidet"]:
156
  if lst.has_bidet:
157
  score += 0.6
158
+ else:
159
+ score += 0.2
160
  score += residential_score(lst.address or "", lst.neighborhood or "", lst.description or "")
161
  return round(score, 3)
162
 
163
def make_headers() -> Dict[str, str]:
    """Build browser-like request headers with a randomized UA and Referer.

    Randomizing per request makes the scraper look less like a single bot.
    """
    headers = {
        "User-Agent": random.choice(USER_AGENT_POOL),
        "Accept-Language": "es-AR,es;q=0.9,en;q=0.8",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    headers["Referer"] = random.choice(REFERER_POOL)
    headers["Cache-Control"] = "no-cache"
    headers["Pragma"] = "no-cache"
    return headers
172
 
173
async def polite_pause():
    """Sleep a small random jitter between requests to stay gentle on servers."""
    low, high = JITTER_RANGE
    await asyncio.sleep(random.uniform(low, high))
175
+
176
async def fetch(url: str) -> Optional[str]:
    """Download *url* and return the HTML body, or None once all retries fail.

    A fresh AsyncClient is built per request so headers vary each time
    (basic anti-fingerprinting). Retries use exponential backoff with jitter.

    NOTE(review): httpx's ``proxies=`` kwarg is deprecated in newer httpx
    (renamed ``proxy=``) and ``http2=True`` needs the ``h2`` extra — confirm
    against the pinned httpx version.
    """
    proxies = {"all://": PROXY_URL} if PROXY_URL else None
    for attempt in range(RETRIES + 1):
        try:
            async with httpx.AsyncClient(follow_redirects=True, http2=True, proxies=proxies, timeout=TIMEOUT) as client:
                r = await client.get(url, headers=make_headers())
            # Non-streaming get() fully reads the body, so r is usable after close.
            if r.status_code == 200 and r.text:
                return r.text
        except Exception:
            pass  # network/HTTP error: fall through to backoff/retry
        # Fix: only sleep when another attempt follows; the original also slept
        # after the last attempt (and while holding the connection open).
        if attempt < RETRIES:
            await asyncio.sleep(BACKOFF_BASE * (2 ** attempt) + random.uniform(0, 0.3))
    return None
190
 
191
+ async def fetch_detail_and_enrich(lst: Listing) -> Listing:
192
+ html = await fetch(lst.link)
193
  if not html:
194
  return lst
195
  soup = BeautifulSoup(html, "lxml")
196
 
197
+ # Descripción
198
  desc_el = soup.find(["div", "section"], attrs={"class": re.compile(r"(description|Description|post|body)")}) or soup.find("p")
199
  if desc_el:
200
  desc = clean_text(desc_el.get_text(" ", strip=True))
201
  else:
202
+ desc = clean_text(" ".join(t.get_text(" ", strip=True) for t in soup.find_all(["p", "li"])[:40]))
203
 
204
  patio, terraza, mascotas, bidet = feature_guess(desc)
205
 
206
+ # Features (ambientes / baños / dormitorios)
207
  features_text = " ".join(
208
  el.get_text(" ", strip=True)
209
  for el in soup.find_all(["li", "span", "div"])
210
  if el and el.get_text() and any(x in el.get_text().lower() for x in ["ambiente", "dorm", "bañ"])
211
+ ).lower()
212
+ rooms = lst.rooms
213
+ bathrooms = lst.bathrooms
214
+ bedrooms = lst.bedrooms
215
+ m = re.search(r"(\d+)\s*ambiente", features_text)
216
+ if m: rooms = extract_int(m.group(1))
217
+ m = re.search(r"(\d+)\s*bañ", features_text)
218
+ if m: bathrooms = extract_int(m.group(1))
219
+ m = re.search(r"(\d+)\s*dorm", features_text)
220
+ if m: bedrooms = extract_int(m.group(1))
221
 
222
  addr_guess = soup.find(attrs={"class": re.compile(r"(address|ubicacion|location|inmo-location)")})
223
  if addr_guess and not lst.address:
224
  lst.address = clean_text(addr_guess.get_text(" ", strip=True))[:200]
225
 
226
  lst.description = desc or lst.description
227
+ if lst.has_patio is None: lst.has_patio = patio
228
+ if lst.has_terrace is None: lst.has_terrace = terraza
229
+ if lst.pet_friendly is None: lst.pet_friendly = mascotas
230
+ if lst.has_bidet is None: lst.has_bidet = bidet
231
  lst.rooms = rooms
232
  lst.bathrooms = bathrooms
233
  lst.bedrooms = bedrooms
 
282
  price_text = (m.group(0) if m else "")
283
  addr_m = re.search(r"(Saavedra|Nu[eñ]ez|La Lucila|Florida|Munro|Carapachay|Olivos|Martelli)[^|,]*", block_text, re.IGNORECASE)
284
  address_text = addr_m.group(0) if addr_m else ""
285
+ # Armar link absoluto si fuera relativo
286
+ link_abs = href if href.startswith("http") else f"https://{domain}{href}"
287
  cards.append({
288
  "title": title or "",
289
+ "link": link_abs,
290
  "price_text": price_text,
291
  "addr_text": address_text
292
  })
293
+ # Filtrar ruido
294
  filtered = []
295
  for c in cards:
296
  if len(c["title"]) < 8:
297
  continue
298
+ if any(tok in c["link"] for tok in ["/perfil/", "/inmobiliaria/", "/ayuda", "/faq", "/login", "/like", "/favorito"]):
299
  continue
300
  filtered.append(c)
301
  return filtered
302
 
303
+ async def scrape_search_page(url: str, domain: str) -> List[Listing]:
304
+ html = await fetch(url)
305
+ await polite_pause()
306
  if not html:
307
  return []
308
  soup = BeautifulSoup(html, "lxml")
 
324
  description=None,
325
  score=0.0
326
  ))
327
+ # Limitar por página para evitar ruido excesivo
328
  return listings[:25]
329
 
330
async def scrape_portal(urls: List[str], domain: str) -> List[Listing]:
    """Scrape up to four search-result pages of one portal, ignoring failures.

    Best-effort: a single broken page must not abort the whole portal.
    """
    collected: List[Listing] = []
    # Cap at 4 queries per portal to keep runs fast and polite.
    for search_url in urls[:4]:
        try:
            collected.extend(await scrape_search_page(search_url, domain))
        except Exception:
            pass  # skip this page, keep the rest
    return collected
340
 
341
  # =========================
 
359
  require_pet=require_pet,
360
  )
361
 
362
+ # 1) Generar URLs de búsqueda
363
+ z_urls = zonaprop_search_urls(neighborhoods, max_price_usd, types)
364
+ a_urls = argenprop_search_urls(neighborhoods, max_price_usd, types)
365
+ p_urls = properati_search_urls(neighborhoods, max_price_usd, types)
366
+
367
+ # 2) Scrapeo base
368
+ batch_lists = await asyncio.gather(
369
+ scrape_portal(z_urls, "www.zonaprop.com.ar"),
370
+ scrape_portal(a_urls, "www.argenprop.com"),
371
+ scrape_portal(p_urls, "www.properati.com.ar"),
372
+ )
373
+ listings = [l for batch in batch_lists for l in batch]
374
+
375
+ # 3) Deduplicar por link
376
+ seen = set()
377
+ unique: List[Listing] = []
378
+ for l in listings:
379
+ if l.link in seen:
380
+ continue
381
+ seen.add(l.link)
382
+ unique.append(l)
383
+
384
+ # 4) Enriquecer con detalle en paralelo (concurrencia acotada)
385
+ sem = asyncio.Semaphore(MAX_CONCURRENCY)
386
+ async def enrich_guarded(l: Listing):
387
+ async with sem:
388
+ enriched = await fetch_detail_and_enrich(l)
389
+ await polite_pause()
390
+ return enriched
391
+
392
+ enriched = await asyncio.gather(*[enrich_guarded(l) for l in unique])
393
+
394
+ # 5) Filtros duros
395
+ def passes(l: Listing) -> bool:
396
+ if l.price_usd is None or l.price_usd > max_price_usd:
397
+ return False
398
+ if l.rooms is not None and l.rooms < min_rooms:
399
+ return False
400
+ if require_outdoor and not ((l.has_patio is True) or (l.has_terrace is True)):
401
+ return False
402
+ if require_bidet and l.has_bidet is not True:
403
+ return False
404
+ if require_pet and l.pet_friendly is not True:
405
+ return False
406
+ # Tipo: tolerante si no se menciona explícitamente
407
+ text_mix = (l.title + " " + (l.description or "")).lower()
408
+ if not any(t in text_mix for t in types):
409
+ pass
410
+ return True
411
+
412
+ filtered = [l for l in enriched if passes(l)]
413
+
414
+ # 6) Scoring y orden
415
+ for l in filtered:
416
+ l.score = compute_score(l, filters)
417
+ filtered.sort(key=lambda x: (-x.score, x.price_usd or 1e9))
418
+ return filtered
419
 
420
  def listings_to_df(listings: List[Listing]) -> pd.DataFrame:
421
  rows = []
 
442
  return df
443
 
444
  # =========================
445
+ # Email sender
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
446
  # =========================
447
 
448
+ EMAIL_REGEX = re.compile(r"^[^@\s]+@[^@\s]+\.[^@\s]+$")
449
+
450
def build_email(subject: str, sender: str, to_addr: str, body_html: str, attachments: List[Tuple[str, bytes, str]]) -> EmailMessage:
    """Assemble a multipart email: plain-text fallback, HTML body, attachments.

    Each attachment is a ``(filename, raw_bytes, mime_type)`` tuple; a mime
    type without a "/" falls back to application/octet-stream.
    """
    msg = EmailMessage()
    msg["Subject"] = subject
    msg["From"] = sender
    msg["To"] = to_addr
    msg.set_content("Este mensaje requiere un cliente compatible HTML.")
    msg.add_alternative(body_html, subtype="html")
    for filename, payload, mime_type in attachments:
        if "/" in mime_type:
            maintype, subtype = mime_type.split("/", 1)
        else:
            maintype, subtype = "application", "octet-stream"
        msg.add_attachment(payload, maintype=maintype, subtype=subtype, filename=filename)
    return msg
461
+
462
def send_email(to_addr: str, subject: str, html_body: str, attachments: List[Tuple[str, bytes, str]]) -> str:
    """Send an HTML email with attachments through the configured SMTP server.

    Returns "OK" on success, otherwise a human-readable error string
    (missing config, invalid recipient, or the SMTP exception text).
    """
    smtp_ready = all([SMTP_HOST, SMTP_PORT, SMTP_USER, SMTP_PASS, SMTP_FROM])
    if not smtp_ready:
        return "Error: SMTP no configurado en Secrets (SMTP_HOST, SMTP_PORT, SMTP_USER, SMTP_PASS, SMTP_FROM)."
    if not EMAIL_REGEX.match(to_addr):
        return "Error: email destino inválido."
    message = build_email(subject, SMTP_FROM, to_addr, html_body, attachments)
    use_implicit_tls = SMTP_USE_SSL or SMTP_PORT == 465
    try:
        if use_implicit_tls:
            # Implicit TLS (typically port 465)
            tls_context = ssl.create_default_context()
            with smtplib.SMTP_SSL(SMTP_HOST, SMTP_PORT, context=tls_context) as conn:
                conn.login(SMTP_USER, SMTP_PASS)
                conn.send_message(message)
        else:
            # Plain connection upgraded via STARTTLS (typically port 587)
            with smtplib.SMTP(SMTP_HOST, SMTP_PORT) as conn:
                conn.ehlo()
                conn.starttls()
                conn.ehlo()
                conn.login(SMTP_USER, SMTP_PASS)
                conn.send_message(message)
    except Exception as e:
        return f"Error enviando email: {e}"
    return "OK"
484
+
485
def df_to_csv_bytes(df: pd.DataFrame) -> bytes:
    """Serialize a DataFrame to UTF-8 CSV bytes, without the index column."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode("utf-8")
487
+
488
def json_to_bytes(obj: Any) -> bytes:
    """Serialize *obj* to pretty-printed, UTF-8-encoded JSON bytes."""
    text = json.dumps(obj, ensure_ascii=False, indent=2)
    return text.encode("utf-8")
490
+
491
def render_summary_html(df: pd.DataFrame, neighborhoods: List[str], max_usd: int, min_rooms: int) -> str:
    """Render the results-email HTML: filter summary plus the top-10 listings.

    Listings are ranked by (Score desc, Precio USD asc). Security fix: titles,
    addresses and links are scraped from third-party pages (untrusted input),
    so they are HTML-escaped before being interpolated into the markup —
    the original injected them raw, allowing HTML injection into the email.
    """
    from html import escape  # stdlib; local import to leave the file header untouched

    count = len(df)
    head = (
        f"<h2>Resultados de tu búsqueda</h2>"
        f"<p><b>Zonas:</b> {escape(', '.join(neighborhoods))}<br>"
        f"<b>Precio máx.:</b> USD {max_usd}<br>"
        f"<b>Ambientes mín.:</b> {min_rooms}<br>"
        f"<b>Total:</b> {count}</p>"
    )
    if count == 0:
        return head + "<p>No se encontraron resultados con los filtros actuales.</p>"
    top_rows = df.sort_values(by=["Score", "Precio USD"], ascending=[False, True]).head(10)
    items = []
    for _, row in top_rows.iterrows():
        flags = " · ".join(k for k in ["Patio", "Terraza", "Mascotas", "Bidet"] if bool(row.get(k))) or "—"
        price = int(row["Precio USD"]) if pd.notna(row["Precio USD"]) else "-"
        title = escape(str(row["Título"]))
        area = escape(str(row.get("Dirección/Área") or ""))
        link = escape(str(row["Link"]), quote=True)
        items.append(f"<li><b>{title}</b> — USD {price} — {area} — {flags} — <a href='{link}'>Link</a></li>")
    return head + "<ol>" + "\n".join(items) + "</ol>"
 
502
 
503
  # =========================
504
  # UI (Gradio)
 
507
  DESCRIPTION = """
508
  Agente agregador de avisos (Zonaprop, Argenprop, Properati) para Saavedra → La Lucila y alrededores.
509
  Filtra: USD ≤ 90k, ≥ 3 ambientes (para oficina), patio/terraza, mascotas, bidet (si figura en descripción).
510
+ Al finalizar, podés enviar el resumen a tu email con CSV y JSON adjuntos.
 
511
  """
512
 
513
async def run_and_present(neighs, max_usd, types, min_rooms, req_outdoor, req_bidet, req_pet, email_to, send_email_flag):
    """Gradio callback: run the scraping agent and optionally email the results.

    Args mirror the UI widgets (comma-separated text for neighs/types, numeric
    and boolean filters, recipient email plus a send toggle).

    Returns (results DataFrame, pretty-printed JSON string, email status text).
    """
    # Parse comma-separated UI text inputs into clean lists.
    neighs_list = [n.strip() for n in str(neighs).split(",") if n.strip()]
    types_list = [t.strip().lower() for t in str(types).split(",") if t.strip()]

    results = await run_agent(
        neighborhoods=neighs_list,
        max_price_usd=int(max_usd),
        types=types_list,
        min_rooms=int(min_rooms),
        require_outdoor=bool(req_outdoor),
        require_bidet=bool(req_bidet),
        require_pet=bool(req_pet)
    )
    df = listings_to_df(results)
    json_blob = [asdict(l) for l in results]

    # Optional email delivery
    email_status = "Email no enviado."
    if send_email_flag:
        if not EMAIL_REGEX.match(email_to or ""):
            email_status = "Error: email destino inválido."
        else:
            html = render_summary_html(df, neighs_list, int(max_usd), int(min_rooms))
            attachments = []
            if not df.empty:
                attachments.append(("resultados.csv", df_to_csv_bytes(df), "text/csv"))
            attachments.append(("resultados.json", json_to_bytes(json_blob), "application/json"))
            # Fix: send_email uses blocking smtplib; running it directly in this
            # coroutine stalled the event loop (and the whole Gradio UI) for the
            # duration of the SMTP exchange. Offload it to a worker thread.
            status = await asyncio.to_thread(
                send_email,
                to_addr=email_to,
                subject="Resultados de casas/PH (≤ USD 90k) – Norte BA",
                html_body=html,
                attachments=attachments
            )
            email_status = "Enviado" if status == "OK" else status

    return df, json.dumps(json_blob, ensure_ascii=False, indent=2), email_status
 
 
549
 
550
  with gr.Blocks(title="Agente Inmuebles Norte BA (≤ USD 90k)") as demo:
551
  gr.Markdown("# Agente de casas/PH norte BA (≤ 90 000 USD)")
 
560
  req_outdoor = gr.Checkbox(label="Requerir patio o terraza", value=REQUIRE_OUTDOOR)
561
  req_bidet = gr.Checkbox(label="Requerir bidet (solo si aparece en descripción)", value=REQUIRE_BIDET)
562
  req_pet = gr.Checkbox(label="Requerir pet-friendly (si aparece en descripción)", value=REQUIRE_PET_FRIENDLY)
563
+ gr.Markdown("### Envío por email (opcional al finalizar)")
564
+ with gr.Row():
565
+ email_to = gr.Textbox(label="Tu email para recibir los resultados", placeholder="tu@correo.com")
566
+ send_email_flag = gr.Checkbox(label="Enviar email al finalizar", value=True)
567
 
568
  btn = gr.Button("Buscar ahora", variant="primary")
569
+
570
  with gr.Tabs():
571
  with gr.Tab("Resultados"):
572
+ table = gr.Dataframe(interactive=False, wrap=True, max_rows=300)
573
  with gr.Tab("JSON"):
574
  j = gr.Code(language="json")
575
+ with gr.Tab("Estado de email"):
576
+ status = gr.Markdown("—")
577
 
578
+ btn.click(
579
+ run_and_present,
580
+ inputs=[neighs, max_usd, types, min_rooms, req_outdoor, req_bidet, req_pet, email_to, send_email_flag],
581
+ outputs=[table, j, status]
582
+ )
 
 
 
 
 
 
 
 
583
 
584
  if __name__ == "__main__":
585
+ demo.launch()