Lukeetah committed on
Commit
0c27eef
·
verified ·
1 Parent(s): fe1eacf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +474 -377
app.py CHANGED
@@ -1,400 +1,497 @@
1
- # app.py
2
- # SamuelHouseFinder - FastAPI + Playwright scrapers + Gradio UI (single file)
3
- # Requisitos: ejecutar en contenedor con Playwright browsers (Dockerfile incluido abajo)
4
  import os
5
- import asyncio
6
- import json
7
  import re
8
- import sqlite3
9
- from typing import List, Dict, Any, Optional
10
- from datetime import datetime
11
- from urllib.parse import urljoin
12
-
13
- from fastapi import FastAPI, HTTPException, Request, BackgroundTasks
14
- from fastapi.responses import JSONResponse, HTMLResponse
15
- import uvicorn
 
16
  import httpx
17
  from bs4 import BeautifulSoup
18
-
19
- # Gradio UI
 
20
  import gradio as gr
21
 
22
- # Playwright
23
- from playwright.async_api import async_playwright, TimeoutError as PWTimeout
24
-
25
- # ---------------- CONFIG ----------------
26
- DB_PATH = os.environ.get("DB_PATH", "data/properties.db")
27
- PROXY_LIST = os.environ.get("PROXY_LIST") # comma separated http://user:pass@ip:port
28
- CAPTCHA_API_KEY = os.environ.get("CAPTCHA_API_KEY") # optional
29
- MAX_CONCURRENT_BROWSERS = int(os.environ.get("MAX_BROWSERS", "2"))
30
- DEFAULT_MAX_PAGES = int(os.environ.get("DEFAULT_MAX_PAGES", "2"))
31
- USER_AGENTS = [
32
- # Expand this list for production
33
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
34
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
35
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15",
36
  ]
 
 
 
 
 
37
 
38
- os.makedirs(os.path.dirname(DB_PATH) or ".", exist_ok=True)
39
-
40
- # ---------------- DB ----------------
41
- def init_db():
42
- conn = sqlite3.connect(DB_PATH)
43
- c = conn.cursor()
44
- c.execute("""
45
- CREATE TABLE IF NOT EXISTS properties (
46
- id INTEGER PRIMARY KEY AUTOINCREMENT,
47
- title TEXT,
48
- price TEXT,
49
- currency TEXT,
50
- address TEXT,
51
- neighbourhood TEXT,
52
- lat REAL,
53
- lon REAL,
54
- bedrooms INTEGER,
55
- bathrooms INTEGER,
56
- surface REAL,
57
- amenities TEXT,
58
- source TEXT,
59
- url TEXT UNIQUE,
60
- scraped_at TEXT,
61
- raw_html TEXT
62
- )
63
- """)
64
- conn.commit()
65
- conn.close()
66
-
67
- def save_property(item: Dict[str,Any]) -> bool:
68
- conn = sqlite3.connect(DB_PATH)
69
- c = conn.cursor()
70
- try:
71
- c.execute("""
72
- INSERT INTO properties
73
- (title,price,currency,address,neighbourhood,lat,lon,bedrooms,bathrooms,surface,amenities,source,url,scraped_at,raw_html)
74
- VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
75
- """, (
76
- item.get("title"),
77
- item.get("price"),
78
- item.get("currency"),
79
- item.get("address"),
80
- item.get("neighbourhood"),
81
- item.get("lat"),
82
- item.get("lon"),
83
- item.get("bedrooms"),
84
- item.get("bathrooms"),
85
- item.get("surface"),
86
- json.dumps(item.get("amenities",[]), ensure_ascii=False),
87
- item.get("source"),
88
- item.get("url"),
89
- datetime.utcnow().isoformat(),
90
- item.get("raw_html","")[:10000]
91
- ))
92
- conn.commit()
93
- return True
94
- except sqlite3.IntegrityError:
95
- # already exists
 
 
 
 
 
 
 
 
 
 
 
 
96
  return False
97
- except Exception as e:
98
- print("DB save error:", e)
 
 
 
99
  return False
100
- finally:
101
- conn.close()
102
-
103
- def query_db(q: Optional[str]=None, min_price: Optional[int]=None, max_price: Optional[int]=None, bedrooms: Optional[int]=None, source: Optional[str]=None, limit:int=200):
104
- conn = sqlite3.connect(DB_PATH)
105
- conn.row_factory = sqlite3.Row
106
- c = conn.cursor()
107
- sql = "SELECT * FROM properties WHERE 1=1"
108
- params = []
109
- if q:
110
- sql += " AND (title LIKE ? OR address LIKE ? OR neighbourhood LIKE ?)"
111
- qv = f"%{q}%"
112
- params += [qv,qv,qv]
113
- if source:
114
- sql += " AND source = ?"
115
- params.append(source)
116
- # NOTE: price is stored as text (different formats). For production parse and store numeric.
117
- sql += " ORDER BY scraped_at DESC LIMIT ?"
118
- params.append(limit)
119
- rows = c.execute(sql, params).fetchall()
120
- conn.close()
121
- return [dict(r) for r in rows]
122
-
123
- # ---------------- UTIL ----------------
124
- def get_proxies_list():
125
- if not PROXY_LIST:
126
- return []
127
- return [p.strip() for p in PROXY_LIST.split(",") if p.strip()]
128
 
129
- def pick_proxy(idx=0):
130
- lst = get_proxies_list()
131
- if not lst:
132
- return None
133
- return lst[idx % len(lst)]
 
 
134
 
135
- async def validate_url(client: httpx.AsyncClient, url: str) -> bool:
136
- try:
137
- r = await client.head(url, follow_redirects=True, timeout=15)
138
- return r.status_code == 200
139
- except Exception:
140
- try:
141
- r2 = await client.get(url, follow_redirects=True, timeout=20)
142
- return r2.status_code == 200
143
- except Exception:
144
- return False
145
-
146
- # ---------------- SCRAPERS (Playwright) ----------------
147
- # Each scraper returns list[dict] with canonical fields (see save_property)
148
-
149
- async def scrape_mercadolibre(pw, location:str, max_pages:int=1, idx_offset=0) -> List[Dict[str,Any]]:
150
- """Scrapes MercadoLibre Inmuebles listing pages (rendered)."""
151
- out = []
152
- base = "https://listado.mercadolibre.com.ar"
153
- # build query: try location as-is and also appended 'venta'
154
- q = location.replace(" ", "-")
155
- async with pw.chromium.launch(headless=True, args=["--no-sandbox"]) as browser:
156
- for p in range(1, max_pages+1):
157
- # MercadoLibre pagination is usually offset-based; try two patterns
158
- page_path = f"/{q}_Desde_{(p-1)*50+1}"
159
- url = urljoin(base, page_path)
160
- proxy = pick_proxy(p-1)
161
- context_args = {}
162
- if proxy:
163
- context_args["proxy"] = {"server": proxy}
164
- ua = USER_AGENTS[(idx_offset + p) % len(USER_AGENTS)]
165
- context_args["user_agent"] = ua
166
- context = await browser.new_context(**context_args)
167
- page = await context.new_page()
168
- try:
169
- await page.goto(url, wait_until="networkidle", timeout=30000)
170
- # ML often lazy-loads; ensure content loaded
171
- await page.wait_for_timeout(1500)
172
- html = await page.content()
173
- except PWTimeout:
174
- html = await page.content()
175
- except Exception as e:
176
- print("ML page error:", e)
177
- html = ""
178
- finally:
179
- try:
180
- await page.close()
181
- await context.close()
182
- except Exception:
183
- pass
184
- if not html:
185
- continue
186
- soup = BeautifulSoup(html, "html.parser")
187
- # Search for anchor tags that likely link to properties
188
- anchors = soup.select("a[href]")
189
- found = set()
190
- for a in anchors:
191
- href = a.get("href")
192
- if not href:
193
- continue
194
- # heuristics: property detail urls often contain '/MLA-' or '/MLO-' or '/inmuebles'
195
- if re.search(r"/MLA-|/MLO-|/inmuebles/", href):
196
- full = href if href.startswith("http") else urljoin(base, href)
197
- if full in found:
198
- continue
199
- found.add(full)
200
- title = (a.get_text(strip=True) or "Propiedad MercadoLibre")[:300]
201
- out.append({
202
- "title": title,
203
- "price": None,
204
- "currency": "ARS",
205
- "address": None,
206
- "neighbourhood": None,
207
- "lat": None,
208
- "lon": None,
209
- "bedrooms": None,
210
- "bathrooms": None,
211
- "surface": None,
212
- "amenities": [],
213
- "source": "MercadoLibre",
214
- "url": full,
215
- "raw_html": str(a)[:8000]
216
- })
217
- return out
218
 
219
- async def scrape_properati(pw, location:str, max_pages:int=1, idx_offset=0) -> List[Dict[str,Any]]:
220
- out = []
221
- base = "https://www.properati.com.ar"
222
- url = f"{base}/search?q={location}"
223
- proxy = pick_proxy(idx_offset)
224
- ua = USER_AGENTS[idx_offset % len(USER_AGENTS)]
225
- async with pw.chromium.launch(headless=True, args=["--no-sandbox"]) as browser:
226
- context_args = {"user_agent": ua}
227
- if proxy:
228
- context_args["proxy"] = {"server": proxy}
229
- context = await browser.new_context(**context_args)
230
- page = await context.new_page()
231
- try:
232
- await page.goto(url, wait_until="networkidle", timeout=30000)
233
- await page.wait_for_timeout(1200)
234
- html = await page.content()
235
- except Exception as e:
236
- print("Properati error:", e)
237
- html = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  try:
239
- await page.close()
240
- await context.close()
 
 
241
  except Exception:
242
- pass
 
 
 
 
243
  if not html:
244
- return out
245
- soup = BeautifulSoup(html, "html.parser")
246
- cards = soup.select("a[href]")
247
- found=set()
248
- for a in cards:
249
- href=a.get("href")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  if not href:
251
  continue
252
- if "/property/" in href or "/inmuebles/" in href or "/propiedad" in href:
253
- full = href if href.startswith("http") else urljoin(base, href)
254
- if full in found: continue
255
- found.add(full)
256
- title=(a.get_text(strip=True) or "Propiedad Properati")[:300]
257
- out.append({
258
- "title": title,
259
- "price": None,
260
- "currency": "ARS",
261
- "address": None,
262
- "neighbourhood": None,
263
- "lat": None,
264
- "lon": None,
265
- "bedrooms": None,
266
- "bathrooms": None,
267
- "surface": None,
268
- "amenities": [],
269
- "source": "Properati",
270
- "url": full,
271
- "raw_html": str(a)[:8000]
 
 
272
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  return out
274
 
275
- # Extendable: add ZonaProp, Inmuebles, ArgenProp, etc.
276
-
277
- # ---------------- ORCHESTRATOR ----------------
278
- app = FastAPI(title="SamuelHouseFinder API")
279
- init_db()
280
-
281
- async def run_all_scrapers(location: str, sources: List[str], max_pages:int=1, force:bool=False) -> Dict[str,Any]:
282
- results = []
283
- async with async_playwright() as pw:
284
- tasks=[]
285
- idx=0
286
- for s in sources:
287
- if s.lower()=="mercadolibre":
288
- tasks.append(scrape_mercadolibre(pw, location, max_pages=max_pages, idx_offset=idx))
289
- elif s.lower()=="properati":
290
- tasks.append(scrape_properati(pw, location, max_pages=max_pages, idx_offset=idx))
291
- else:
292
- # unknown: skip for now
293
- pass
294
- idx+=1
295
- # run concurrently but throttle by MAX_CONCURRENT_BROWSERS
296
- gathered=[]
297
- sem = asyncio.Semaphore(MAX_CONCURRENT_BROWSERS)
298
- async def sem_task(coro):
299
- async with sem:
300
- return await coro
301
- gathered = await asyncio.gather(*[sem_task(t) for t in tasks], return_exceptions=True)
302
- # flatten
303
- all_items=[]
304
- for g in gathered:
305
- if isinstance(g, Exception):
306
- print("scrape exception:", g)
 
 
 
 
 
 
 
 
 
307
  continue
308
- all_items.extend(g)
309
- # validate urls and save
310
- async with httpx.AsyncClient(follow_redirects=True, timeout=20) as client:
311
- valid=[]
312
- for i,it in enumerate(all_items):
313
- ok = await validate_url(client, it["url"])
314
- if ok:
315
- saved = save_property(it)
316
- valid.append(it)
317
- return {"found": len(all_items), "validated": len(valid)}
318
-
319
- @app.post("/api/scrape")
320
- async def api_scrape(req: Request):
321
- payload = await req.json()
322
- location = payload.get("location")
323
- if not location:
324
- raise HTTPException(status_code=400, detail="location required")
325
- sources = payload.get("sources", ["mercadolibre","properati"])
326
- max_pages = int(payload.get("max_pages", DEFAULT_MAX_PAGES))
327
- force = bool(payload.get("force", False))
328
- result = await run_all_scrapers(location, sources, max_pages=max_pages, force=force)
329
- return JSONResponse(result)
330
-
331
- @app.get("/api/search")
332
- async def api_search(q: Optional[str]=None, source: Optional[str]=None, limit:int=200):
333
- data = query_db(q=q, source=source, limit=limit)
334
- return JSONResponse({"items": data, "count": len(data)})
335
-
336
- @app.get("/api/health")
337
- async def health():
338
- return JSONResponse({"ok":True,"time":datetime.utcnow().isoformat()})
339
-
340
- # ---------------- GRADIO UI (simple) ----------------
341
- def frontend_invoke_scrape(location, sources, max_pages, force_flag):
342
- # call local API (same process) synchronously
343
- import requests
344
- payload = {"location":location, "sources": [s.strip() for s in sources.split(",") if s.strip()], "max_pages":int(max_pages), "force":bool(force_flag)}
345
- try:
346
- r = requests.post("http://127.0.0.1:8000/api/scrape", json=payload, timeout=600)
347
- r.raise_for_status()
348
- return f"Scrape iniciado: {r.json()}"
349
- except Exception as e:
350
- return f"Error al iniciar scrape: {e}"
351
-
352
- def frontend_query(q_text, source):
353
- import requests
354
- try:
355
- params = {}
356
- if q_text: params["q"] = q_text
357
- if source: params["source"] = source
358
- r = requests.get("http://127.0.0.1:8000/api/search", params=params, timeout=60)
359
- r.raise_for_status()
360
- items = r.json().get("items", [])
361
- # convert to table-friendly list
362
- rows = []
363
- for it in items:
364
- rows.append([it.get("title"), it.get("price"), it.get("currency"), it.get("source"), it.get("url")])
365
- return rows
366
- except Exception as e:
367
- return [["Error", str(e), "", "", ""]]
368
-
369
- def mount_gradio():
370
- with gr.Blocks(title="SamuelHouseFinder") as demo:
371
- gr.Markdown("## SamuelHouseFinder — Zona Norte (Saavedra → La Lucila)\nBackend con Playwright. Usá con cuidado y respetá TOS de portales.")
372
- with gr.Row():
373
- with gr.Column():
374
- loc = gr.Textbox(label="Ubicación (ej: Martinez, Olivos, Saavedra)", value="Saavedra")
375
- srcs = gr.Textbox(label="Fuentes (csv)", value="mercadolibre,properati")
376
- pages = gr.Slider(label="Páginas por fuente", minimum=1, maximum=5, value=1)
377
- force = gr.Checkbox(label="Forzar re-scrape", value=False)
378
- btn = gr.Button("Buscar y Scrappear")
379
- out = gr.Textbox(label="Estado")
380
- with gr.Column():
381
- qtxt = gr.Textbox(label="Buscar en DB (texto libre)", value="")
382
- qsrc = gr.Textbox(label="Fuente (opcional)", value="")
383
- qbtn = gr.Button("Consultar DB")
384
- table = gr.Dataframe(headers=["title","price","currency","source","url"], datatype=["str","str","str","str","str"])
385
- btn.click(frontend_invoke_scrape, inputs=[loc, srcs, pages, force], outputs=[out])
386
- qbtn.click(frontend_query, inputs=[qtxt, qsrc], outputs=[table])
387
- return demo
388
-
389
- # ---------------- RUN ----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
  if __name__ == "__main__":
391
- # Run FastAPI + Gradio in same process: start FastAPI in background, then Gradio
392
- import threading, time
393
- def start_uvicorn():
394
- uvicorn.run("app:app", host="0.0.0.0", port=8000, log_level="info")
395
- t = threading.Thread(target=start_uvicorn, daemon=True)
396
- t.start()
397
- # wait a moment for server
398
- time.sleep(1.5)
399
- demo = mount_gradio()
400
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
1
  import os
 
 
2
  import re
3
+ import time
4
+ import math
5
+ import json
6
+ import asyncio
7
+ import random
8
+ from dataclasses import dataclass, asdict
9
+ from typing import List, Optional, Dict, Any, Tuple
10
+ import urllib.parse as ul
11
+
12
  import httpx
13
  from bs4 import BeautifulSoup
14
+ from rapidfuzz import fuzz, process
15
+ from pydantic import BaseModel, HttpUrl, ValidationError
16
+ import pandas as pd
17
  import gradio as gr
18
 
19
# =========================
# Main configuration
# =========================

DEFAULT_MAX_USD = 90000
DEFAULT_NEIGHBORHOODS = [
    # Core areas requested by the user
    "Saavedra", "Nuñez", "La Lucila", "Florida Oeste", "Munro", "Carapachay",
    # Nearby areas, useful to widen the available stock
    "Olivos", "Villa Martelli"
]
DEFAULT_TYPES = ["casa", "ph"]  # house / PH
DEFAULT_MIN_ROOMS = 3           # "ambientes" — guarantees room for a home office
REQUIRE_BIDET = True
REQUIRE_PET_FRIENDLY = True
REQUIRE_OUTDOOR = True          # patio or terrace

# Rotated per-request to vary the request fingerprint.
USER_AGENT_POOL = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 "
    "(KHTML, like Gecko) Version/16.4 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
]
44
+
45
TIMEOUT = httpx.Timeout(20.0, connect=10.0)
MAX_CONCURRENCY = 6   # simultaneous detail-page fetches
RETRIES = 2           # extra attempts per request (total tries = RETRIES + 1)
BACKOFF_BASE = 0.8    # seconds; doubled on each retry

# Prioritized residential micro-areas (positive heuristic, editable)
MICROZONAS_PRIORITARIAS = [
    # Saavedra
    "Parque Saavedra", "Parque Sarmiento", "Av. Balbín", "Ruiz Huidobro",
    # Núñez
    "Lomas de Nuñez", "Cabildo", "Plaza Alberti",
    # La Lucila
    "Estación La Lucila", "Rawson", "Paraná", "Maipú",
    # Florida / Carapachay / Munro / Martelli / Olivos
    "Estación Florida", "Estación Carapachay", "Estación Munro",
    "Ugarte", "San Martín", "Panamericana", "Paraná", "Pelliza", "Melo",
]
62
+
63
+ # =========================
64
+ # Modelos y utilidades
65
+ # =========================
66
+
67
@dataclass
class Listing:
    """One normalized real-estate ad gathered from any supported portal.

    Optional fields stay ``None`` when the source page did not reveal them.
    Boolean feature flags use ``None`` for "unknown" (never "absent") so
    later enrichment can fill them in without clobbering known values.
    """
    source: str                  # portal domain the ad came from
    title: str
    link: str
    price_usd: Optional[float]   # set only when the ad is priced in USD
    currency: Optional[str]
    address: Optional[str]
    neighborhood: Optional[str]
    city: Optional[str]
    rooms: Optional[int]         # "ambientes"
    bedrooms: Optional[int]
    bathrooms: Optional[int]
    has_patio: Optional[bool]
    has_terrace: Optional[bool]
    pet_friendly: Optional[bool]
    has_bidet: Optional[bool]
    description: Optional[str]
    score: float                 # ranking score assigned by compute_score
86
+
87
def to_float_price(value: str) -> Optional[float]:
    """Parse a price string and return its USD amount, or ``None``.

    Handles Argentine formatting ("." thousands separator, "," decimals).
    Prices not explicitly denominated in dollars (i.e. ARS) return ``None``:
    this agent does no FX conversion, so those listings are skipped.
    """
    if not value:
        return None
    # Normalize: drop thousands dots, turn decimal commas into dots, uppercase.
    txt = value.replace(".", "").replace(",", ".").upper()
    # Detect a dollar denomination. Bug fix: also match the accented
    # "DÓLAR(ES)" spelling — the plain "DOLAR" check silently dropped it.
    if "USD" in txt or "U$S" in txt or "U$D" in txt or "DOLAR" in txt or "DÓLAR" in txt:
        m = re.search(r"(\d+(?:\.\d+)?)", txt)
        return float(m.group(1)) if m else None
    # ARS (or unknown currency): skip — no conversion available.
    return None
97
+
98
def extract_int(text: str) -> Optional[int]:
    """Return the first run of digits in *text* as an int, or ``None``."""
    if not text:
        return None
    match = re.search(r"(\d+)", text)
    if match is None:
        return None
    return int(match.group(1))
103
+
104
def text_has_any(text: str, keywords: List[str]) -> bool:
    """Case-insensitive substring check: True if any keyword occurs in *text*."""
    if not text:
        return False
    lowered = text.lower()
    for keyword in keywords:
        if keyword.lower() in lowered:
            return True
    return False
109
+
110
def fuzzy_any(text: str, keywords: List[str], thresh: int = 80) -> bool:
    """True when any keyword fuzzily matches *text* (partial ratio >= thresh)."""
    if not text:
        return False
    lowered = text.lower()
    return any(fuzz.partial_ratio(lowered, kw.lower()) >= thresh for kw in keywords)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
def feature_guess(desc: str) -> Tuple[Optional[bool], Optional[bool], Optional[bool], Optional[bool]]:
    """Heuristically detect (patio, terrace, pets-ok, bidet) mentions in *desc*.

    Each element is ``True`` when mentioned and ``None`` when not found —
    ``None`` means "unknown", never "absent", so later merges don't lose data.
    """
    has_patio = fuzzy_any(desc, ["patio", "patio propio", "patio descubierto", "fondo", "jardín"])
    has_terrace = fuzzy_any(desc, ["terraza", "terraza propia", "terraza transitable", "azotea"])
    pets_ok = fuzzy_any(desc, ["se aceptan mascotas", "pet friendly", "apta mascotas"])
    # Strict on purpose: many homes have a bidet without saying so, but we
    # only trust explicit mentions (filtered at the user's request).
    bidet_ok = fuzzy_any(desc, ["bidet"])
    return (has_patio or None, has_terrace or None, pets_ok or None, bidet_ok or None)
123
 
124
def residential_score(address: str, neighborhood: str, desc: str) -> float:
    """Boost (0.0–2.0) for listings whose text mentions prioritized micro-areas."""
    blob = " ".join([address or "", neighborhood or "", desc or ""]).lower()
    boost = sum(
        0.5
        for kw in MICROZONAS_PRIORITARIAS
        if fuzz.partial_ratio(blob, kw.lower()) >= 80
    )
    # Cap so location alone cannot dominate the overall score.
    return min(boost, 2.0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
def compute_score(lst: Listing, filters: Dict[str, Any]) -> float:
    """Rank a listing against the user's filters; higher is better.

    Components: within-budget bonus plus a proportional "cheaper is better"
    boost, room-count bonus, outdoor-space bonus, small bonuses around the
    optional pet/bidet requirements, and a residential-microzone boost.
    """
    score = 0.0
    budget = filters["max_price_usd"]
    if lst.price_usd is not None and lst.price_usd <= budget:
        score += 1.0
        # Cheaper listings earn a proportional extra boost.
        score += (budget - lst.price_usd) / max(budget, 1) * 1.0
    # Rooms ("ambientes")
    if lst.rooms and lst.rooms >= filters["min_rooms"]:
        score += 1.0
    # Outdoor space
    if filters["require_outdoor"] and (lst.has_patio or lst.has_terrace):
        score += 1.0
    # Pets: flat bonus when not required, bigger one when required and satisfied.
    if not filters["require_pet"]:
        score += 0.2
    elif lst.pet_friendly:
        score += 0.6
    # Bidet: same scheme as pets.
    if not filters["require_bidet"]:
        score += 0.2
    elif lst.has_bidet:
        score += 0.6
    # Residential microzone boost
    score += residential_score(lst.address or "", lst.neighborhood or "", lst.description or "")
    return round(score, 3)
161
+
162
def clean_text(s: str) -> str:
    """Collapse whitespace runs to single spaces and trim the ends."""
    stripped = (s or "").strip()
    return re.sub(r"\s+", " ", stripped)
164
+
165
def headers():
    """Build request headers with a randomly rotated User-Agent."""
    ua = random.choice(USER_AGENT_POOL)
    return {"User-Agent": ua}
167
+
168
async def fetch(client: httpx.AsyncClient, url: str) -> Optional[str]:
    """GET *url* with retries and exponential backoff; return body text or None.

    Makes RETRIES + 1 attempts total; backs off BACKOFF_BASE * 2**attempt
    seconds after each failure (network error, non-200 status, empty body).
    """
    attempt = 0
    while attempt <= RETRIES:
        try:
            resp = await client.get(url, headers=headers(), timeout=TIMEOUT)
        except Exception:
            # Network / protocol error: back off and try again.
            await asyncio.sleep(BACKOFF_BASE * (2 ** attempt))
        else:
            if resp.status_code == 200 and resp.text:
                return resp.text
            # Non-200 status or empty body: back off and try again.
            await asyncio.sleep(BACKOFF_BASE * (2 ** attempt))
        attempt += 1
    return None
178
+
179
async def fetch_detail_and_enrich(client: httpx.AsyncClient, lst: Listing) -> Listing:
    """Fetch a listing's detail page and fill in fields that are still missing.

    Mutates *lst* in place (and returns it): description, feature flags,
    room/bathroom/bedroom counts and address are only set when the listing
    does not already have a value, so search-card data always wins.

    Fix: the original room/bath/bedroom lines relied on ternary operator
    precedence and evaluated each regex twice; replaced with a single-pass
    helper (behavior unchanged). Also drops the no-op ``string=None`` arg.
    """
    html = await fetch(client, lst.link)
    if not html:
        return lst
    soup = BeautifulSoup(html, "lxml")

    # Description: prefer a container whose CSS class hints at a description
    # body; fall back to the first <p>, then to a best-effort join of nodes.
    desc_el = soup.find(["div", "section"], attrs={"class": re.compile(r"(description|Description|post|body)")}) \
        or soup.find("p")
    if desc_el:
        desc = clean_text(desc_el.get_text(" ", strip=True))
    else:
        desc = clean_text(" ".join(t.get_text(" ", strip=True) for t in soup.find_all(["p", "li"])[:30]))

    # Heuristic feature flags from the description text.
    patio, terraza, mascotas, bidet = feature_guess(desc)

    # Gather the text of typical feature cards ("3 ambientes", "2 baños", ...).
    features_text = " ".join(
        el.get_text(" ", strip=True)
        for el in soup.find_all(["li", "span", "div"])
        if el and el.get_text() and any(x in el.get_text().lower() for x in ["ambiente", "dorm", "baño"])
    ).lower()

    def _first_int(pattern: str) -> Optional[int]:
        # First captured integer for *pattern* in the feature text, else None.
        m = re.search(pattern, features_text)
        return int(m.group(1)) if m else None

    # Address, only if the card didn't already provide one.
    addr_guess = soup.find(attrs={"class": re.compile(r"(address|ubicacion|location|inmo-location)")})
    if addr_guess and not lst.address:
        lst.address = clean_text(addr_guess.get_text(" ", strip=True))[:200]

    # Merge: never overwrite data we already have.
    lst.description = desc or lst.description
    lst.has_patio = lst.has_patio if lst.has_patio is not None else patio
    lst.has_terrace = lst.has_terrace if lst.has_terrace is not None else terraza
    lst.pet_friendly = lst.pet_friendly if lst.pet_friendly is not None else mascotas
    lst.has_bidet = lst.has_bidet if lst.has_bidet is not None else bidet
    lst.rooms = lst.rooms or _first_int(r"(\d+)\s*ambiente")
    lst.bathrooms = lst.bathrooms or _first_int(r"(\d+)\s*bañ")
    lst.bedrooms = lst.bedrooms or _first_int(r"(\d+)\s*dorm")
    return lst
219
+
220
+ # =========================
221
+ # Scrapers (adaptadores)
222
+ # =========================
223
+ # Estrategia: usar búsquedas textuales robustas por sitio para barrios y filtros.
224
+ # Luego, para cada aviso, enriquecemos con la página de detalle.
225
+
226
def zonaprop_search_urls(neighs: List[str], max_usd: int, types: List[str]) -> List[str]:
    """Build one Zonaprop free-text search URL per neighborhood."""
    base = "https://www.zonaprop.com.ar/propiedades.html"
    urls: List[str] = []
    for barrio in neighs:
        # Robust textual query (avoids fragile URL slugs)
        query = f"{' o '.join(types)} venta {barrio} hasta {max_usd} dolares 3 ambientes patio terraza mascotas bidet"
        urls.append(f"{base}?q={ul.quote(query)}")
    return urls
234
+
235
def argenprop_search_urls(neighs: List[str], max_usd: int, types: List[str]) -> List[str]:
    """Build one Argenprop free-text search URL per neighborhood."""
    base = "https://www.argenprop.com/propiedades"

    def _query(barrio: str) -> str:
        # Robust textual query (avoids fragile URL slugs)
        return f"{' o '.join(types)} venta {barrio} hasta {max_usd} dolares 3 ambientes patio terraza mascotas bidet"

    return [f"{base}?text={ul.quote(_query(b))}" for b in neighs]
242
+
243
def properati_search_urls(neighs: List[str], max_usd: int, types: List[str]) -> List[str]:
    """Build one Properati free-text search URL per neighborhood (sale section)."""
    base = "https://www.properati.com.ar/s/venta/propiedades"
    joined_types = " o ".join(types)
    return [
        f"{base}?q=" + ul.quote(f"{joined_types} {barrio} hasta {max_usd} dolares 3 ambientes patio terraza mascotas bidet")
        for barrio in neighs
    ]
250
+
251
def generic_card_extractor(soup: BeautifulSoup, source: str) -> List[Dict[str, Any]]:
    """
    Extract result cards flexibly from common listing sites.

    Returns dicts with title, link, price_text, addr_text.

    Fix: the original only accepted hrefs that already contained *source*,
    which made its relative-link branch (``https://{source}{href}``) dead
    code — site-relative hrefs (starting with "/") are now accepted too,
    and the existing noise filter still weeds out nav/profile pages.
    """
    cards = []
    # Every anchor on the page; candidates are narrowed below.
    anchors = soup.select("a[href]")
    seen = set()
    for a in anchors:
        href = a.get("href", "")
        if not href:
            continue
        # Absolute links on this domain, or site-relative links.
        if (source in href or href.startswith("/")) and href not in seen:
            seen.add(href)
            # Title: the anchor's own text.
            title = a.get_text(" ", strip=True)
            # Price and address: best-effort search of the nearest parent block.
            parent = a.find_parent()
            price_text = ""
            address_text = ""
            if parent:
                block_text = clean_text(parent.get_text(" ", strip=True))
                m = re.search(r"(U\$S|USD|US\$|D[oó]lares?)\s*([\d\.\,]+)", block_text, re.IGNORECASE)
                price_text = m.group(0) if m else ""
                addr_m = re.search(r"(Saavedra|Nu[eñ]ez|La Lucila|Florida|Munro|Carapachay|Olivos|Martelli)[^|,]*", block_text, re.IGNORECASE)
                address_text = addr_m.group(0) if addr_m else ""
            cards.append({
                "title": title or "",
                "link": href if href.startswith("http") else f"https://{source}{href}",
                "price_text": price_text,
                "addr_text": address_text
            })
    # Drop obvious noise: too-short titles and non-listing site pages.
    filtered = []
    for c in cards:
        if len(c["title"]) < 8:
            continue
        if any(tok in c["link"] for tok in ["/perfil/", "/inmobiliaria/", "/ayuda", "/faq", "/login", "/like"]):
            continue
        filtered.append(c)
    return filtered
296
+
297
async def scrape_search_page(client: httpx.AsyncClient, url: str, domain: str) -> List[Listing]:
    """Download one search-results page and turn its cards into Listing stubs.

    Only card-level data is filled here; detail fields (rooms, features,
    description) stay None until fetch_detail_and_enrich runs.
    """
    html = await fetch(client, url)
    if not html:
        return []
    soup = BeautifulSoup(html, "lxml")
    listings: List[Listing] = []
    for card in generic_card_extractor(soup, domain):
        price = to_float_price(card["price_text"])
        listings.append(Listing(
            source=domain,
            title=clean_text(card["title"])[:140],
            link=card["link"],
            price_usd=price,
            currency="USD" if price is not None else None,
            address=card["addr_text"],
            neighborhood=None,
            city="Vicente López / CABA",
            rooms=None, bedrooms=None, bathrooms=None,
            has_patio=None, has_terrace=None, pet_friendly=None, has_bidet=None,
            description=None,
            score=0.0
        ))
    # Cap per page: result pages can carry a lot of junk anchors.
    return listings[:25]
322
+
323
async def scrape_portal(client: httpx.AsyncClient, portal: str, urls: List[str]) -> List[Listing]:
    """Scrape up to four search URLs for one portal, with a polite delay."""
    results: List[Listing] = []
    # Only the first 4 searches per portal, to limit load on the sites.
    for search_url in urls[:4]:
        try:
            page_listings = await scrape_search_page(client, search_url, portal)
            results.extend(page_listings)
            await asyncio.sleep(0.5)
        except Exception:
            # Best effort: a broken page must not abort the whole portal.
            continue
    return results
333
 
334
+ # =========================
335
+ # Orquestación
336
+ # =========================
337
+
338
async def run_agent(
    neighborhoods: List[str],
    max_price_usd: int,
    types: List[str],
    min_rooms: int,
    require_outdoor: bool,
    require_bidet: bool,
    require_pet: bool
) -> List[Listing]:
    """End-to-end pipeline: search all portals, enrich, filter, score, sort.

    Returns the listings that pass the hard filters, ordered by descending
    score and, within equal score, by ascending USD price.
    """
    filters = dict(
        max_price_usd=max_price_usd,
        min_rooms=min_rooms,
        require_outdoor=require_outdoor,
        require_bidet=require_bidet,
        require_pet=require_pet,
    )

    async with httpx.AsyncClient(follow_redirects=True) as client:
        # 1) Build the per-portal search URLs
        z_urls = zonaprop_search_urls(neighborhoods, max_price_usd, types)
        a_urls = argenprop_search_urls(neighborhoods, max_price_usd, types)
        p_urls = properati_search_urls(neighborhoods, max_price_usd, types)

        # 2) Base scrape of the result pages (portals run in parallel)
        tasks = [
            scrape_portal(client, "www.zonaprop.com.ar", z_urls),
            scrape_portal(client, "www.argenprop.com", a_urls),
            scrape_portal(client, "www.properati.com.ar", p_urls),
        ]
        batch_lists = await asyncio.gather(*tasks)
        listings = [l for batch in batch_lists for l in batch]

        # 3) Deduplicate by link
        seen = set()
        unique: List[Listing] = []
        for l in listings:
            if l.link in seen:
                continue
            seen.add(l.link)
            unique.append(l)

        # 4) Enrich with detail pages (bounded concurrency)
        sem = asyncio.Semaphore(MAX_CONCURRENCY)

        async def enrich_guarded(l: Listing):
            async with sem:
                return await fetch_detail_and_enrich(client, l)

        enriched = await asyncio.gather(*[enrich_guarded(l) for l in unique])

        # 5) Hard filters
        def passes(l: Listing) -> bool:
            # Price: must be known, in USD, and within budget.
            if l.price_usd is None or l.price_usd > max_price_usd:
                return False
            # Rooms: only reject when the count is known and too small.
            if l.rooms is not None and l.rooms < min_rooms:
                return False
            # Outdoor space (patio or terrace explicitly detected)
            if require_outdoor and not ((l.has_patio is True) or (l.has_terrace is True)):
                return False
            # Bidet
            if require_bidet and l.has_bidet is not True:
                return False
            # Pets
            if require_pet and l.pet_friendly is not True:
                return False
            # Fix: the original property-type check (casa/ph in title or
            # description) always fell back to accepting the listing, making
            # it a no-op; the dead computation is removed and the permissive
            # behavior is kept.
            return True

        filtered = [l for l in enriched if passes(l)]

        # 6) Scoring
        for l in filtered:
            l.score = compute_score(l, filters)

        # 7) Final ordering: best score first, then cheapest
        filtered.sort(key=lambda x: (-x.score, x.price_usd or 1e9))

        return filtered
421
+
422
def listings_to_df(listings: List[Listing]) -> pd.DataFrame:
    """Flatten listings into a display-ready DataFrame (Spanish column names)."""
    records = [
        {
            "Fuente": l.source.replace("www.", ""),
            "Título": l.title,
            "Precio USD": l.price_usd,
            "Ambientes": l.rooms,
            "Dormitorios": l.bedrooms,
            "Baños": l.bathrooms,
            "Patio": l.has_patio,
            "Terraza": l.has_terrace,
            "Mascotas": l.pet_friendly,
            "Bidet": l.has_bidet,
            "Dirección/Área": l.address,
            "Link": l.link,
            "Score": l.score,
        }
        for l in listings
    ]
    df = pd.DataFrame(records)
    if not df.empty:
        # Enforce a stable column order for the UI table.
        df = df[["Fuente", "Título", "Precio USD", "Ambientes", "Dormitorios", "Baños",
                 "Patio", "Terraza", "Mascotas", "Bidet", "Dirección/Área", "Link", "Score"]]
    return df
446
+
447
+ # =========================
448
+ # UI (Gradio)
449
+ # =========================
450
+
451
# User-facing help text shown at the top of the Gradio app (Spanish, verbatim).
DESCRIPTION = """
Agente agregador de avisos (Zonaprop, Argenprop, Properati) para Saavedra → La Lucila y alrededores.
Filtra: USD ≤ 90k, ≥ 3 ambientes (para oficina), patio/terraza, mascotas, bidet (si figura en descripción).
Tip: si ves pocos resultados, desactiva “Bidet requerido” o “Pet-friendly requerido” (muchos avisos no lo escriben, aunque lo tengan).
"""
456
+
457
async def run_and_present(neighs, max_usd, types, min_rooms, req_outdoor, req_bidet, req_pet):
    """Gradio handler: parse form inputs, run the agent, return (table, JSON)."""
    neighborhood_list = [part.strip() for part in neighs.split(",") if part.strip()]
    type_list = [part.strip().lower() for part in types.split(",") if part.strip()]
    results = await run_agent(
        neighborhoods=neighborhood_list,
        max_price_usd=max_usd,
        types=type_list,
        min_rooms=min_rooms,
        require_outdoor=req_outdoor,
        require_bidet=req_bidet,
        require_pet=req_pet
    )
    df = listings_to_df(results)
    # JSON export alongside the table view.
    json_blob = json.dumps([asdict(l) for l in results], ensure_ascii=False, indent=2)
    return df, json_blob
473
+
474
# ---------------- Gradio UI (built at import time; `demo` is module-level) ----------------
with gr.Blocks(title="Agente Inmuebles Norte BA (≤ USD 90k)") as demo:
    gr.Markdown("# Agente de casas/PH norte BA (≤ 90 000 USD)")
    gr.Markdown(DESCRIPTION)
    # Search parameters
    with gr.Row():
        neighs = gr.Textbox(label="Barrios (coma separada)", value=", ".join(DEFAULT_NEIGHBORHOODS))
        max_usd = gr.Number(label="Precio máx. (USD)", value=DEFAULT_MAX_USD, precision=0)
    with gr.Row():
        types = gr.Textbox(label="Tipos (coma separada)", value=", ".join(DEFAULT_TYPES))
        min_rooms = gr.Number(label="Mínimo ambientes", value=DEFAULT_MIN_ROOMS, precision=0)
    # Hard-requirement toggles
    with gr.Row():
        req_outdoor = gr.Checkbox(label="Requerir patio o terraza", value=REQUIRE_OUTDOOR)
        req_bidet = gr.Checkbox(label="Requerir bidet (solo si aparece en descripción)", value=REQUIRE_BIDET)
        req_pet = gr.Checkbox(label="Requerir pet-friendly (si aparece en descripción)", value=REQUIRE_PET_FRIENDLY)
    btn = gr.Button("Buscar ahora", variant="primary")
    # Output views
    with gr.Tabs():
        with gr.Tab("Resultados"):
            # NOTE(review): overflow_row_behaviour / max_rows are Gradio 3.x
            # kwargs removed in Gradio 4 — confirm the pinned gradio version.
            table = gr.Dataframe(interactive=False, wrap=True, overflow_row_behaviour="paginate", max_rows=300)
        with gr.Tab("JSON"):
            j = gr.Code(language="json")

    btn.click(run_and_present, inputs=[neighs, max_usd, types, min_rooms, req_outdoor, req_bidet, req_pet], outputs=[table, j])

if __name__ == "__main__":
    demo.launch()