Lukeetah committed on
Commit
0c27eef
·
verified ·
1 Parent(s): fe1eacf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +474 -377
app.py CHANGED
@@ -1,400 +1,497 @@
1
- # app.py
2
- # SamuelHouseFinder - FastAPI + Playwright scrapers + Gradio UI (single file)
3
- # Requisitos: ejecutar en contenedor con Playwright browsers (Dockerfile incluido abajo)
4
  import os
5
- import asyncio
6
- import json
7
  import re
8
- import sqlite3
9
- from typing import List, Dict, Any, Optional
10
- from datetime import datetime
11
- from urllib.parse import urljoin
12
-
13
- from fastapi import FastAPI, HTTPException, Request, BackgroundTasks
14
- from fastapi.responses import JSONResponse, HTMLResponse
15
- import uvicorn
 
16
  import httpx
17
  from bs4 import BeautifulSoup
18
-
19
- # Gradio UI
 
20
  import gradio as gr
21
 
22
- # Playwright
23
- from playwright.async_api import async_playwright, TimeoutError as PWTimeout
24
-
25
- # ---------------- CONFIG ----------------
26
- DB_PATH = os.environ.get("DB_PATH", "data/properties.db")
27
- PROXY_LIST = os.environ.get("PROXY_LIST") # comma separated http://user:pass@ip:port
28
- CAPTCHA_API_KEY = os.environ.get("CAPTCHA_API_KEY") # optional
29
- MAX_CONCURRENT_BROWSERS = int(os.environ.get("MAX_BROWSERS", "2"))
30
- DEFAULT_MAX_PAGES = int(os.environ.get("DEFAULT_MAX_PAGES", "2"))
31
- USER_AGENTS = [
32
- # Expand this list for production
33
- "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
34
- "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
35
- "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15",
36
  ]
 
 
 
 
 
37
 
38
- os.makedirs(os.path.dirname(DB_PATH) or ".", exist_ok=True)
39
-
40
- # ---------------- DB ----------------
41
- def init_db():
42
- conn = sqlite3.connect(DB_PATH)
43
- c = conn.cursor()
44
- c.execute("""
45
- CREATE TABLE IF NOT EXISTS properties (
46
- id INTEGER PRIMARY KEY AUTOINCREMENT,
47
- title TEXT,
48
- price TEXT,
49
- currency TEXT,
50
- address TEXT,
51
- neighbourhood TEXT,
52
- lat REAL,
53
- lon REAL,
54
- bedrooms INTEGER,
55
- bathrooms INTEGER,
56
- surface REAL,
57
- amenities TEXT,
58
- source TEXT,
59
- url TEXT UNIQUE,
60
- scraped_at TEXT,
61
- raw_html TEXT
62
- )
63
- """)
64
- conn.commit()
65
- conn.close()
66
-
67
- def save_property(item: Dict[str,Any]) -> bool:
68
- conn = sqlite3.connect(DB_PATH)
69
- c = conn.cursor()
70
- try:
71
- c.execute("""
72
- INSERT INTO properties
73
- (title,price,currency,address,neighbourhood,lat,lon,bedrooms,bathrooms,surface,amenities,source,url,scraped_at,raw_html)
74
- VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
75
- """, (
76
- item.get("title"),
77
- item.get("price"),
78
- item.get("currency"),
79
- item.get("address"),
80
- item.get("neighbourhood"),
81
- item.get("lat"),
82
- item.get("lon"),
83
- item.get("bedrooms"),
84
- item.get("bathrooms"),
85
- item.get("surface"),
86
- json.dumps(item.get("amenities",[]), ensure_ascii=False),
87
- item.get("source"),
88
- item.get("url"),
89
- datetime.utcnow().isoformat(),
90
- item.get("raw_html","")[:10000]
91
- ))
92
- conn.commit()
93
- return True
94
- except sqlite3.IntegrityError:
95
- # already exists
 
 
 
 
 
 
 
 
 
 
 
 
96
  return False
97
- except Exception as e:
98
- print("DB save error:", e)
 
 
 
99
  return False
100
- finally:
101
- conn.close()
102
-
103
- def query_db(q: Optional[str]=None, min_price: Optional[int]=None, max_price: Optional[int]=None, bedrooms: Optional[int]=None, source: Optional[str]=None, limit:int=200):
104
- conn = sqlite3.connect(DB_PATH)
105
- conn.row_factory = sqlite3.Row
106
- c = conn.cursor()
107
- sql = "SELECT * FROM properties WHERE 1=1"
108
- params = []
109
- if q:
110
- sql += " AND (title LIKE ? OR address LIKE ? OR neighbourhood LIKE ?)"
111
- qv = f"%{q}%"
112
- params += [qv,qv,qv]
113
- if source:
114
- sql += " AND source = ?"
115
- params.append(source)
116
- # NOTE: price is stored as text (different formats). For production parse and store numeric.
117
- sql += " ORDER BY scraped_at DESC LIMIT ?"
118
- params.append(limit)
119
- rows = c.execute(sql, params).fetchall()
120
- conn.close()
121
- return [dict(r) for r in rows]
122
-
123
- # ---------------- UTIL ----------------
124
- def get_proxies_list():
125
- if not PROXY_LIST:
126
- return []
127
- return [p.strip() for p in PROXY_LIST.split(",") if p.strip()]
128
 
129
- def pick_proxy(idx=0):
130
- lst = get_proxies_list()
131
- if not lst:
132
- return None
133
- return lst[idx % len(lst)]
 
 
134
 
135
- async def validate_url(client: httpx.AsyncClient, url: str) -> bool:
136
- try:
137
- r = await client.head(url, follow_redirects=True, timeout=15)
138
- return r.status_code == 200
139
- except Exception:
140
- try:
141
- r2 = await client.get(url, follow_redirects=True, timeout=20)
142
- return r2.status_code == 200
143
- except Exception:
144
- return False
145
-
146
- # ---------------- SCRAPERS (Playwright) ----------------
147
- # Each scraper returns list[dict] with canonical fields (see save_property)
148
-
149
- async def scrape_mercadolibre(pw, location:str, max_pages:int=1, idx_offset=0) -> List[Dict[str,Any]]:
150
- """Scrapes MercadoLibre Inmuebles listing pages (rendered)."""
151
- out = []
152
- base = "https://listado.mercadolibre.com.ar"
153
- # build query: try location as-is and also appended 'venta'
154
- q = location.replace(" ", "-")
155
- async with pw.chromium.launch(headless=True, args=["--no-sandbox"]) as browser:
156
- for p in range(1, max_pages+1):
157
- # MercadoLibre pagination is usually offset-based; try two patterns
158
- page_path = f"/{q}_Desde_{(p-1)*50+1}"
159
- url = urljoin(base, page_path)
160
- proxy = pick_proxy(p-1)
161
- context_args = {}
162
- if proxy:
163
- context_args["proxy"] = {"server": proxy}
164
- ua = USER_AGENTS[(idx_offset + p) % len(USER_AGENTS)]
165
- context_args["user_agent"] = ua
166
- context = await browser.new_context(**context_args)
167
- page = await context.new_page()
168
- try:
169
- await page.goto(url, wait_until="networkidle", timeout=30000)
170
- # ML often lazy-loads; ensure content loaded
171
- await page.wait_for_timeout(1500)
172
- html = await page.content()
173
- except PWTimeout:
174
- html = await page.content()
175
- except Exception as e:
176
- print("ML page error:", e)
177
- html = ""
178
- finally:
179
- try:
180
- await page.close()
181
- await context.close()
182
- except Exception:
183
- pass
184
- if not html:
185
- continue
186
- soup = BeautifulSoup(html, "html.parser")
187
- # Search for anchor tags that likely link to properties
188
- anchors = soup.select("a[href]")
189
- found = set()
190
- for a in anchors:
191
- href = a.get("href")
192
- if not href:
193
- continue
194
- # heuristics: property detail urls often contain '/MLA-' or '/MLO-' or '/inmuebles'
195
- if re.search(r"/MLA-|/MLO-|/inmuebles/", href):
196
- full = href if href.startswith("http") else urljoin(base, href)
197
- if full in found:
198
- continue
199
- found.add(full)
200
- title = (a.get_text(strip=True) or "Propiedad MercadoLibre")[:300]
201
- out.append({
202
- "title": title,
203
- "price": None,
204
- "currency": "ARS",
205
- "address": None,
206
- "neighbourhood": None,
207
- "lat": None,
208
- "lon": None,
209
- "bedrooms": None,
210
- "bathrooms": None,
211
- "surface": None,
212
- "amenities": [],
213
- "source": "MercadoLibre",
214
- "url": full,
215
- "raw_html": str(a)[:8000]
216
- })
217
- return out
218
 
219
- async def scrape_properati(pw, location:str, max_pages:int=1, idx_offset=0) -> List[Dict[str,Any]]:
220
- out = []
221
- base = "https://www.properati.com.ar"
222
- url = f"{base}/search?q={location}"
223
- proxy = pick_proxy(idx_offset)
224
- ua = USER_AGENTS[idx_offset % len(USER_AGENTS)]
225
- async with pw.chromium.launch(headless=True, args=["--no-sandbox"]) as browser:
226
- context_args = {"user_agent": ua}
227
- if proxy:
228
- context_args["proxy"] = {"server": proxy}
229
- context = await browser.new_context(**context_args)
230
- page = await context.new_page()
231
- try:
232
- await page.goto(url, wait_until="networkidle", timeout=30000)
233
- await page.wait_for_timeout(1200)
234
- html = await page.content()
235
- except Exception as e:
236
- print("Properati error:", e)
237
- html = ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
  try:
239
- await page.close()
240
- await context.close()
 
 
241
  except Exception:
242
- pass
 
 
 
 
243
  if not html:
244
- return out
245
- soup = BeautifulSoup(html, "html.parser")
246
- cards = soup.select("a[href]")
247
- found=set()
248
- for a in cards:
249
- href=a.get("href")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  if not href:
251
  continue
252
- if "/property/" in href or "/inmuebles/" in href or "/propiedad" in href:
253
- full = href if href.startswith("http") else urljoin(base, href)
254
- if full in found: continue
255
- found.add(full)
256
- title=(a.get_text(strip=True) or "Propiedad Properati")[:300]
257
- out.append({
258
- "title": title,
259
- "price": None,
260
- "currency": "ARS",
261
- "address": None,
262
- "neighbourhood": None,
263
- "lat": None,
264
- "lon": None,
265
- "bedrooms": None,
266
- "bathrooms": None,
267
- "surface": None,
268
- "amenities": [],
269
- "source": "Properati",
270
- "url": full,
271
- "raw_html": str(a)[:8000]
 
 
272
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  return out
274
 
275
- # Extendable: add ZonaProp, Inmuebles, ArgenProp, etc.
276
-
277
- # ---------------- ORCHESTRATOR ----------------
278
- app = FastAPI(title="SamuelHouseFinder API")
279
- init_db()
280
-
281
- async def run_all_scrapers(location: str, sources: List[str], max_pages:int=1, force:bool=False) -> Dict[str,Any]:
282
- results = []
283
- async with async_playwright() as pw:
284
- tasks=[]
285
- idx=0
286
- for s in sources:
287
- if s.lower()=="mercadolibre":
288
- tasks.append(scrape_mercadolibre(pw, location, max_pages=max_pages, idx_offset=idx))
289
- elif s.lower()=="properati":
290
- tasks.append(scrape_properati(pw, location, max_pages=max_pages, idx_offset=idx))
291
- else:
292
- # unknown: skip for now
293
- pass
294
- idx+=1
295
- # run concurrently but throttle by MAX_CONCURRENT_BROWSERS
296
- gathered=[]
297
- sem = asyncio.Semaphore(MAX_CONCURRENT_BROWSERS)
298
- async def sem_task(coro):
299
- async with sem:
300
- return await coro
301
- gathered = await asyncio.gather(*[sem_task(t) for t in tasks], return_exceptions=True)
302
- # flatten
303
- all_items=[]
304
- for g in gathered:
305
- if isinstance(g, Exception):
306
- print("scrape exception:", g)
 
 
 
 
 
 
 
 
 
307
  continue
308
- all_items.extend(g)
309
- # validate urls and save
310
- async with httpx.AsyncClient(follow_redirects=True, timeout=20) as client:
311
- valid=[]
312
- for i,it in enumerate(all_items):
313
- ok = await validate_url(client, it["url"])
314
- if ok:
315
- saved = save_property(it)
316
- valid.append(it)
317
- return {"found": len(all_items), "validated": len(valid)}
318
-
319
- @app.post("/api/scrape")
320
- async def api_scrape(req: Request):
321
- payload = await req.json()
322
- location = payload.get("location")
323
- if not location:
324
- raise HTTPException(status_code=400, detail="location required")
325
- sources = payload.get("sources", ["mercadolibre","properati"])
326
- max_pages = int(payload.get("max_pages", DEFAULT_MAX_PAGES))
327
- force = bool(payload.get("force", False))
328
- result = await run_all_scrapers(location, sources, max_pages=max_pages, force=force)
329
- return JSONResponse(result)
330
-
331
- @app.get("/api/search")
332
- async def api_search(q: Optional[str]=None, source: Optional[str]=None, limit:int=200):
333
- data = query_db(q=q, source=source, limit=limit)
334
- return JSONResponse({"items": data, "count": len(data)})
335
-
336
- @app.get("/api/health")
337
- async def health():
338
- return JSONResponse({"ok":True,"time":datetime.utcnow().isoformat()})
339
-
340
- # ---------------- GRADIO UI (simple) ----------------
341
- def frontend_invoke_scrape(location, sources, max_pages, force_flag):
342
- # call local API (same process) synchronously
343
- import requests
344
- payload = {"location":location, "sources": [s.strip() for s in sources.split(",") if s.strip()], "max_pages":int(max_pages), "force":bool(force_flag)}
345
- try:
346
- r = requests.post("http://127.0.0.1:8000/api/scrape", json=payload, timeout=600)
347
- r.raise_for_status()
348
- return f"Scrape iniciado: {r.json()}"
349
- except Exception as e:
350
- return f"Error al iniciar scrape: {e}"
351
-
352
- def frontend_query(q_text, source):
353
- import requests
354
- try:
355
- params = {}
356
- if q_text: params["q"] = q_text
357
- if source: params["source"] = source
358
- r = requests.get("http://127.0.0.1:8000/api/search", params=params, timeout=60)
359
- r.raise_for_status()
360
- items = r.json().get("items", [])
361
- # convert to table-friendly list
362
- rows = []
363
- for it in items:
364
- rows.append([it.get("title"), it.get("price"), it.get("currency"), it.get("source"), it.get("url")])
365
- return rows
366
- except Exception as e:
367
- return [["Error", str(e), "", "", ""]]
368
-
369
- def mount_gradio():
370
- with gr.Blocks(title="SamuelHouseFinder") as demo:
371
- gr.Markdown("## SamuelHouseFinder — Zona Norte (Saavedra → La Lucila)\nBackend con Playwright. Usá con cuidado y respetá TOS de portales.")
372
- with gr.Row():
373
- with gr.Column():
374
- loc = gr.Textbox(label="Ubicación (ej: Martinez, Olivos, Saavedra)", value="Saavedra")
375
- srcs = gr.Textbox(label="Fuentes (csv)", value="mercadolibre,properati")
376
- pages = gr.Slider(label="Páginas por fuente", minimum=1, maximum=5, value=1)
377
- force = gr.Checkbox(label="Forzar re-scrape", value=False)
378
- btn = gr.Button("Buscar y Scrappear")
379
- out = gr.Textbox(label="Estado")
380
- with gr.Column():
381
- qtxt = gr.Textbox(label="Buscar en DB (texto libre)", value="")
382
- qsrc = gr.Textbox(label="Fuente (opcional)", value="")
383
- qbtn = gr.Button("Consultar DB")
384
- table = gr.Dataframe(headers=["title","price","currency","source","url"], datatype=["str","str","str","str","str"])
385
- btn.click(frontend_invoke_scrape, inputs=[loc, srcs, pages, force], outputs=[out])
386
- qbtn.click(frontend_query, inputs=[qtxt, qsrc], outputs=[table])
387
- return demo
388
-
389
- # ---------------- RUN ----------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
  if __name__ == "__main__":
391
- # Run FastAPI + Gradio in same process: start FastAPI in background, then Gradio
392
- import threading, time
393
- def start_uvicorn():
394
- uvicorn.run("app:app", host="0.0.0.0", port=8000, log_level="info")
395
- t = threading.Thread(target=start_uvicorn, daemon=True)
396
- t.start()
397
- # wait a moment for server
398
- time.sleep(1.5)
399
- demo = mount_gradio()
400
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
1
  import os
 
 
2
  import re
3
+ import time
4
+ import math
5
+ import json
6
+ import asyncio
7
+ import random
8
+ from dataclasses import dataclass, asdict
9
+ from typing import List, Optional, Dict, Any, Tuple
10
+ import urllib.parse as ul
11
+
12
  import httpx
13
  from bs4 import BeautifulSoup
14
+ from rapidfuzz import fuzz, process
15
+ from pydantic import BaseModel, HttpUrl, ValidationError
16
+ import pandas as pd
17
  import gradio as gr
18
 
19
# =========================
# Main configuration
# =========================

DEFAULT_MAX_USD = 90000
DEFAULT_NEIGHBORHOODS = [
    # Core areas requested by the user
    "Saavedra", "Nuñez", "La Lucila", "Florida Oeste", "Munro", "Carapachay",
    # Nearby areas, useful to widen the available stock
    "Olivos", "Villa Martelli"
]
DEFAULT_TYPES = ["casa", "ph"]  # house / PH
DEFAULT_MIN_ROOMS = 3           # "ambientes" — guarantees room for a home office
REQUIRE_BIDET = True
REQUIRE_PET_FRIENDLY = True
REQUIRE_OUTDOOR = True          # patio or terrace

# Rotated per-request to vary the request fingerprint.
USER_AGENT_POOL = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_4) AppleWebKit/605.1.15 "
    "(KHTML, like Gecko) Version/16.4 Safari/605.1.15",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
]
44
+
45
TIMEOUT = httpx.Timeout(20.0, connect=10.0)
MAX_CONCURRENCY = 6   # simultaneous detail-page fetches
RETRIES = 2           # extra attempts per request (total tries = RETRIES + 1)
BACKOFF_BASE = 0.8    # seconds; doubled on each retry

# Prioritized residential micro-areas (positive heuristic, editable)
MICROZONAS_PRIORITARIAS = [
    # Saavedra
    "Parque Saavedra", "Parque Sarmiento", "Av. Balbín", "Ruiz Huidobro",
    # Núñez
    "Lomas de Nuñez", "Cabildo", "Plaza Alberti",
    # La Lucila
    "Estación La Lucila", "Rawson", "Paraná", "Maipú",
    # Florida / Carapachay / Munro / Martelli / Olivos
    "Estación Florida", "Estación Carapachay", "Estación Munro",
    "Ugarte", "San Martín", "Panamericana", "Paraná", "Pelliza", "Melo",
]
62
+
63
+ # =========================
64
+ # Modelos y utilidades
65
+ # =========================
66
+
67
@dataclass
class Listing:
    """One normalized real-estate ad gathered from any supported portal.

    Optional fields stay ``None`` when the source page did not reveal them.
    Boolean feature flags use ``None`` for "unknown" (never "absent") so
    later enrichment can fill them in without clobbering known values.
    """
    source: str                  # portal domain the ad came from
    title: str
    link: str
    price_usd: Optional[float]   # set only when the ad is priced in USD
    currency: Optional[str]
    address: Optional[str]
    neighborhood: Optional[str]
    city: Optional[str]
    rooms: Optional[int]         # "ambientes"
    bedrooms: Optional[int]
    bathrooms: Optional[int]
    has_patio: Optional[bool]
    has_terrace: Optional[bool]
    pet_friendly: Optional[bool]
    has_bidet: Optional[bool]
    description: Optional[str]
    score: float                 # ranking score assigned by compute_score
86
+
87
def to_float_price(value: str) -> Optional[float]:
    """Parse a price string and return its USD amount, or ``None``.

    Handles Argentine formatting ("." thousands separator, "," decimals).
    Prices not explicitly denominated in dollars (i.e. ARS) return ``None``:
    this agent does no FX conversion, so those listings are skipped.
    """
    if not value:
        return None
    # Normalize: drop thousands dots, turn decimal commas into dots, uppercase.
    txt = value.replace(".", "").replace(",", ".").upper()
    # Detect a dollar denomination. Bug fix: also match the accented
    # "DÓLAR(ES)" spelling — the plain "DOLAR" check silently dropped it.
    if "USD" in txt or "U$S" in txt or "U$D" in txt or "DOLAR" in txt or "DÓLAR" in txt:
        m = re.search(r"(\d+(?:\.\d+)?)", txt)
        return float(m.group(1)) if m else None
    # ARS (or unknown currency): skip — no conversion available.
    return None
97
+
98
def extract_int(text: str) -> Optional[int]:
    """Return the first run of digits in *text* as an int, or ``None``."""
    if not text:
        return None
    match = re.search(r"(\d+)", text)
    if match is None:
        return None
    return int(match.group(1))
103
+
104
def text_has_any(text: str, keywords: List[str]) -> bool:
    """Case-insensitive substring check: True if any keyword occurs in *text*."""
    if not text:
        return False
    lowered = text.lower()
    for keyword in keywords:
        if keyword.lower() in lowered:
            return True
    return False
109
+
110
def fuzzy_any(text: str, keywords: List[str], thresh: int = 80) -> bool:
    """True when any keyword fuzzily matches *text* (partial ratio >= thresh)."""
    if not text:
        return False
    lowered = text.lower()
    return any(fuzz.partial_ratio(lowered, kw.lower()) >= thresh for kw in keywords)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
def feature_guess(desc: str) -> Tuple[Optional[bool], Optional[bool], Optional[bool], Optional[bool]]:
    """Heuristically detect (patio, terrace, pets-ok, bidet) mentions in *desc*.

    Each element is ``True`` when mentioned and ``None`` when not found —
    ``None`` means "unknown", never "absent", so later merges don't lose data.
    """
    has_patio = fuzzy_any(desc, ["patio", "patio propio", "patio descubierto", "fondo", "jardín"])
    has_terrace = fuzzy_any(desc, ["terraza", "terraza propia", "terraza transitable", "azotea"])
    pets_ok = fuzzy_any(desc, ["se aceptan mascotas", "pet friendly", "apta mascotas"])
    # Strict on purpose: many homes have a bidet without saying so, but we
    # only trust explicit mentions (filtered at the user's request).
    bidet_ok = fuzzy_any(desc, ["bidet"])
    return (has_patio or None, has_terrace or None, pets_ok or None, bidet_ok or None)
123
 
124
def residential_score(address: str, neighborhood: str, desc: str) -> float:
    """Boost (0.0–2.0) for listings whose text mentions prioritized micro-areas."""
    blob = " ".join([address or "", neighborhood or "", desc or ""]).lower()
    boost = sum(
        0.5
        for kw in MICROZONAS_PRIORITARIAS
        if fuzz.partial_ratio(blob, kw.lower()) >= 80
    )
    # Cap so location alone cannot dominate the overall score.
    return min(boost, 2.0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
133
def compute_score(lst: Listing, filters: Dict[str, Any]) -> float:
    """Rank a listing against the user's filters; higher is better.

    Components: within-budget bonus plus a proportional "cheaper is better"
    boost, room-count bonus, outdoor-space bonus, small bonuses around the
    optional pet/bidet requirements, and a residential-microzone boost.
    """
    score = 0.0
    budget = filters["max_price_usd"]
    if lst.price_usd is not None and lst.price_usd <= budget:
        score += 1.0
        # Cheaper listings earn a proportional extra boost.
        score += (budget - lst.price_usd) / max(budget, 1) * 1.0
    # Rooms ("ambientes")
    if lst.rooms and lst.rooms >= filters["min_rooms"]:
        score += 1.0
    # Outdoor space
    if filters["require_outdoor"] and (lst.has_patio or lst.has_terrace):
        score += 1.0
    # Pets: flat bonus when not required, bigger one when required and satisfied.
    if not filters["require_pet"]:
        score += 0.2
    elif lst.pet_friendly:
        score += 0.6
    # Bidet: same scheme as pets.
    if not filters["require_bidet"]:
        score += 0.2
    elif lst.has_bidet:
        score += 0.6
    # Residential microzone boost
    score += residential_score(lst.address or "", lst.neighborhood or "", lst.description or "")
    return round(score, 3)
161
+
162
def clean_text(s: str) -> str:
    """Collapse whitespace runs to single spaces and trim the ends."""
    stripped = (s or "").strip()
    return re.sub(r"\s+", " ", stripped)
164
+
165
def headers():
    """Build request headers with a randomly rotated User-Agent."""
    ua = random.choice(USER_AGENT_POOL)
    return {"User-Agent": ua}
167
+
168
async def fetch(client: httpx.AsyncClient, url: str) -> Optional[str]:
    """GET *url* with retries and exponential backoff; return body text or None.

    Makes RETRIES + 1 attempts total; backs off BACKOFF_BASE * 2**attempt
    seconds after each failure (network error, non-200 status, empty body).
    """
    attempt = 0
    while attempt <= RETRIES:
        try:
            resp = await client.get(url, headers=headers(), timeout=TIMEOUT)
        except Exception:
            # Network / protocol error: back off and try again.
            await asyncio.sleep(BACKOFF_BASE * (2 ** attempt))
        else:
            if resp.status_code == 200 and resp.text:
                return resp.text
            # Non-200 status or empty body: back off and try again.
            await asyncio.sleep(BACKOFF_BASE * (2 ** attempt))
        attempt += 1
    return None
178
+
179
async def fetch_detail_and_enrich(client: httpx.AsyncClient, lst: Listing) -> Listing:
    """Fetch a listing's detail page and fill in fields that are still missing.

    Mutates *lst* in place (and returns it): description, feature flags,
    room/bathroom/bedroom counts and address are only set when the listing
    does not already have a value, so search-card data always wins.

    Fix: the original room/bath/bedroom lines relied on ternary operator
    precedence and evaluated each regex twice; replaced with a single-pass
    helper (behavior unchanged). Also drops the no-op ``string=None`` arg.
    """
    html = await fetch(client, lst.link)
    if not html:
        return lst
    soup = BeautifulSoup(html, "lxml")

    # Description: prefer a container whose CSS class hints at a description
    # body; fall back to the first <p>, then to a best-effort join of nodes.
    desc_el = soup.find(["div", "section"], attrs={"class": re.compile(r"(description|Description|post|body)")}) \
        or soup.find("p")
    if desc_el:
        desc = clean_text(desc_el.get_text(" ", strip=True))
    else:
        desc = clean_text(" ".join(t.get_text(" ", strip=True) for t in soup.find_all(["p", "li"])[:30]))

    # Heuristic feature flags from the description text.
    patio, terraza, mascotas, bidet = feature_guess(desc)

    # Gather the text of typical feature cards ("3 ambientes", "2 baños", ...).
    features_text = " ".join(
        el.get_text(" ", strip=True)
        for el in soup.find_all(["li", "span", "div"])
        if el and el.get_text() and any(x in el.get_text().lower() for x in ["ambiente", "dorm", "baño"])
    ).lower()

    def _first_int(pattern: str) -> Optional[int]:
        # First captured integer for *pattern* in the feature text, else None.
        m = re.search(pattern, features_text)
        return int(m.group(1)) if m else None

    # Address, only if the card didn't already provide one.
    addr_guess = soup.find(attrs={"class": re.compile(r"(address|ubicacion|location|inmo-location)")})
    if addr_guess and not lst.address:
        lst.address = clean_text(addr_guess.get_text(" ", strip=True))[:200]

    # Merge: never overwrite data we already have.
    lst.description = desc or lst.description
    lst.has_patio = lst.has_patio if lst.has_patio is not None else patio
    lst.has_terrace = lst.has_terrace if lst.has_terrace is not None else terraza
    lst.pet_friendly = lst.pet_friendly if lst.pet_friendly is not None else mascotas
    lst.has_bidet = lst.has_bidet if lst.has_bidet is not None else bidet
    lst.rooms = lst.rooms or _first_int(r"(\d+)\s*ambiente")
    lst.bathrooms = lst.bathrooms or _first_int(r"(\d+)\s*bañ")
    lst.bedrooms = lst.bedrooms or _first_int(r"(\d+)\s*dorm")
    return lst
219
+
220
+ # =========================
221
+ # Scrapers (adaptadores)
222
+ # =========================
223
+ # Estrategia: usar búsquedas textuales robustas por sitio para barrios y filtros.
224
+ # Luego, para cada aviso, enriquecemos con la página de detalle.
225
+
226
def zonaprop_search_urls(neighs: List[str], max_usd: int, types: List[str]) -> List[str]:
    """Build one Zonaprop free-text search URL per neighborhood."""
    base = "https://www.zonaprop.com.ar/propiedades.html"
    urls: List[str] = []
    for barrio in neighs:
        # Robust textual query (avoids fragile URL slugs)
        query = f"{' o '.join(types)} venta {barrio} hasta {max_usd} dolares 3 ambientes patio terraza mascotas bidet"
        urls.append(f"{base}?q={ul.quote(query)}")
    return urls
234
+
235
def argenprop_search_urls(neighs: List[str], max_usd: int, types: List[str]) -> List[str]:
    """Build one Argenprop free-text search URL per neighborhood."""
    base = "https://www.argenprop.com/propiedades"

    def _query(barrio: str) -> str:
        # Robust textual query (avoids fragile URL slugs)
        return f"{' o '.join(types)} venta {barrio} hasta {max_usd} dolares 3 ambientes patio terraza mascotas bidet"

    return [f"{base}?text={ul.quote(_query(b))}" for b in neighs]
242
+
243
def properati_search_urls(neighs: List[str], max_usd: int, types: List[str]) -> List[str]:
    """Build one Properati free-text search URL per neighborhood (sale section)."""
    base = "https://www.properati.com.ar/s/venta/propiedades"
    joined_types = " o ".join(types)
    return [
        f"{base}?q=" + ul.quote(f"{joined_types} {barrio} hasta {max_usd} dolares 3 ambientes patio terraza mascotas bidet")
        for barrio in neighs
    ]
250
+
251
def generic_card_extractor(soup: BeautifulSoup, source: str) -> List[Dict[str, Any]]:
    """
    Extract result cards flexibly from common listing sites.

    Returns dicts with title, link, price_text, addr_text.

    Fix: the original only accepted hrefs that already contained *source*,
    which made its relative-link branch (``https://{source}{href}``) dead
    code — site-relative hrefs (starting with "/") are now accepted too,
    and the existing noise filter still weeds out nav/profile pages.
    """
    cards = []
    # Every anchor on the page; candidates are narrowed below.
    anchors = soup.select("a[href]")
    seen = set()
    for a in anchors:
        href = a.get("href", "")
        if not href:
            continue
        # Absolute links on this domain, or site-relative links.
        if (source in href or href.startswith("/")) and href not in seen:
            seen.add(href)
            # Title: the anchor's own text.
            title = a.get_text(" ", strip=True)
            # Price and address: best-effort search of the nearest parent block.
            parent = a.find_parent()
            price_text = ""
            address_text = ""
            if parent:
                block_text = clean_text(parent.get_text(" ", strip=True))
                m = re.search(r"(U\$S|USD|US\$|D[oó]lares?)\s*([\d\.\,]+)", block_text, re.IGNORECASE)
                price_text = m.group(0) if m else ""
                addr_m = re.search(r"(Saavedra|Nu[eñ]ez|La Lucila|Florida|Munro|Carapachay|Olivos|Martelli)[^|,]*", block_text, re.IGNORECASE)
                address_text = addr_m.group(0) if addr_m else ""
            cards.append({
                "title": title or "",
                "link": href if href.startswith("http") else f"https://{source}{href}",
                "price_text": price_text,
                "addr_text": address_text
            })
    # Drop obvious noise: too-short titles and non-listing site pages.
    filtered = []
    for c in cards:
        if len(c["title"]) < 8:
            continue
        if any(tok in c["link"] for tok in ["/perfil/", "/inmobiliaria/", "/ayuda", "/faq", "/login", "/like"]):
            continue
        filtered.append(c)
    return filtered
296
+
297
async def scrape_search_page(client: httpx.AsyncClient, url: str, domain: str) -> List[Listing]:
    """Download one search-results page and turn its cards into Listing stubs.

    Only card-level data is filled here; detail fields (rooms, features,
    description) stay None until fetch_detail_and_enrich runs.
    """
    html = await fetch(client, url)
    if not html:
        return []
    soup = BeautifulSoup(html, "lxml")
    listings: List[Listing] = []
    for card in generic_card_extractor(soup, domain):
        price = to_float_price(card["price_text"])
        listings.append(Listing(
            source=domain,
            title=clean_text(card["title"])[:140],
            link=card["link"],
            price_usd=price,
            currency="USD" if price is not None else None,
            address=card["addr_text"],
            neighborhood=None,
            city="Vicente López / CABA",
            rooms=None, bedrooms=None, bathrooms=None,
            has_patio=None, has_terrace=None, pet_friendly=None, has_bidet=None,
            description=None,
            score=0.0
        ))
    # Cap per page: result pages can carry a lot of junk anchors.
    return listings[:25]
322
+
323
async def scrape_portal(client: httpx.AsyncClient, portal: str, urls: List[str]) -> List[Listing]:
    """Scrape up to four search URLs for one portal, with a polite delay."""
    results: List[Listing] = []
    # Only the first 4 searches per portal, to limit load on the sites.
    for search_url in urls[:4]:
        try:
            page_listings = await scrape_search_page(client, search_url, portal)
            results.extend(page_listings)
            await asyncio.sleep(0.5)
        except Exception:
            # Best effort: a broken page must not abort the whole portal.
            continue
    return results
333
 
334
+ # =========================
335
+ # Orquestación
336
+ # =========================
337
+
338
async def run_agent(
    neighborhoods: List[str],
    max_price_usd: int,
    types: List[str],
    min_rooms: int,
    require_outdoor: bool,
    require_bidet: bool,
    require_pet: bool
) -> List[Listing]:
    """End-to-end pipeline: search all portals, enrich, filter, score, sort.

    Returns the listings that pass the hard filters, ordered by descending
    score and, within equal score, by ascending USD price.
    """
    filters = dict(
        max_price_usd=max_price_usd,
        min_rooms=min_rooms,
        require_outdoor=require_outdoor,
        require_bidet=require_bidet,
        require_pet=require_pet,
    )

    async with httpx.AsyncClient(follow_redirects=True) as client:
        # 1) Build the per-portal search URLs
        z_urls = zonaprop_search_urls(neighborhoods, max_price_usd, types)
        a_urls = argenprop_search_urls(neighborhoods, max_price_usd, types)
        p_urls = properati_search_urls(neighborhoods, max_price_usd, types)

        # 2) Base scrape of the result pages (portals run in parallel)
        tasks = [
            scrape_portal(client, "www.zonaprop.com.ar", z_urls),
            scrape_portal(client, "www.argenprop.com", a_urls),
            scrape_portal(client, "www.properati.com.ar", p_urls),
        ]
        batch_lists = await asyncio.gather(*tasks)
        listings = [l for batch in batch_lists for l in batch]

        # 3) Deduplicate by link
        seen = set()
        unique: List[Listing] = []
        for l in listings:
            if l.link in seen:
                continue
            seen.add(l.link)
            unique.append(l)

        # 4) Enrich with detail pages (bounded concurrency)
        sem = asyncio.Semaphore(MAX_CONCURRENCY)

        async def enrich_guarded(l: Listing):
            async with sem:
                return await fetch_detail_and_enrich(client, l)

        enriched = await asyncio.gather(*[enrich_guarded(l) for l in unique])

        # 5) Hard filters
        def passes(l: Listing) -> bool:
            # Price: must be known, in USD, and within budget.
            if l.price_usd is None or l.price_usd > max_price_usd:
                return False
            # Rooms: only reject when the count is known and too small.
            if l.rooms is not None and l.rooms < min_rooms:
                return False
            # Outdoor space (patio or terrace explicitly detected)
            if require_outdoor and not ((l.has_patio is True) or (l.has_terrace is True)):
                return False
            # Bidet
            if require_bidet and l.has_bidet is not True:
                return False
            # Pets
            if require_pet and l.pet_friendly is not True:
                return False
            # Fix: the original property-type check (casa/ph in title or
            # description) always fell back to accepting the listing, making
            # it a no-op; the dead computation is removed and the permissive
            # behavior is kept.
            return True

        filtered = [l for l in enriched if passes(l)]

        # 6) Scoring
        for l in filtered:
            l.score = compute_score(l, filters)

        # 7) Final ordering: best score first, then cheapest
        filtered.sort(key=lambda x: (-x.score, x.price_usd or 1e9))

        return filtered
421
+
422
def listings_to_df(listings: List[Listing]) -> pd.DataFrame:
    """Flatten listings into a display-ready DataFrame (Spanish column names)."""
    records = [
        {
            "Fuente": l.source.replace("www.", ""),
            "Título": l.title,
            "Precio USD": l.price_usd,
            "Ambientes": l.rooms,
            "Dormitorios": l.bedrooms,
            "Baños": l.bathrooms,
            "Patio": l.has_patio,
            "Terraza": l.has_terrace,
            "Mascotas": l.pet_friendly,
            "Bidet": l.has_bidet,
            "Dirección/Área": l.address,
            "Link": l.link,
            "Score": l.score,
        }
        for l in listings
    ]
    df = pd.DataFrame(records)
    if not df.empty:
        # Enforce a stable column order for the UI table.
        df = df[["Fuente", "Título", "Precio USD", "Ambientes", "Dormitorios", "Baños",
                 "Patio", "Terraza", "Mascotas", "Bidet", "Dirección/Área", "Link", "Score"]]
    return df
446
+
447
+ # =========================
448
+ # UI (Gradio)
449
+ # =========================
450
+
451
# User-facing help text shown at the top of the Gradio app (Spanish, verbatim).
DESCRIPTION = """
Agente agregador de avisos (Zonaprop, Argenprop, Properati) para Saavedra → La Lucila y alrededores.
Filtra: USD ≤ 90k, ≥ 3 ambientes (para oficina), patio/terraza, mascotas, bidet (si figura en descripción).
Tip: si ves pocos resultados, desactiva “Bidet requerido” o “Pet-friendly requerido” (muchos avisos no lo escriben, aunque lo tengan).
"""
456
+
457
async def run_and_present(neighs, max_usd, types, min_rooms, req_outdoor, req_bidet, req_pet):
    """Gradio handler: parse form inputs, run the agent, return (table, JSON)."""
    neighborhood_list = [part.strip() for part in neighs.split(",") if part.strip()]
    type_list = [part.strip().lower() for part in types.split(",") if part.strip()]
    results = await run_agent(
        neighborhoods=neighborhood_list,
        max_price_usd=max_usd,
        types=type_list,
        min_rooms=min_rooms,
        require_outdoor=req_outdoor,
        require_bidet=req_bidet,
        require_pet=req_pet
    )
    df = listings_to_df(results)
    # JSON export alongside the table view.
    json_blob = json.dumps([asdict(l) for l in results], ensure_ascii=False, indent=2)
    return df, json_blob
473
+
474
# ---------------- Gradio UI (built at import time; `demo` is module-level) ----------------
with gr.Blocks(title="Agente Inmuebles Norte BA (≤ USD 90k)") as demo:
    gr.Markdown("# Agente de casas/PH norte BA (≤ 90 000 USD)")
    gr.Markdown(DESCRIPTION)
    # Search parameters
    with gr.Row():
        neighs = gr.Textbox(label="Barrios (coma separada)", value=", ".join(DEFAULT_NEIGHBORHOODS))
        max_usd = gr.Number(label="Precio máx. (USD)", value=DEFAULT_MAX_USD, precision=0)
    with gr.Row():
        types = gr.Textbox(label="Tipos (coma separada)", value=", ".join(DEFAULT_TYPES))
        min_rooms = gr.Number(label="Mínimo ambientes", value=DEFAULT_MIN_ROOMS, precision=0)
    # Hard-requirement toggles
    with gr.Row():
        req_outdoor = gr.Checkbox(label="Requerir patio o terraza", value=REQUIRE_OUTDOOR)
        req_bidet = gr.Checkbox(label="Requerir bidet (solo si aparece en descripción)", value=REQUIRE_BIDET)
        req_pet = gr.Checkbox(label="Requerir pet-friendly (si aparece en descripción)", value=REQUIRE_PET_FRIENDLY)
    btn = gr.Button("Buscar ahora", variant="primary")
    # Output views
    with gr.Tabs():
        with gr.Tab("Resultados"):
            # NOTE(review): overflow_row_behaviour / max_rows are Gradio 3.x
            # kwargs removed in Gradio 4 — confirm the pinned gradio version.
            table = gr.Dataframe(interactive=False, wrap=True, overflow_row_behaviour="paginate", max_rows=300)
        with gr.Tab("JSON"):
            j = gr.Code(language="json")

    btn.click(run_and_present, inputs=[neighs, max_usd, types, min_rooms, req_outdoor, req_bidet, req_pet], outputs=[table, j])

if __name__ == "__main__":
    demo.launch()