Lukeetah committed on
Commit
a801f06
·
verified ·
1 Parent(s): f8c719d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +379 -73
app.py CHANGED
@@ -1,94 +1,400 @@
1
- import gradio as gr
2
- import requests
3
- from bs4 import BeautifulSoup
4
- import sqlite3
 
 
5
  import re
6
- import time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
- DB_NAME = "propiedades.db"
 
 
 
 
 
 
 
 
 
 
 
9
 
10
- # ------------------- DB -------------------
 
 
11
  def init_db():
12
- conn = sqlite3.connect(DB_NAME)
13
  c = conn.cursor()
14
- c.execute('''CREATE TABLE IF NOT EXISTS propiedades (
15
- id INTEGER PRIMARY KEY AUTOINCREMENT,
16
- titulo TEXT,
17
- precio TEXT,
18
- ubicacion TEXT,
19
- link TEXT UNIQUE,
20
- fecha TEXT
21
- )''')
 
 
 
 
 
 
 
 
 
 
 
 
22
  conn.commit()
23
  conn.close()
24
 
25
- def guardar_propiedad(titulo, precio, ubicacion, link):
26
- conn = sqlite3.connect(DB_NAME)
27
  c = conn.cursor()
28
  try:
29
- c.execute("INSERT INTO propiedades (titulo, precio, ubicacion, link, fecha) VALUES (?, ?, ?, ?, date('now'))",
30
- (titulo, precio, ubicacion, link))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  conn.commit()
 
32
  except sqlite3.IntegrityError:
33
- pass
34
- conn.close()
 
 
 
 
 
35
 
36
- def buscar_propiedades(filtro_precio=None, ubicacion=None):
37
- conn = sqlite3.connect(DB_NAME)
 
38
  c = conn.cursor()
39
- query = "SELECT titulo, precio, ubicacion, link FROM propiedades WHERE 1=1"
40
  params = []
41
- if filtro_precio:
42
- query += " AND precio LIKE ?"
43
- params.append(f"%{filtro_precio}%")
44
- if ubicacion:
45
- query += " AND ubicacion LIKE ?"
46
- params.append(f"%{ubicacion}%")
47
- c.execute(query, params)
48
- data = c.fetchall()
 
 
 
49
  conn.close()
50
- return data
51
-
52
- # ------------------- SCRAPER -------------------
53
- def scrape_mercadolibre_zonanorte():
54
- url = "https://inmuebles.mercadolibre.com.ar/venta/capital-federal/zona-norte/"
55
- r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
56
- soup = BeautifulSoup(r.text, "html.parser")
57
- cards = soup.find_all("li", {"class": "ui-search-layout__item"})
58
-
59
- for card in cards:
60
- titulo_tag = card.find("h2")
61
- precio_tag = card.find("span", {"class": "price-tag-fraction"})
62
- link_tag = card.find("a", href=True)
63
- ubicacion_tag = card.find("span", {"class": "ui-search-item__location"})
64
-
65
- if titulo_tag and precio_tag and link_tag:
66
- titulo = titulo_tag.text.strip()
67
- precio = precio_tag.text.strip()
68
- link = link_tag['href'].split('#')[0]
69
- ubicacion = ubicacion_tag.text.strip() if ubicacion_tag else "N/D"
70
- guardar_propiedad(titulo, precio, ubicacion, link)
71
-
72
- # ------------------- INTERFAZ -------------------
73
- def actualizar_y_buscar(precio, ubicacion):
74
- scrape_mercadolibre_zonanorte()
75
- data = buscar_propiedades(precio, ubicacion)
76
- if not data:
77
- return "No se encontraron propiedades con esos filtros."
78
- table = ""
79
- for t, p, u, l in data:
80
- table += f"🏠 {t}\n💰 {p}\n📍 {u}\n🔗 {l}\n\n"
81
- return table
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  init_db()
84
 
85
- with gr.Blocks() as demo:
86
- gr.Markdown("# 🏡 Samuel House Finder — Zona Norte BA")
87
- precio_in = gr.Textbox(label="Filtrar por precio (ej: 100000)")
88
- ubicacion_in = gr.Textbox(label="Filtrar por ubicación (ej: Saavedra)")
89
- boton = gr.Button("Buscar y Actualizar")
90
- salida = gr.Textbox(label="Resultados", lines=20)
91
- boton.click(actualizar_y_buscar, inputs=[precio_in, ubicacion_in], outputs=salida)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
 
93
  if __name__ == "__main__":
94
- demo.launch()
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ # SamuelHouseFinder - FastAPI + Playwright scrapers + Gradio UI (single file)
3
+ # Requisitos: ejecutar en contenedor con Playwright browsers (Dockerfile incluido abajo)
4
+ import os
5
+ import asyncio
6
+ import json
7
  import re
8
+ import sqlite3
9
+ from typing import List, Dict, Any, Optional
10
+ from datetime import datetime
11
+ from urllib.parse import urljoin
12
+
13
+ from fastapi import FastAPI, HTTPException, Request, BackgroundTasks
14
+ from fastapi.responses import JSONResponse, HTMLResponse
15
+ import uvicorn
16
+ import httpx
17
+ from bs4 import BeautifulSoup
18
+
19
+ # Gradio UI
20
+ import gradio as gr
21
+
22
+ # Playwright
23
+ from playwright.async_api import async_playwright, TimeoutError as PWTimeout
24
 
25
# ---------------- CONFIG ----------------
# All knobs are environment-driven so the container can be tuned without code changes.
DB_PATH = os.environ.get("DB_PATH", "data/properties.db")  # SQLite file location
PROXY_LIST = os.environ.get("PROXY_LIST")  # comma separated http://user:pass@ip:port
CAPTCHA_API_KEY = os.environ.get("CAPTCHA_API_KEY")  # optional; not referenced elsewhere in this file yet
MAX_CONCURRENT_BROWSERS = int(os.environ.get("MAX_BROWSERS", "2"))  # scraper concurrency throttle
DEFAULT_MAX_PAGES = int(os.environ.get("DEFAULT_MAX_PAGES", "2"))  # listing pages per source
# User agents rotated across scrape pages to vary the browser fingerprint.
USER_AGENTS = [
    # Expand this list for production
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15",
]

# Ensure the DB directory exists before any sqlite3.connect() call.
os.makedirs(os.path.dirname(DB_PATH) or ".", exist_ok=True)
39
+
40
+ # ---------------- DB ----------------
41
def init_db():
    """Create the `properties` table if it does not already exist."""
    connection = sqlite3.connect(DB_PATH)
    try:
        # connection.execute() creates an implicit cursor; same effect as the
        # explicit cursor dance, one object fewer.
        connection.execute(
            """
            CREATE TABLE IF NOT EXISTS properties (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT,
                price TEXT,
                currency TEXT,
                address TEXT,
                neighbourhood TEXT,
                lat REAL,
                lon REAL,
                bedrooms INTEGER,
                bathrooms INTEGER,
                surface REAL,
                amenities TEXT,
                source TEXT,
                url TEXT UNIQUE,
                scraped_at TEXT,
                raw_html TEXT
            )
            """
        )
        connection.commit()
    finally:
        # Close even if the DDL fails, matching the original's net effect.
        connection.close()
66
 
67
def save_property(item: Dict[str, Any]) -> bool:
    """Insert one scraped listing into the properties table.

    Returns True on insert, False when the URL already exists (the `url`
    column is UNIQUE, so re-scrapes dedupe automatically) or on any other
    DB error (logged, best-effort semantics preserved).
    """
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    try:
        # FIX: dict.get's default only applies when the key is MISSING; an
        # explicit raw_html=None slipped through and `None[:10000]` raised a
        # TypeError that the broad handler silently turned into a lost row.
        raw_html = (item.get("raw_html") or "")[:10000]
        c.execute(
            """
            INSERT INTO properties
            (title,price,currency,address,neighbourhood,lat,lon,bedrooms,bathrooms,surface,amenities,source,url,scraped_at,raw_html)
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
            """,
            (
                item.get("title"),
                item.get("price"),
                item.get("currency"),
                item.get("address"),
                item.get("neighbourhood"),
                item.get("lat"),
                item.get("lon"),
                item.get("bedrooms"),
                item.get("bathrooms"),
                item.get("surface"),
                # Amenities are stored as a JSON array string.
                json.dumps(item.get("amenities", []), ensure_ascii=False),
                item.get("source"),
                item.get("url"),
                datetime.utcnow().isoformat(),
                raw_html,
            ),
        )
        conn.commit()
        return True
    except sqlite3.IntegrityError:
        # already exists (UNIQUE url) — expected, not an error
        return False
    except Exception as e:
        print("DB save error:", e)
        return False
    finally:
        conn.close()
102
 
103
def query_db(q: Optional[str] = None, min_price: Optional[int] = None, max_price: Optional[int] = None, bedrooms: Optional[int] = None, source: Optional[str] = None, limit: int = 200):
    """Return stored properties matching the optional filters, newest first.

    FIX: `bedrooms` was accepted but silently ignored; it now filters on the
    INTEGER column. `min_price`/`max_price` remain unimplemented because
    price is stored as free-form text in mixed formats — parse it to a
    numeric column at scrape time before enabling those filters.
    """
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    c = conn.cursor()
    sql = "SELECT * FROM properties WHERE 1=1"
    params = []
    if q:
        # Free-text search across the three human-readable columns.
        sql += " AND (title LIKE ? OR address LIKE ? OR neighbourhood LIKE ?)"
        qv = f"%{q}%"
        params += [qv, qv, qv]
    if source:
        sql += " AND source = ?"
        params.append(source)
    if bedrooms is not None:
        sql += " AND bedrooms = ?"
        params.append(bedrooms)
    # TODO(review): min_price/max_price need numeric price storage first.
    sql += " ORDER BY scraped_at DESC LIMIT ?"
    params.append(limit)
    rows = c.execute(sql, params).fetchall()
    conn.close()
    return [dict(r) for r in rows]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
+ # ---------------- UTIL ----------------
124
def get_proxies_list():
    """Parse PROXY_LIST (comma-separated) into a list; empty when unset."""
    raw = PROXY_LIST or ""
    return [entry.strip() for entry in raw.split(",") if entry.strip()]
128
+
129
def pick_proxy(idx=0):
    """Round-robin pick from the configured proxy pool; None when no proxies."""
    pool = get_proxies_list()
    return pool[idx % len(pool)] if pool else None
134
+
135
async def validate_url(client: "httpx.AsyncClient", url: str) -> bool:
    """Return True when *url* answers 200 to a HEAD or, failing that, a GET.

    FIX: previously the GET fallback ran only when HEAD raised; a HEAD that
    returned a non-200 status (e.g. 405 Method Not Allowed, common on listing
    sites) made the function return False without ever trying GET. Now any
    HEAD failure — exception or non-200 — falls through to GET.
    """
    try:
        head_resp = await client.head(url, follow_redirects=True, timeout=15)
        if head_resp.status_code == 200:
            return True
    except Exception:
        pass  # fall through to GET
    try:
        get_resp = await client.get(url, follow_redirects=True, timeout=20)
        return get_resp.status_code == 200
    except Exception:
        return False
145
+
146
+ # ---------------- SCRAPERS (Playwright) ----------------
147
+ # Each scraper returns list[dict] with canonical fields (see save_property)
148
+
149
async def scrape_mercadolibre(pw, location: str, max_pages: int = 1, idx_offset=0) -> List[Dict[str, Any]]:
    """Scrape MercadoLibre Inmuebles listing pages (JS-rendered via Playwright).

    Returns a list of canonical property dicts (see save_property). Only the
    listing anchors are harvested here; detail fields are left as None.
    """
    out = []
    base = "https://listado.mercadolibre.com.ar"
    # build query: location with spaces turned into the URL's dash form
    q = location.replace(" ", "-")
    # BUG FIX: chromium.launch() is a coroutine, not an async context manager,
    # so `async with pw.chromium.launch(...)` raised a TypeError before any
    # page was scraped. Await the launch and close explicitly in `finally`.
    browser = await pw.chromium.launch(headless=True, args=["--no-sandbox"])
    try:
        for p in range(1, max_pages + 1):
            # MercadoLibre pagination is offset-based (50 results per page).
            page_path = f"/{q}_Desde_{(p-1)*50+1}"
            url = urljoin(base, page_path)
            proxy = pick_proxy(p - 1)
            context_args = {}
            if proxy:
                context_args["proxy"] = {"server": proxy}
            # Rotate user agents across pages to vary the fingerprint.
            ua = USER_AGENTS[(idx_offset + p) % len(USER_AGENTS)]
            context_args["user_agent"] = ua
            context = await browser.new_context(**context_args)
            page = await context.new_page()
            try:
                await page.goto(url, wait_until="networkidle", timeout=30000)
                # ML often lazy-loads cards; give the page a moment to settle.
                await page.wait_for_timeout(1500)
                html = await page.content()
            except PWTimeout:
                # Network never went idle — use whatever has rendered so far.
                html = await page.content()
            except Exception as e:
                print("ML page error:", e)
                html = ""
            finally:
                try:
                    await page.close()
                    await context.close()
                except Exception:
                    pass
            if not html:
                continue
            soup = BeautifulSoup(html, "html.parser")
            # Harvest every anchor that looks like a property-detail link.
            anchors = soup.select("a[href]")
            found = set()  # per-page dedupe; DB's UNIQUE url dedupes globally
            for a in anchors:
                href = a.get("href")
                if not href:
                    continue
                # heuristic: detail urls contain '/MLA-', '/MLO-' or '/inmuebles/'
                if re.search(r"/MLA-|/MLO-|/inmuebles/", href):
                    full = href if href.startswith("http") else urljoin(base, href)
                    if full in found:
                        continue
                    found.add(full)
                    title = (a.get_text(strip=True) or "Propiedad MercadoLibre")[:300]
                    out.append({
                        "title": title,
                        "price": None,
                        "currency": "ARS",
                        "address": None,
                        "neighbourhood": None,
                        "lat": None,
                        "lon": None,
                        "bedrooms": None,
                        "bathrooms": None,
                        "surface": None,
                        "amenities": [],
                        "source": "MercadoLibre",
                        "url": full,
                        "raw_html": str(a)[:8000]
                    })
    finally:
        await browser.close()
    return out
218
+
219
async def scrape_properati(pw, location: str, max_pages: int = 1, idx_offset=0) -> List[Dict[str, Any]]:
    """Scrape the Properati search page for *location*.

    NOTE(review): `max_pages` is accepted for signature parity with the other
    scrapers but only the first search page is fetched here.
    """
    out = []
    base = "https://www.properati.com.ar"
    url = f"{base}/search?q={location}"
    proxy = pick_proxy(idx_offset)
    ua = USER_AGENTS[idx_offset % len(USER_AGENTS)]
    # BUG FIX: chromium.launch() returns a coroutine and cannot be used with
    # `async with` directly — that raised a TypeError. Await it and close in
    # a finally block instead.
    browser = await pw.chromium.launch(headless=True, args=["--no-sandbox"])
    html = ""
    try:
        context_args = {"user_agent": ua}
        if proxy:
            context_args["proxy"] = {"server": proxy}
        context = await browser.new_context(**context_args)
        page = await context.new_page()
        try:
            await page.goto(url, wait_until="networkidle", timeout=30000)
            await page.wait_for_timeout(1200)
            html = await page.content()
        except Exception as e:
            print("Properati error:", e)
            html = ""
        try:
            await page.close()
            await context.close()
        except Exception:
            pass
    finally:
        await browser.close()
    if not html:
        return out
    soup = BeautifulSoup(html, "html.parser")
    cards = soup.select("a[href]")
    found = set()
    for a in cards:
        href = a.get("href")
        if not href:
            continue
        # Heuristic: Properati detail URLs contain one of these fragments.
        if "/property/" in href or "/inmuebles/" in href or "/propiedad" in href:
            full = href if href.startswith("http") else urljoin(base, href)
            if full in found:
                continue
            found.add(full)
            title = (a.get_text(strip=True) or "Propiedad Properati")[:300]
            out.append({
                "title": title,
                "price": None,
                "currency": "ARS",
                "address": None,
                "neighbourhood": None,
                "lat": None,
                "lon": None,
                "bedrooms": None,
                "bathrooms": None,
                "surface": None,
                "amenities": [],
                "source": "Properati",
                "url": full,
                "raw_html": str(a)[:8000]
            })
    return out
274
+
275
+ # Extendable: add ZonaProp, Inmuebles, ArgenProp, etc.
276
+
277
# ---------------- ORCHESTRATOR ----------------
app = FastAPI(title="SamuelHouseFinder API")
# Create the SQLite schema eagerly at import time so every endpoint can
# assume the `properties` table exists.
init_db()
280
 
281
async def run_all_scrapers(location: str, sources: List[str], max_pages: int = 1, force: bool = False) -> Dict[str, Any]:
    """Run the requested scrapers concurrently, then validate and persist hits.

    NOTE(review): `force` is accepted for API compatibility but has no effect
    yet — inserts already dedupe on the UNIQUE url column.
    Returns {"found": items scraped, "validated": items that passed the URL
    check and were handed to save_property}.
    """
    async with async_playwright() as pw:
        tasks = []
        for idx, s in enumerate(sources):
            name = s.lower()
            if name == "mercadolibre":
                tasks.append(scrape_mercadolibre(pw, location, max_pages=max_pages, idx_offset=idx))
            elif name == "properati":
                tasks.append(scrape_properati(pw, location, max_pages=max_pages, idx_offset=idx))
            # unknown sources are silently skipped
        # Throttle: at most MAX_CONCURRENT_BROWSERS scrapers run at once.
        sem = asyncio.Semaphore(MAX_CONCURRENT_BROWSERS)

        async def sem_task(coro):
            async with sem:
                return await coro

        gathered = await asyncio.gather(*[sem_task(t) for t in tasks], return_exceptions=True)
        # Flatten per-scraper result lists; log (don't raise) scraper failures.
        all_items = []
        for g in gathered:
            if isinstance(g, Exception):
                print("scrape exception:", g)
                continue
            all_items.extend(g)
        # Validate each listing URL before persisting it.
        valid = []
        async with httpx.AsyncClient(follow_redirects=True, timeout=20) as client:
            for it in all_items:
                if await validate_url(client, it["url"]):
                    save_property(it)
                    valid.append(it)
        return {"found": len(all_items), "validated": len(valid)}
318
+
319
+ @app.post("/api/scrape")
320
+ async def api_scrape(req: Request):
321
+ payload = await req.json()
322
+ location = payload.get("location")
323
+ if not location:
324
+ raise HTTPException(status_code=400, detail="location required")
325
+ sources = payload.get("sources", ["mercadolibre","properati"])
326
+ max_pages = int(payload.get("max_pages", DEFAULT_MAX_PAGES))
327
+ force = bool(payload.get("force", False))
328
+ result = await run_all_scrapers(location, sources, max_pages=max_pages, force=force)
329
+ return JSONResponse(result)
330
+
331
+ @app.get("/api/search")
332
+ async def api_search(q: Optional[str]=None, source: Optional[str]=None, limit:int=200):
333
+ data = query_db(q=q, source=source, limit=limit)
334
+ return JSONResponse({"items": data, "count": len(data)})
335
+
336
+ @app.get("/api/health")
337
+ async def health():
338
+ return JSONResponse({"ok":True,"time":datetime.utcnow().isoformat()})
339
+
340
+ # ---------------- GRADIO UI (simple) ----------------
341
def frontend_invoke_scrape(location, sources, max_pages, force_flag):
    """Gradio callback: POST the scrape request to the in-process API.

    Blocks until the scrape run finishes (same-process HTTP round trip).
    """
    import requests
    source_list = [item.strip() for item in sources.split(",") if item.strip()]
    payload = {
        "location": location,
        "sources": source_list,
        "max_pages": int(max_pages),
        "force": bool(force_flag),
    }
    try:
        resp = requests.post("http://127.0.0.1:8000/api/scrape", json=payload, timeout=600)
        resp.raise_for_status()
        return f"Scrape iniciado: {resp.json()}"
    except Exception as e:
        return f"Error al iniciar scrape: {e}"
351
+
352
def frontend_query(q_text, source):
    """Gradio callback: query the in-process search API, return table rows.

    Each row is [title, price, currency, source, url]; on failure a single
    error row is returned so the Dataframe still renders.
    """
    import requests
    try:
        params = {}
        if q_text:
            params["q"] = q_text
        if source:
            params["source"] = source
        resp = requests.get("http://127.0.0.1:8000/api/search", params=params, timeout=60)
        resp.raise_for_status()
        found = resp.json().get("items", [])
        return [
            [entry.get("title"), entry.get("price"), entry.get("currency"),
             entry.get("source"), entry.get("url")]
            for entry in found
        ]
    except Exception as e:
        return [["Error", str(e), "", "", ""]]
368
+
369
def mount_gradio():
    """Build and return the Gradio Blocks UI.

    Left column triggers a scrape via the local API; right column queries
    the SQLite store. Wiring happens inside the Blocks context.
    """
    with gr.Blocks(title="SamuelHouseFinder") as demo:
        gr.Markdown("## SamuelHouseFinder — Zona Norte (Saavedra → La Lucila)\nBackend con Playwright. Usá con cuidado y respetá TOS de portales.")
        with gr.Row():
            with gr.Column():
                location_box = gr.Textbox(label="Ubicación (ej: Martinez, Olivos, Saavedra)", value="Saavedra")
                sources_box = gr.Textbox(label="Fuentes (csv)", value="mercadolibre,properati")
                pages_slider = gr.Slider(label="Páginas por fuente", minimum=1, maximum=5, value=1)
                force_check = gr.Checkbox(label="Forzar re-scrape", value=False)
                scrape_button = gr.Button("Buscar y Scrappear")
                status_box = gr.Textbox(label="Estado")
            with gr.Column():
                query_box = gr.Textbox(label="Buscar en DB (texto libre)", value="")
                query_source_box = gr.Textbox(label="Fuente (opcional)", value="")
                query_button = gr.Button("Consultar DB")
                results_table = gr.Dataframe(headers=["title", "price", "currency", "source", "url"], datatype=["str", "str", "str", "str", "str"])
        scrape_button.click(frontend_invoke_scrape, inputs=[location_box, sources_box, pages_slider, force_check], outputs=[status_box])
        query_button.click(frontend_query, inputs=[query_box, query_source_box], outputs=[results_table])
    return demo
388
 
389
+ # ---------------- RUN ----------------
390
  if __name__ == "__main__":
391
+ # Run FastAPI + Gradio in same process: start FastAPI in background, then Gradio
392
+ import threading, time
393
+ def start_uvicorn():
394
+ uvicorn.run("app:app", host="0.0.0.0", port=8000, log_level="info")
395
+ t = threading.Thread(target=start_uvicorn, daemon=True)
396
+ t.start()
397
+ # wait a moment for server
398
+ time.sleep(1.5)
399
+ demo = mount_gradio()
400
+ demo.launch(server_name="0.0.0.0", server_port=7860)