Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,94 +1,400 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
import
|
|
|
|
|
|
|
| 5 |
import re
|
| 6 |
-
import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
| 11 |
def init_db():
|
| 12 |
-
conn = sqlite3.connect(
|
| 13 |
c = conn.cursor()
|
| 14 |
-
c.execute(
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
conn.commit()
|
| 23 |
conn.close()
|
| 24 |
|
| 25 |
-
def
|
| 26 |
-
conn = sqlite3.connect(
|
| 27 |
c = conn.cursor()
|
| 28 |
try:
|
| 29 |
-
c.execute("
|
| 30 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
conn.commit()
|
|
|
|
| 32 |
except sqlite3.IntegrityError:
|
| 33 |
-
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
-
def
|
| 37 |
-
conn = sqlite3.connect(
|
|
|
|
| 38 |
c = conn.cursor()
|
| 39 |
-
|
| 40 |
params = []
|
| 41 |
-
if
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
| 49 |
conn.close()
|
| 50 |
-
return
|
| 51 |
-
|
| 52 |
-
# ------------------- SCRAPER -------------------
|
| 53 |
-
def scrape_mercadolibre_zonanorte():
|
| 54 |
-
url = "https://inmuebles.mercadolibre.com.ar/venta/capital-federal/zona-norte/"
|
| 55 |
-
r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
|
| 56 |
-
soup = BeautifulSoup(r.text, "html.parser")
|
| 57 |
-
cards = soup.find_all("li", {"class": "ui-search-layout__item"})
|
| 58 |
-
|
| 59 |
-
for card in cards:
|
| 60 |
-
titulo_tag = card.find("h2")
|
| 61 |
-
precio_tag = card.find("span", {"class": "price-tag-fraction"})
|
| 62 |
-
link_tag = card.find("a", href=True)
|
| 63 |
-
ubicacion_tag = card.find("span", {"class": "ui-search-item__location"})
|
| 64 |
-
|
| 65 |
-
if titulo_tag and precio_tag and link_tag:
|
| 66 |
-
titulo = titulo_tag.text.strip()
|
| 67 |
-
precio = precio_tag.text.strip()
|
| 68 |
-
link = link_tag['href'].split('#')[0]
|
| 69 |
-
ubicacion = ubicacion_tag.text.strip() if ubicacion_tag else "N/D"
|
| 70 |
-
guardar_propiedad(titulo, precio, ubicacion, link)
|
| 71 |
-
|
| 72 |
-
# ------------------- INTERFAZ -------------------
|
| 73 |
-
def actualizar_y_buscar(precio, ubicacion):
|
| 74 |
-
scrape_mercadolibre_zonanorte()
|
| 75 |
-
data = buscar_propiedades(precio, ubicacion)
|
| 76 |
-
if not data:
|
| 77 |
-
return "No se encontraron propiedades con esos filtros."
|
| 78 |
-
table = ""
|
| 79 |
-
for t, p, u, l in data:
|
| 80 |
-
table += f"🏠 {t}\n💰 {p}\n📍 {u}\n🔗 {l}\n\n"
|
| 81 |
-
return table
|
| 82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
init_db()
|
| 84 |
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 92 |
|
|
|
|
| 93 |
if __name__ == "__main__":
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py
|
| 2 |
+
# SamuelHouseFinder - FastAPI + Playwright scrapers + Gradio UI (single file)
|
| 3 |
+
# Requisitos: ejecutar en contenedor con Playwright browsers (Dockerfile incluido abajo)
|
| 4 |
+
import os
|
| 5 |
+
import asyncio
|
| 6 |
+
import json
|
| 7 |
import re
|
| 8 |
+
import sqlite3
|
| 9 |
+
from typing import List, Dict, Any, Optional
|
| 10 |
+
from datetime import datetime
|
| 11 |
+
from urllib.parse import urljoin
|
| 12 |
+
|
| 13 |
+
from fastapi import FastAPI, HTTPException, Request, BackgroundTasks
|
| 14 |
+
from fastapi.responses import JSONResponse, HTMLResponse
|
| 15 |
+
import uvicorn
|
| 16 |
+
import httpx
|
| 17 |
+
from bs4 import BeautifulSoup
|
| 18 |
+
|
| 19 |
+
# Gradio UI
|
| 20 |
+
import gradio as gr
|
| 21 |
+
|
| 22 |
+
# Playwright
|
| 23 |
+
from playwright.async_api import async_playwright, TimeoutError as PWTimeout
|
| 24 |
|
| 25 |
+
# ---------------- CONFIG ----------------
# Runtime configuration; every value can be overridden via environment variables.
DB_PATH = os.environ.get("DB_PATH", "data/properties.db")  # SQLite database file
PROXY_LIST = os.environ.get("PROXY_LIST")  # comma separated http://user:pass@ip:port
CAPTCHA_API_KEY = os.environ.get("CAPTCHA_API_KEY")  # optional; NOTE(review): read but never used in this file — presumably reserved for a captcha-solving service
MAX_CONCURRENT_BROWSERS = int(os.environ.get("MAX_BROWSERS", "2"))  # throttle for concurrent Playwright browser launches
DEFAULT_MAX_PAGES = int(os.environ.get("DEFAULT_MAX_PAGES", "2"))  # pages per source when the API caller omits max_pages
# User agents rotated per request to reduce trivial bot detection.
USER_AGENTS = [
    # Expand this list for production
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15",
]

# Ensure the DB directory exists before sqlite3.connect creates the file.
os.makedirs(os.path.dirname(DB_PATH) or ".", exist_ok=True)
|
| 39 |
+
|
| 40 |
+
# ---------------- DB ----------------
|
| 41 |
def init_db():
    """Create the `properties` table on first run; a no-op afterwards."""
    connection = sqlite3.connect(DB_PATH)
    connection.execute("""
    CREATE TABLE IF NOT EXISTS properties (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        title TEXT,
        price TEXT,
        currency TEXT,
        address TEXT,
        neighbourhood TEXT,
        lat REAL,
        lon REAL,
        bedrooms INTEGER,
        bathrooms INTEGER,
        surface REAL,
        amenities TEXT,
        source TEXT,
        url TEXT UNIQUE,
        scraped_at TEXT,
        raw_html TEXT
    )
    """)
    connection.commit()
    connection.close()
|
| 66 |
|
| 67 |
+
def save_property(item: Dict[str, Any]) -> bool:
    """Persist one scraped listing.

    Returns True on insert, False when the URL already exists (UNIQUE
    constraint) or when any other error occurs (best-effort, logged).
    """
    conn = sqlite3.connect(DB_PATH)
    cursor = conn.cursor()
    try:
        row = (
            item.get("title"),
            item.get("price"),
            item.get("currency"),
            item.get("address"),
            item.get("neighbourhood"),
            item.get("lat"),
            item.get("lon"),
            item.get("bedrooms"),
            item.get("bathrooms"),
            item.get("surface"),
            json.dumps(item.get("amenities", []), ensure_ascii=False),
            item.get("source"),
            item.get("url"),
            datetime.utcnow().isoformat(),
            # raw_html is truncated to keep row size bounded
            item.get("raw_html", "")[:10000],
        )
        cursor.execute("""
        INSERT INTO properties
        (title,price,currency,address,neighbourhood,lat,lon,bedrooms,bathrooms,surface,amenities,source,url,scraped_at,raw_html)
        VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
        """, row)
        conn.commit()
        return True
    except sqlite3.IntegrityError:
        # already exists
        return False
    except Exception as e:
        print("DB save error:", e)
        return False
    finally:
        conn.close()
|
| 102 |
|
| 103 |
+
def query_db(q: Optional[str]=None, min_price: Optional[int]=None, max_price: Optional[int]=None, bedrooms: Optional[int]=None, source: Optional[str]=None, limit: int=200):
    """Return up to *limit* stored properties as dicts, newest first.

    q        -- free-text LIKE match against title/address/neighbourhood.
    bedrooms -- exact match on the INTEGER bedrooms column.
    source   -- exact match on the scraper name.
    min_price/max_price -- currently NOT applied: price is stored as raw
        text in mixed formats; parse it into a numeric column before
        filtering on it (previously these, and bedrooms, were silently
        ignored).
    """
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    c = conn.cursor()
    sql = "SELECT * FROM properties WHERE 1=1"
    params = []
    if q:
        sql += " AND (title LIKE ? OR address LIKE ? OR neighbourhood LIKE ?)"
        qv = f"%{q}%"
        params += [qv, qv, qv]
    if bedrooms is not None:
        # bedrooms is stored as INTEGER, so an exact filter is safe here.
        sql += " AND bedrooms = ?"
        params.append(bedrooms)
    if source:
        sql += " AND source = ?"
        params.append(source)
    sql += " ORDER BY scraped_at DESC LIMIT ?"
    params.append(limit)
    rows = c.execute(sql, params).fetchall()
    conn.close()
    return [dict(r) for r in rows]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
|
| 123 |
+
# ---------------- UTIL ----------------
|
| 124 |
+
def get_proxies_list():
    """Parse the PROXY_LIST env value (comma-separated URLs) into a list."""
    raw = PROXY_LIST
    if not raw:
        return []
    candidates = (part.strip() for part in raw.split(","))
    return [entry for entry in candidates if entry]
|
| 128 |
+
|
| 129 |
+
def pick_proxy(idx=0):
    """Round-robin one proxy from the configured pool; None when empty."""
    pool = get_proxies_list()
    return pool[idx % len(pool)] if pool else None
|
| 134 |
+
|
| 135 |
+
async def validate_url(client: httpx.AsyncClient, url: str) -> bool:
    """Return True if *url* is reachable (HTTP 200).

    Tries a cheap HEAD first; falls back to a full GET both when HEAD
    raises AND when it returns a non-200 status — many servers answer
    HEAD with 403/405 while serving GET normally (the previous version
    only fell back on exception, so those URLs were wrongly rejected).
    """
    try:
        r = await client.head(url, follow_redirects=True, timeout=15)
        if r.status_code == 200:
            return True
    except Exception:
        pass
    try:
        r2 = await client.get(url, follow_redirects=True, timeout=20)
        return r2.status_code == 200
    except Exception:
        return False
|
| 145 |
+
|
| 146 |
+
# ---------------- SCRAPERS (Playwright) ----------------
|
| 147 |
+
# Each scraper returns list[dict] with canonical fields (see save_property)
|
| 148 |
+
|
| 149 |
+
async def scrape_mercadolibre(pw, location: str, max_pages: int=1, idx_offset=0) -> List[Dict[str, Any]]:
    """Scrapes MercadoLibre Inmuebles listing pages (rendered).

    Fix: the async-API `pw.chromium.launch(...)` returns a coroutine, which
    is NOT an async context manager — `async with pw.chromium.launch(...)`
    raises TypeError at runtime. The browser is now awaited explicitly and
    closed in a finally block.
    """
    out = []
    base = "https://listado.mercadolibre.com.ar"
    # build query: try location as-is and also appended 'venta'
    q = location.replace(" ", "-")
    browser = await pw.chromium.launch(headless=True, args=["--no-sandbox"])
    try:
        for p in range(1, max_pages + 1):
            # MercadoLibre pagination is usually offset-based; try two patterns
            page_path = f"/{q}_Desde_{(p-1)*50+1}"
            url = urljoin(base, page_path)
            proxy = pick_proxy(p - 1)
            context_args = {}
            if proxy:
                context_args["proxy"] = {"server": proxy}
            # Rotate user agents per page to reduce trivial bot detection.
            ua = USER_AGENTS[(idx_offset + p) % len(USER_AGENTS)]
            context_args["user_agent"] = ua
            context = await browser.new_context(**context_args)
            page = await context.new_page()
            try:
                await page.goto(url, wait_until="networkidle", timeout=30000)
                # ML often lazy-loads; ensure content loaded
                await page.wait_for_timeout(1500)
                html = await page.content()
            except PWTimeout:
                # Timed out waiting for network idle — use whatever rendered.
                html = await page.content()
            except Exception as e:
                print("ML page error:", e)
                html = ""
            finally:
                try:
                    await page.close()
                    await context.close()
                except Exception:
                    pass
            if not html:
                continue
            soup = BeautifulSoup(html, "html.parser")
            # Search for anchor tags that likely link to properties
            anchors = soup.select("a[href]")
            found = set()
            for a in anchors:
                href = a.get("href")
                if not href:
                    continue
                # heuristics: property detail urls often contain '/MLA-' or '/MLO-' or '/inmuebles'
                if re.search(r"/MLA-|/MLO-|/inmuebles/", href):
                    full = href if href.startswith("http") else urljoin(base, href)
                    if full in found:
                        continue
                    found.add(full)
                    title = (a.get_text(strip=True) or "Propiedad MercadoLibre")[:300]
                    # Only the URL/title are known from the listing page; the
                    # remaining canonical fields are filled with None/defaults.
                    out.append({
                        "title": title,
                        "price": None,
                        "currency": "ARS",
                        "address": None,
                        "neighbourhood": None,
                        "lat": None,
                        "lon": None,
                        "bedrooms": None,
                        "bathrooms": None,
                        "surface": None,
                        "amenities": [],
                        "source": "MercadoLibre",
                        "url": full,
                        "raw_html": str(a)[:8000]
                    })
    finally:
        await browser.close()
    return out
|
| 218 |
+
|
| 219 |
+
async def scrape_properati(pw, location: str, max_pages: int=1, idx_offset=0) -> List[Dict[str, Any]]:
    """Scrape the Properati search page for *location* into listing stubs.

    Fix: same as scrape_mercadolibre — `launch()` must be awaited; the
    coroutine it returns is not an async context manager, so the previous
    `async with pw.chromium.launch(...)` raised TypeError at runtime.

    NOTE(review): max_pages is accepted for signature parity with the other
    scrapers but only the first results page is fetched here.
    """
    out = []
    base = "https://www.properati.com.ar"
    url = f"{base}/search?q={location}"
    proxy = pick_proxy(idx_offset)
    ua = USER_AGENTS[idx_offset % len(USER_AGENTS)]
    browser = await pw.chromium.launch(headless=True, args=["--no-sandbox"])
    try:
        context_args = {"user_agent": ua}
        if proxy:
            context_args["proxy"] = {"server": proxy}
        context = await browser.new_context(**context_args)
        page = await context.new_page()
        try:
            await page.goto(url, wait_until="networkidle", timeout=30000)
            await page.wait_for_timeout(1200)
            html = await page.content()
        except Exception as e:
            print("Properati error:", e)
            html = ""
        try:
            await page.close()
            await context.close()
        except Exception:
            pass
    finally:
        await browser.close()
    if not html:
        return out
    soup = BeautifulSoup(html, "html.parser")
    cards = soup.select("a[href]")
    found = set()
    for a in cards:
        href = a.get("href")
        if not href:
            continue
        # Heuristic: detail pages contain one of these path fragments.
        if "/property/" in href or "/inmuebles/" in href or "/propiedad" in href:
            full = href if href.startswith("http") else urljoin(base, href)
            if full in found:
                continue
            found.add(full)
            title = (a.get_text(strip=True) or "Propiedad Properati")[:300]
            # Listing page only exposes URL/title; other fields default.
            out.append({
                "title": title,
                "price": None,
                "currency": "ARS",
                "address": None,
                "neighbourhood": None,
                "lat": None,
                "lon": None,
                "bedrooms": None,
                "bathrooms": None,
                "surface": None,
                "amenities": [],
                "source": "Properati",
                "url": full,
                "raw_html": str(a)[:8000]
            })
    return out
|
| 274 |
+
|
| 275 |
+
# Extendable: add ZonaProp, Inmuebles, ArgenProp, etc.
|
| 276 |
+
|
| 277 |
+
# ---------------- ORCHESTRATOR ----------------
# FastAPI application object; the schema is created eagerly at import time
# so every endpoint can assume the `properties` table exists.
app = FastAPI(title="SamuelHouseFinder API")
init_db()
|
| 280 |
|
| 281 |
+
async def run_all_scrapers(location: str, sources: List[str], max_pages: int=1, force: bool=False) -> Dict[str, Any]:
    """Run the selected scrapers concurrently, validate URLs, persist hits.

    Returns {"found": <total scraped items>, "validated": <items whose URL
    answered 200 and were handed to save_property>}.

    NOTE(review): `force` is accepted but not consumed by any scraper yet —
    kept for API compatibility; wire it up when re-scrape logic exists.

    Cleanup vs. previous version: removed dead `results = []` and the
    redundant `gathered=[]` pre-assignment, and the unused enumerate index
    / `saved` bindings. Behavior is unchanged.
    """
    async with async_playwright() as pw:
        tasks = []
        for idx, s in enumerate(sources):
            if s.lower() == "mercadolibre":
                tasks.append(scrape_mercadolibre(pw, location, max_pages=max_pages, idx_offset=idx))
            elif s.lower() == "properati":
                tasks.append(scrape_properati(pw, location, max_pages=max_pages, idx_offset=idx))
            # unknown sources are silently skipped

        # Run concurrently but throttle by MAX_CONCURRENT_BROWSERS so we
        # never launch more browsers than the container can afford.
        sem = asyncio.Semaphore(MAX_CONCURRENT_BROWSERS)

        async def sem_task(coro):
            async with sem:
                return await coro

        gathered = await asyncio.gather(*[sem_task(t) for t in tasks], return_exceptions=True)

    # Flatten results; a failed scraper is logged but does not abort the run.
    all_items = []
    for g in gathered:
        if isinstance(g, Exception):
            print("scrape exception:", g)
            continue
        all_items.extend(g)

    # Validate each URL before persisting; duplicate URLs are rejected by
    # the DB's UNIQUE constraint inside save_property.
    valid = []
    async with httpx.AsyncClient(follow_redirects=True, timeout=20) as client:
        for it in all_items:
            if await validate_url(client, it["url"]):
                save_property(it)
                valid.append(it)
    return {"found": len(all_items), "validated": len(valid)}
|
| 318 |
+
|
| 319 |
+
@app.post("/api/scrape")
async def api_scrape(req: Request):
    """Trigger a scrape run.

    JSON body: {"location": str (required), "sources": [str], "max_pages": int,
    "force": bool}. Responds with run_all_scrapers' summary dict.
    """
    body = await req.json()
    if not body.get("location"):
        raise HTTPException(status_code=400, detail="location required")
    result = await run_all_scrapers(
        body.get("location"),
        body.get("sources", ["mercadolibre","properati"]),
        max_pages=int(body.get("max_pages", DEFAULT_MAX_PAGES)),
        force=bool(body.get("force", False)),
    )
    return JSONResponse(result)
|
| 330 |
+
|
| 331 |
+
@app.get("/api/search")
async def api_search(q: Optional[str]=None, source: Optional[str]=None, limit: int=200):
    """Search stored listings; responds with {"items": [...], "count": n}."""
    items = query_db(q=q, source=source, limit=limit)
    return JSONResponse({"items": items, "count": len(items)})
|
| 335 |
+
|
| 336 |
+
@app.get("/api/health")
async def health():
    """Liveness probe: always ok, with the current UTC timestamp."""
    status = {"ok":True,"time":datetime.utcnow().isoformat()}
    return JSONResponse(status)
|
| 339 |
+
|
| 340 |
+
# ---------------- GRADIO UI (simple) ----------------
|
| 341 |
+
def frontend_invoke_scrape(location, sources, max_pages, force_flag):
    """Gradio handler: kick off a scrape via the local API, return a status string."""
    # call local API (same process) synchronously
    import requests
    source_list = [s.strip() for s in sources.split(",") if s.strip()]
    payload = {
        "location": location,
        "sources": source_list,
        "max_pages": int(max_pages),
        "force": bool(force_flag),
    }
    try:
        resp = requests.post("http://127.0.0.1:8000/api/scrape", json=payload, timeout=600)
        resp.raise_for_status()
        return f"Scrape iniciado: {resp.json()}"
    except Exception as exc:
        return f"Error al iniciar scrape: {exc}"
|
| 351 |
+
|
| 352 |
+
def frontend_query(q_text, source):
    """Gradio handler: query the local search API, shape rows for the Dataframe."""
    import requests
    try:
        # Only forward parameters the user actually filled in.
        params = {key: val for key, val in (("q", q_text), ("source", source)) if val}
        resp = requests.get("http://127.0.0.1:8000/api/search", params=params, timeout=60)
        resp.raise_for_status()
        items = resp.json().get("items", [])
        return [
            [it.get("title"), it.get("price"), it.get("currency"), it.get("source"), it.get("url")]
            for it in items
        ]
    except Exception as exc:
        # Surface the failure inside the table itself.
        return [["Error", str(exc), "", "", ""]]
|
| 368 |
+
|
| 369 |
+
def mount_gradio():
    """Build the Gradio Blocks UI: left column triggers scrapes, right column queries the DB."""
    with gr.Blocks(title="SamuelHouseFinder") as demo:
        gr.Markdown("## SamuelHouseFinder — Zona Norte (Saavedra → La Lucila)\nBackend con Playwright. Usá con cuidado y respetá TOS de portales.")
        with gr.Row():
            # Scrape-trigger controls.
            with gr.Column():
                loc = gr.Textbox(label="Ubicación (ej: Martinez, Olivos, Saavedra)", value="Saavedra")
                srcs = gr.Textbox(label="Fuentes (csv)", value="mercadolibre,properati")
                pages = gr.Slider(label="Páginas por fuente", minimum=1, maximum=5, value=1)
                force = gr.Checkbox(label="Forzar re-scrape", value=False)
                btn = gr.Button("Buscar y Scrappear")
                out = gr.Textbox(label="Estado")
            # DB-query controls.
            with gr.Column():
                qtxt = gr.Textbox(label="Buscar en DB (texto libre)", value="")
                qsrc = gr.Textbox(label="Fuente (opcional)", value="")
                qbtn = gr.Button("Consultar DB")
                table = gr.Dataframe(headers=["title","price","currency","source","url"], datatype=["str","str","str","str","str"])
        # Wire buttons to the HTTP-calling frontend helpers (same process).
        btn.click(frontend_invoke_scrape, inputs=[loc, srcs, pages, force], outputs=[out])
        qbtn.click(frontend_query, inputs=[qtxt, qsrc], outputs=[table])
    return demo
|
| 388 |
|
| 389 |
+
# ---------------- RUN ----------------
if __name__ == "__main__":
    # Run FastAPI + Gradio in same process: start FastAPI in background, then Gradio
    import threading, time
    def start_uvicorn():
        # Serves the API on :8000; referenced by the frontend helpers via
        # http://127.0.0.1:8000. NOTE(review): uvicorn runs in a non-main
        # thread here — confirm signal handling is a no-op in that case.
        uvicorn.run("app:app", host="0.0.0.0", port=8000, log_level="info")
    t = threading.Thread(target=start_uvicorn, daemon=True)
    t.start()
    # wait a moment for server
    time.sleep(1.5)
    # Gradio UI on :7860 (the port Hugging Face Spaces expects).
    demo = mount_gradio()
    demo.launch(server_name="0.0.0.0", server_port=7860)
|