Lukeetah committed on
Commit
a801f06
·
verified ·
1 Parent(s): f8c719d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +379 -73
app.py CHANGED
@@ -1,94 +1,400 @@
1
- import gradio as gr
2
- import requests
3
- from bs4 import BeautifulSoup
4
- import sqlite3
 
 
5
  import re
6
- import time
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
- DB_NAME = "propiedades.db"
 
 
 
 
 
 
 
 
 
 
 
9
 
10
- # ------------------- DB -------------------
 
 
11
  def init_db():
12
- conn = sqlite3.connect(DB_NAME)
13
  c = conn.cursor()
14
- c.execute('''CREATE TABLE IF NOT EXISTS propiedades (
15
- id INTEGER PRIMARY KEY AUTOINCREMENT,
16
- titulo TEXT,
17
- precio TEXT,
18
- ubicacion TEXT,
19
- link TEXT UNIQUE,
20
- fecha TEXT
21
- )''')
 
 
 
 
 
 
 
 
 
 
 
 
22
  conn.commit()
23
  conn.close()
24
 
25
- def guardar_propiedad(titulo, precio, ubicacion, link):
26
- conn = sqlite3.connect(DB_NAME)
27
  c = conn.cursor()
28
  try:
29
- c.execute("INSERT INTO propiedades (titulo, precio, ubicacion, link, fecha) VALUES (?, ?, ?, ?, date('now'))",
30
- (titulo, precio, ubicacion, link))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  conn.commit()
 
32
  except sqlite3.IntegrityError:
33
- pass
34
- conn.close()
 
 
 
 
 
35
 
36
- def buscar_propiedades(filtro_precio=None, ubicacion=None):
37
- conn = sqlite3.connect(DB_NAME)
 
38
  c = conn.cursor()
39
- query = "SELECT titulo, precio, ubicacion, link FROM propiedades WHERE 1=1"
40
  params = []
41
- if filtro_precio:
42
- query += " AND precio LIKE ?"
43
- params.append(f"%{filtro_precio}%")
44
- if ubicacion:
45
- query += " AND ubicacion LIKE ?"
46
- params.append(f"%{ubicacion}%")
47
- c.execute(query, params)
48
- data = c.fetchall()
 
 
 
49
  conn.close()
50
- return data
51
-
52
- # ------------------- SCRAPER -------------------
53
- def scrape_mercadolibre_zonanorte():
54
- url = "https://inmuebles.mercadolibre.com.ar/venta/capital-federal/zona-norte/"
55
- r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
56
- soup = BeautifulSoup(r.text, "html.parser")
57
- cards = soup.find_all("li", {"class": "ui-search-layout__item"})
58
-
59
- for card in cards:
60
- titulo_tag = card.find("h2")
61
- precio_tag = card.find("span", {"class": "price-tag-fraction"})
62
- link_tag = card.find("a", href=True)
63
- ubicacion_tag = card.find("span", {"class": "ui-search-item__location"})
64
-
65
- if titulo_tag and precio_tag and link_tag:
66
- titulo = titulo_tag.text.strip()
67
- precio = precio_tag.text.strip()
68
- link = link_tag['href'].split('#')[0]
69
- ubicacion = ubicacion_tag.text.strip() if ubicacion_tag else "N/D"
70
- guardar_propiedad(titulo, precio, ubicacion, link)
71
-
72
- # ------------------- INTERFAZ -------------------
73
- def actualizar_y_buscar(precio, ubicacion):
74
- scrape_mercadolibre_zonanorte()
75
- data = buscar_propiedades(precio, ubicacion)
76
- if not data:
77
- return "No se encontraron propiedades con esos filtros."
78
- table = ""
79
- for t, p, u, l in data:
80
- table += f"🏠 {t}\n💰 {p}\n📍 {u}\n🔗 {l}\n\n"
81
- return table
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  init_db()
84
 
85
- with gr.Blocks() as demo:
86
- gr.Markdown("# 🏡 Samuel House Finder — Zona Norte BA")
87
- precio_in = gr.Textbox(label="Filtrar por precio (ej: 100000)")
88
- ubicacion_in = gr.Textbox(label="Filtrar por ubicación (ej: Saavedra)")
89
- boton = gr.Button("Buscar y Actualizar")
90
- salida = gr.Textbox(label="Resultados", lines=20)
91
- boton.click(actualizar_y_buscar, inputs=[precio_in, ubicacion_in], outputs=salida)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
 
93
  if __name__ == "__main__":
94
- demo.launch()
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ # SamuelHouseFinder - FastAPI + Playwright scrapers + Gradio UI (single file)
3
+ # Requisitos: ejecutar en contenedor con Playwright browsers (Dockerfile incluido abajo)
4
+ import os
5
+ import asyncio
6
+ import json
7
  import re
8
+ import sqlite3
9
+ from typing import List, Dict, Any, Optional
10
+ from datetime import datetime
11
+ from urllib.parse import urljoin
12
+
13
+ from fastapi import FastAPI, HTTPException, Request, BackgroundTasks
14
+ from fastapi.responses import JSONResponse, HTMLResponse
15
+ import uvicorn
16
+ import httpx
17
+ from bs4 import BeautifulSoup
18
+
19
+ # Gradio UI
20
+ import gradio as gr
21
+
22
+ # Playwright
23
+ from playwright.async_api import async_playwright, TimeoutError as PWTimeout
24
 
25
# ---------------- CONFIG ----------------
# All knobs are environment-driven so the container can be tuned without code changes.
DB_PATH = os.environ.get("DB_PATH", "data/properties.db")  # SQLite file location
PROXY_LIST = os.environ.get("PROXY_LIST")  # comma separated http://user:pass@ip:port
CAPTCHA_API_KEY = os.environ.get("CAPTCHA_API_KEY")  # optional; not referenced elsewhere in this file yet
MAX_CONCURRENT_BROWSERS = int(os.environ.get("MAX_BROWSERS", "2"))  # scraper concurrency throttle
DEFAULT_MAX_PAGES = int(os.environ.get("DEFAULT_MAX_PAGES", "2"))  # listing pages per source
# User agents rotated across scrape pages to vary the browser fingerprint.
USER_AGENTS = [
    # Expand this list for production
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.0 Safari/605.1.15",
]

# Ensure the DB directory exists before any sqlite3.connect() call.
os.makedirs(os.path.dirname(DB_PATH) or ".", exist_ok=True)
39
+
40
+ # ---------------- DB ----------------
41
def init_db():
    """Create the `properties` table if it does not already exist."""
    connection = sqlite3.connect(DB_PATH)
    try:
        # connection.execute() creates an implicit cursor; same effect as the
        # explicit cursor dance, one object fewer.
        connection.execute(
            """
            CREATE TABLE IF NOT EXISTS properties (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT,
                price TEXT,
                currency TEXT,
                address TEXT,
                neighbourhood TEXT,
                lat REAL,
                lon REAL,
                bedrooms INTEGER,
                bathrooms INTEGER,
                surface REAL,
                amenities TEXT,
                source TEXT,
                url TEXT UNIQUE,
                scraped_at TEXT,
                raw_html TEXT
            )
            """
        )
        connection.commit()
    finally:
        # Close even if the DDL fails, matching the original's net effect.
        connection.close()
66
 
67
def save_property(item: Dict[str, Any]) -> bool:
    """Insert one scraped listing into the properties table.

    Returns True on insert, False when the URL already exists (the `url`
    column is UNIQUE, so re-scrapes dedupe automatically) or on any other
    DB error (logged, best-effort semantics preserved).
    """
    conn = sqlite3.connect(DB_PATH)
    c = conn.cursor()
    try:
        # FIX: dict.get's default only applies when the key is MISSING; an
        # explicit raw_html=None slipped through and `None[:10000]` raised a
        # TypeError that the broad handler silently turned into a lost row.
        raw_html = (item.get("raw_html") or "")[:10000]
        c.execute(
            """
            INSERT INTO properties
            (title,price,currency,address,neighbourhood,lat,lon,bedrooms,bathrooms,surface,amenities,source,url,scraped_at,raw_html)
            VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)
            """,
            (
                item.get("title"),
                item.get("price"),
                item.get("currency"),
                item.get("address"),
                item.get("neighbourhood"),
                item.get("lat"),
                item.get("lon"),
                item.get("bedrooms"),
                item.get("bathrooms"),
                item.get("surface"),
                # Amenities are stored as a JSON array string.
                json.dumps(item.get("amenities", []), ensure_ascii=False),
                item.get("source"),
                item.get("url"),
                datetime.utcnow().isoformat(),
                raw_html,
            ),
        )
        conn.commit()
        return True
    except sqlite3.IntegrityError:
        # already exists (UNIQUE url) — expected, not an error
        return False
    except Exception as e:
        print("DB save error:", e)
        return False
    finally:
        conn.close()
102
 
103
def query_db(q: Optional[str] = None, min_price: Optional[int] = None, max_price: Optional[int] = None, bedrooms: Optional[int] = None, source: Optional[str] = None, limit: int = 200):
    """Return stored properties matching the optional filters, newest first.

    FIX: `bedrooms` was accepted but silently ignored; it now filters on the
    INTEGER column. `min_price`/`max_price` remain unimplemented because
    price is stored as free-form text in mixed formats — parse it to a
    numeric column at scrape time before enabling those filters.
    """
    conn = sqlite3.connect(DB_PATH)
    conn.row_factory = sqlite3.Row
    c = conn.cursor()
    sql = "SELECT * FROM properties WHERE 1=1"
    params = []
    if q:
        # Free-text search across the three human-readable columns.
        sql += " AND (title LIKE ? OR address LIKE ? OR neighbourhood LIKE ?)"
        qv = f"%{q}%"
        params += [qv, qv, qv]
    if source:
        sql += " AND source = ?"
        params.append(source)
    if bedrooms is not None:
        sql += " AND bedrooms = ?"
        params.append(bedrooms)
    # TODO(review): min_price/max_price need numeric price storage first.
    sql += " ORDER BY scraped_at DESC LIMIT ?"
    params.append(limit)
    rows = c.execute(sql, params).fetchall()
    conn.close()
    return [dict(r) for r in rows]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
+ # ---------------- UTIL ----------------
124
def get_proxies_list():
    """Parse PROXY_LIST (comma-separated) into a list; empty when unset."""
    raw = PROXY_LIST or ""
    return [entry.strip() for entry in raw.split(",") if entry.strip()]
128
+
129
def pick_proxy(idx=0):
    """Round-robin pick from the configured proxy pool; None when no proxies."""
    pool = get_proxies_list()
    return pool[idx % len(pool)] if pool else None
134
+
135
async def validate_url(client: "httpx.AsyncClient", url: str) -> bool:
    """Return True when *url* answers 200 to a HEAD or, failing that, a GET.

    FIX: previously the GET fallback ran only when HEAD raised; a HEAD that
    returned a non-200 status (e.g. 405 Method Not Allowed, common on listing
    sites) made the function return False without ever trying GET. Now any
    HEAD failure — exception or non-200 — falls through to GET.
    """
    try:
        head_resp = await client.head(url, follow_redirects=True, timeout=15)
        if head_resp.status_code == 200:
            return True
    except Exception:
        pass  # fall through to GET
    try:
        get_resp = await client.get(url, follow_redirects=True, timeout=20)
        return get_resp.status_code == 200
    except Exception:
        return False
145
+
146
+ # ---------------- SCRAPERS (Playwright) ----------------
147
+ # Each scraper returns list[dict] with canonical fields (see save_property)
148
+
149
async def scrape_mercadolibre(pw, location: str, max_pages: int = 1, idx_offset=0) -> List[Dict[str, Any]]:
    """Scrape MercadoLibre Inmuebles listing pages (JS-rendered via Playwright).

    Returns a list of canonical property dicts (see save_property). Only the
    listing anchors are harvested here; detail fields are left as None.
    """
    out = []
    base = "https://listado.mercadolibre.com.ar"
    # build query: location with spaces turned into the URL's dash form
    q = location.replace(" ", "-")
    # BUG FIX: chromium.launch() is a coroutine, not an async context manager,
    # so `async with pw.chromium.launch(...)` raised a TypeError before any
    # page was scraped. Await the launch and close explicitly in `finally`.
    browser = await pw.chromium.launch(headless=True, args=["--no-sandbox"])
    try:
        for p in range(1, max_pages + 1):
            # MercadoLibre pagination is offset-based (50 results per page).
            page_path = f"/{q}_Desde_{(p-1)*50+1}"
            url = urljoin(base, page_path)
            proxy = pick_proxy(p - 1)
            context_args = {}
            if proxy:
                context_args["proxy"] = {"server": proxy}
            # Rotate user agents across pages to vary the fingerprint.
            ua = USER_AGENTS[(idx_offset + p) % len(USER_AGENTS)]
            context_args["user_agent"] = ua
            context = await browser.new_context(**context_args)
            page = await context.new_page()
            try:
                await page.goto(url, wait_until="networkidle", timeout=30000)
                # ML often lazy-loads cards; give the page a moment to settle.
                await page.wait_for_timeout(1500)
                html = await page.content()
            except PWTimeout:
                # Network never went idle — use whatever has rendered so far.
                html = await page.content()
            except Exception as e:
                print("ML page error:", e)
                html = ""
            finally:
                try:
                    await page.close()
                    await context.close()
                except Exception:
                    pass
            if not html:
                continue
            soup = BeautifulSoup(html, "html.parser")
            # Harvest every anchor that looks like a property-detail link.
            anchors = soup.select("a[href]")
            found = set()  # per-page dedupe; DB's UNIQUE url dedupes globally
            for a in anchors:
                href = a.get("href")
                if not href:
                    continue
                # heuristic: detail urls contain '/MLA-', '/MLO-' or '/inmuebles/'
                if re.search(r"/MLA-|/MLO-|/inmuebles/", href):
                    full = href if href.startswith("http") else urljoin(base, href)
                    if full in found:
                        continue
                    found.add(full)
                    title = (a.get_text(strip=True) or "Propiedad MercadoLibre")[:300]
                    out.append({
                        "title": title,
                        "price": None,
                        "currency": "ARS",
                        "address": None,
                        "neighbourhood": None,
                        "lat": None,
                        "lon": None,
                        "bedrooms": None,
                        "bathrooms": None,
                        "surface": None,
                        "amenities": [],
                        "source": "MercadoLibre",
                        "url": full,
                        "raw_html": str(a)[:8000]
                    })
    finally:
        await browser.close()
    return out
218
+
219
async def scrape_properati(pw, location: str, max_pages: int = 1, idx_offset=0) -> List[Dict[str, Any]]:
    """Scrape the Properati search page for *location*.

    NOTE(review): `max_pages` is accepted for signature parity with the other
    scrapers but only the first search page is fetched here.
    """
    out = []
    base = "https://www.properati.com.ar"
    url = f"{base}/search?q={location}"
    proxy = pick_proxy(idx_offset)
    ua = USER_AGENTS[idx_offset % len(USER_AGENTS)]
    # BUG FIX: chromium.launch() returns a coroutine and cannot be used with
    # `async with` directly — that raised a TypeError. Await it and close in
    # a finally block instead.
    browser = await pw.chromium.launch(headless=True, args=["--no-sandbox"])
    html = ""
    try:
        context_args = {"user_agent": ua}
        if proxy:
            context_args["proxy"] = {"server": proxy}
        context = await browser.new_context(**context_args)
        page = await context.new_page()
        try:
            await page.goto(url, wait_until="networkidle", timeout=30000)
            await page.wait_for_timeout(1200)
            html = await page.content()
        except Exception as e:
            print("Properati error:", e)
            html = ""
        try:
            await page.close()
            await context.close()
        except Exception:
            pass
    finally:
        await browser.close()
    if not html:
        return out
    soup = BeautifulSoup(html, "html.parser")
    cards = soup.select("a[href]")
    found = set()
    for a in cards:
        href = a.get("href")
        if not href:
            continue
        # Heuristic: Properati detail URLs contain one of these fragments.
        if "/property/" in href or "/inmuebles/" in href or "/propiedad" in href:
            full = href if href.startswith("http") else urljoin(base, href)
            if full in found:
                continue
            found.add(full)
            title = (a.get_text(strip=True) or "Propiedad Properati")[:300]
            out.append({
                "title": title,
                "price": None,
                "currency": "ARS",
                "address": None,
                "neighbourhood": None,
                "lat": None,
                "lon": None,
                "bedrooms": None,
                "bathrooms": None,
                "surface": None,
                "amenities": [],
                "source": "Properati",
                "url": full,
                "raw_html": str(a)[:8000]
            })
    return out
274
+
275
+ # Extendable: add ZonaProp, Inmuebles, ArgenProp, etc.
276
+
277
# ---------------- ORCHESTRATOR ----------------
app = FastAPI(title="SamuelHouseFinder API")
# Create the SQLite schema eagerly at import time so every endpoint can
# assume the `properties` table exists.
init_db()
280
 
281
async def run_all_scrapers(location: str, sources: List[str], max_pages: int = 1, force: bool = False) -> Dict[str, Any]:
    """Run the requested scrapers concurrently, then validate and persist hits.

    NOTE(review): `force` is accepted for API compatibility but has no effect
    yet — inserts already dedupe on the UNIQUE url column.
    Returns {"found": items scraped, "validated": items that passed the URL
    check and were handed to save_property}.
    """
    async with async_playwright() as pw:
        tasks = []
        for idx, s in enumerate(sources):
            name = s.lower()
            if name == "mercadolibre":
                tasks.append(scrape_mercadolibre(pw, location, max_pages=max_pages, idx_offset=idx))
            elif name == "properati":
                tasks.append(scrape_properati(pw, location, max_pages=max_pages, idx_offset=idx))
            # unknown sources are silently skipped
        # Throttle: at most MAX_CONCURRENT_BROWSERS scrapers run at once.
        sem = asyncio.Semaphore(MAX_CONCURRENT_BROWSERS)

        async def sem_task(coro):
            async with sem:
                return await coro

        gathered = await asyncio.gather(*[sem_task(t) for t in tasks], return_exceptions=True)
        # Flatten per-scraper result lists; log (don't raise) scraper failures.
        all_items = []
        for g in gathered:
            if isinstance(g, Exception):
                print("scrape exception:", g)
                continue
            all_items.extend(g)
        # Validate each listing URL before persisting it.
        valid = []
        async with httpx.AsyncClient(follow_redirects=True, timeout=20) as client:
            for it in all_items:
                if await validate_url(client, it["url"]):
                    save_property(it)
                    valid.append(it)
        return {"found": len(all_items), "validated": len(valid)}
318
+
319
+ @app.post("/api/scrape")
320
+ async def api_scrape(req: Request):
321
+ payload = await req.json()
322
+ location = payload.get("location")
323
+ if not location:
324
+ raise HTTPException(status_code=400, detail="location required")
325
+ sources = payload.get("sources", ["mercadolibre","properati"])
326
+ max_pages = int(payload.get("max_pages", DEFAULT_MAX_PAGES))
327
+ force = bool(payload.get("force", False))
328
+ result = await run_all_scrapers(location, sources, max_pages=max_pages, force=force)
329
+ return JSONResponse(result)
330
+
331
+ @app.get("/api/search")
332
+ async def api_search(q: Optional[str]=None, source: Optional[str]=None, limit:int=200):
333
+ data = query_db(q=q, source=source, limit=limit)
334
+ return JSONResponse({"items": data, "count": len(data)})
335
+
336
+ @app.get("/api/health")
337
+ async def health():
338
+ return JSONResponse({"ok":True,"time":datetime.utcnow().isoformat()})
339
+
340
+ # ---------------- GRADIO UI (simple) ----------------
341
def frontend_invoke_scrape(location, sources, max_pages, force_flag):
    """Gradio callback: POST the scrape request to the in-process API.

    Blocks until the scrape run finishes (same-process HTTP round trip).
    """
    import requests
    source_list = [item.strip() for item in sources.split(",") if item.strip()]
    payload = {
        "location": location,
        "sources": source_list,
        "max_pages": int(max_pages),
        "force": bool(force_flag),
    }
    try:
        resp = requests.post("http://127.0.0.1:8000/api/scrape", json=payload, timeout=600)
        resp.raise_for_status()
        return f"Scrape iniciado: {resp.json()}"
    except Exception as e:
        return f"Error al iniciar scrape: {e}"
351
+
352
def frontend_query(q_text, source):
    """Gradio callback: query the in-process search API, return table rows.

    Each row is [title, price, currency, source, url]; on failure a single
    error row is returned so the Dataframe still renders.
    """
    import requests
    try:
        params = {}
        if q_text:
            params["q"] = q_text
        if source:
            params["source"] = source
        resp = requests.get("http://127.0.0.1:8000/api/search", params=params, timeout=60)
        resp.raise_for_status()
        found = resp.json().get("items", [])
        return [
            [entry.get("title"), entry.get("price"), entry.get("currency"),
             entry.get("source"), entry.get("url")]
            for entry in found
        ]
    except Exception as e:
        return [["Error", str(e), "", "", ""]]
368
+
369
def mount_gradio():
    """Build and return the Gradio Blocks UI.

    Left column triggers a scrape via the local API; right column queries
    the SQLite store. Wiring happens inside the Blocks context.
    """
    with gr.Blocks(title="SamuelHouseFinder") as demo:
        gr.Markdown("## SamuelHouseFinder — Zona Norte (Saavedra → La Lucila)\nBackend con Playwright. Usá con cuidado y respetá TOS de portales.")
        with gr.Row():
            with gr.Column():
                location_box = gr.Textbox(label="Ubicación (ej: Martinez, Olivos, Saavedra)", value="Saavedra")
                sources_box = gr.Textbox(label="Fuentes (csv)", value="mercadolibre,properati")
                pages_slider = gr.Slider(label="Páginas por fuente", minimum=1, maximum=5, value=1)
                force_check = gr.Checkbox(label="Forzar re-scrape", value=False)
                scrape_button = gr.Button("Buscar y Scrappear")
                status_box = gr.Textbox(label="Estado")
            with gr.Column():
                query_box = gr.Textbox(label="Buscar en DB (texto libre)", value="")
                query_source_box = gr.Textbox(label="Fuente (opcional)", value="")
                query_button = gr.Button("Consultar DB")
                results_table = gr.Dataframe(headers=["title", "price", "currency", "source", "url"], datatype=["str", "str", "str", "str", "str"])
        scrape_button.click(frontend_invoke_scrape, inputs=[location_box, sources_box, pages_slider, force_check], outputs=[status_box])
        query_button.click(frontend_query, inputs=[query_box, query_source_box], outputs=[results_table])
    return demo
388
 
389
+ # ---------------- RUN ----------------
390
  if __name__ == "__main__":
391
+ # Run FastAPI + Gradio in same process: start FastAPI in background, then Gradio
392
+ import threading, time
393
+ def start_uvicorn():
394
+ uvicorn.run("app:app", host="0.0.0.0", port=8000, log_level="info")
395
+ t = threading.Thread(target=start_uvicorn, daemon=True)
396
+ t.start()
397
+ # wait a moment for server
398
+ time.sleep(1.5)
399
+ demo = mount_gradio()
400
+ demo.launch(server_name="0.0.0.0", server_port=7860)