Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -34,7 +34,6 @@ except ImportError:
|
|
| 34 |
|
| 35 |
# --- FILTRO ANTI-EMOJIS PARA EL PDF ---
|
| 36 |
def sanear_texto(texto):
|
| 37 |
-
"""Elimina emojis y caracteres especiales que rompen el PDF (latin-1)"""
|
| 38 |
if not isinstance(texto, str): return ""
|
| 39 |
return texto.encode('latin-1', 'ignore').decode('latin-1')
|
| 40 |
|
|
@@ -44,20 +43,18 @@ def descargar_imagen(url, idx):
|
|
| 44 |
return None
|
| 45 |
try:
|
| 46 |
headers = {
|
| 47 |
-
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)
|
| 48 |
"Accept": "image/*",
|
| 49 |
"Referer": "https://www.fincaraiz.com.co/"
|
| 50 |
}
|
| 51 |
r = requests.get(url, timeout=8, headers=headers)
|
| 52 |
if r.status_code == 200:
|
| 53 |
img = Image.open(io.BytesIO(r.content))
|
| 54 |
-
if img.mode != 'RGB':
|
| 55 |
-
img = img.convert('RGB')
|
| 56 |
path = f"temp_img_{idx}.jpg"
|
| 57 |
img.save(path, format="JPEG")
|
| 58 |
return path
|
| 59 |
-
except:
|
| 60 |
-
return None
|
| 61 |
return None
|
| 62 |
|
| 63 |
# --- 1. GENERADOR DE URLS ---
|
|
@@ -102,7 +99,7 @@ def extraer_ubicacion(texto):
|
|
| 102 |
return linea[:60]
|
| 103 |
return "Ubicacion en zona solicitada"
|
| 104 |
|
| 105 |
-
# --- 3. MOTOR DE EXTRACCIÓN
|
| 106 |
def motor_tramitia_visual(operacion, barrio, ciudad, area, m2_min, m2_max, tipo, hab, ban, park, antiguedad, ascensor, piscina):
|
| 107 |
resultados = []
|
| 108 |
log_visible = ""
|
|
@@ -121,7 +118,6 @@ def motor_tramitia_visual(operacion, barrio, ciudad, area, m2_min, m2_max, tipo,
|
|
| 121 |
page = context.new_page()
|
| 122 |
log_visible += "🔄 FR: Buscando inmuebles y fotos...\n"
|
| 123 |
page.goto(url_fr, wait_until="domcontentloaded", timeout=60000)
|
| 124 |
-
|
| 125 |
try: page.wait_for_load_state("networkidle", timeout=10000)
|
| 126 |
except: pass
|
| 127 |
|
|
@@ -134,25 +130,33 @@ def motor_tramitia_visual(operacion, barrio, ciudad, area, m2_min, m2_max, tipo,
|
|
| 134 |
for el in elementos:
|
| 135 |
if cont_fr >= 12: break
|
| 136 |
try:
|
|
|
|
|
|
|
|
|
|
| 137 |
txt = el.inner_text()
|
| 138 |
precio = extraer_precio(txt, operacion)
|
| 139 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
if precio > 0:
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
|
|
|
| 144 |
img_url = ""
|
| 145 |
-
img_el =
|
| 146 |
-
if not img_el:
|
| 147 |
-
padre = el.evaluate_handle("el => el.parentElement ? el.parentElement.parentElement : null")
|
| 148 |
-
if padre: img_el = padre.query_selector("img")
|
| 149 |
-
|
| 150 |
if img_el:
|
| 151 |
img_url = img_el.get_attribute("src") or img_el.get_attribute("data-src") or ""
|
| 152 |
if img_url.startswith("/"): img_url = "https://www.fincaraiz.com.co" + img_url
|
| 153 |
|
| 154 |
-
full_url = f"https://www.fincaraiz.com.co{href}" if href.startswith("/") else href
|
| 155 |
-
|
| 156 |
resultados.append({
|
| 157 |
"Portal": "Finca Raiz",
|
| 158 |
"Precio": precio,
|
|
@@ -168,12 +172,11 @@ def motor_tramitia_visual(operacion, barrio, ciudad, area, m2_min, m2_max, tipo,
|
|
| 168 |
log_visible += f"✅ FR: Encontrados {cont_fr} candidatos.\n"
|
| 169 |
except Exception as e: log_visible += f"⚠️ Error buscando en FR.\n"
|
| 170 |
|
| 171 |
-
# --- METROCUADRADO ---
|
| 172 |
try:
|
| 173 |
page = context.new_page()
|
| 174 |
log_visible += "🔄 MC: Buscando inmuebles y fotos...\n"
|
| 175 |
page.goto(url_mc, wait_until="domcontentloaded", timeout=60000)
|
| 176 |
-
|
| 177 |
try: page.wait_for_load_state("networkidle", timeout=10000)
|
| 178 |
except: pass
|
| 179 |
|
|
@@ -181,34 +184,47 @@ def motor_tramitia_visual(operacion, barrio, ciudad, area, m2_min, m2_max, tipo,
|
|
| 181 |
page.mouse.wheel(0, 1000)
|
| 182 |
page.wait_for_timeout(2000)
|
| 183 |
|
| 184 |
-
|
| 185 |
cont_mc = 0
|
| 186 |
-
for
|
| 187 |
if cont_mc >= 12: break
|
| 188 |
try:
|
| 189 |
-
|
|
|
|
|
|
|
|
|
|
| 190 |
precio = extraer_precio(txt, operacion)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
if precio > 0:
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
href = enlace_el.get_attribute("href") if enlace_el else ""
|
| 196 |
img_url = ""
|
|
|
|
| 197 |
if img_el:
|
| 198 |
img_url = img_el.get_attribute("src") or img_el.get_attribute("data-src") or ""
|
| 199 |
if img_url.startswith("/"): img_url = "https://www.metrocuadrado.com" + img_url
|
| 200 |
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
cont_mc += 1
|
| 212 |
except: continue
|
| 213 |
page.close()
|
| 214 |
log_visible += f"✅ MC: Encontrados {cont_mc} candidatos.\n"
|
|
@@ -217,9 +233,9 @@ def motor_tramitia_visual(operacion, barrio, ciudad, area, m2_min, m2_max, tipo,
|
|
| 217 |
browser.close()
|
| 218 |
|
| 219 |
if not resultados:
|
| 220 |
-
return f"{log_visible}\n❌ NO SE ENCONTRARON DATOS.
|
| 221 |
|
| 222 |
-
# --- LIMPIEZA ---
|
| 223 |
df_crudo = pd.DataFrame(resultados).drop_duplicates(subset=['URL'])
|
| 224 |
df_fr = df_crudo[df_crudo['Portal'] == 'Finca Raiz'].head(6)
|
| 225 |
df_mc = df_crudo[df_crudo['Portal'] == 'Metrocuadrado'].head(6)
|
|
@@ -230,7 +246,7 @@ def motor_tramitia_visual(operacion, barrio, ciudad, area, m2_min, m2_max, tipo,
|
|
| 230 |
|
| 231 |
log_visible += f"\n✨ PROCESANDO PDF CON {len(df_final)} INMUEBLES..."
|
| 232 |
|
| 233 |
-
# --- PDF VISUAL
|
| 234 |
pdf_path = f"Reporte_Visual_{int(time.time())}.pdf"
|
| 235 |
pdf = FPDF()
|
| 236 |
pdf.add_page()
|
|
@@ -238,7 +254,6 @@ def motor_tramitia_visual(operacion, barrio, ciudad, area, m2_min, m2_max, tipo,
|
|
| 238 |
pdf.set_font("Arial", 'B', 16)
|
| 239 |
pdf.set_fill_color(40, 53, 147)
|
| 240 |
pdf.set_text_color(255, 255, 255)
|
| 241 |
-
# Saneamos el encabezado por si acaso
|
| 242 |
encabezado = sanear_texto(f" REPORTE INMOBILIARIO: {barrio.upper()} ({operacion.upper()})")
|
| 243 |
pdf.cell(0, 15, encabezado, ln=True, fill=True)
|
| 244 |
pdf.set_text_color(0, 0, 0)
|
|
@@ -265,12 +280,10 @@ def motor_tramitia_visual(operacion, barrio, ciudad, area, m2_min, m2_max, tipo,
|
|
| 265 |
pdf.set_font("Arial", 'B', 9)
|
| 266 |
pdf.set_text_color(100, 100, 100)
|
| 267 |
|
| 268 |
-
# Saneamiento de textos dinámicos
|
| 269 |
ubicacion_limpia = sanear_texto(r['Ubicacion'])
|
| 270 |
portal_limpio = sanear_texto(r['Portal'])
|
| 271 |
desc_limpia = sanear_texto(r['Descripcion'])
|
| 272 |
|
| 273 |
-
# Textos limpios sin emojis hardcodeados
|
| 274 |
pdf.cell(0, 5, f"Ubicacion: {ubicacion_limpia} | Fuente: {portal_limpio}", ln=True)
|
| 275 |
|
| 276 |
pdf.set_x(text_x)
|
|
@@ -281,7 +294,7 @@ def motor_tramitia_visual(operacion, barrio, ciudad, area, m2_min, m2_max, tipo,
|
|
| 281 |
pdf.set_x(text_x)
|
| 282 |
pdf.set_font("Arial", 'U', 8)
|
| 283 |
pdf.set_text_color(0, 102, 204)
|
| 284 |
-
pdf.cell(0, 5, ">> Hacer clic aqui para ver publicacion", link=r['URL'], ln=True)
|
| 285 |
pdf.set_text_color(0, 0, 0)
|
| 286 |
|
| 287 |
y_end = pdf.get_y()
|
|
@@ -316,7 +329,7 @@ def motor_tramitia_visual(operacion, barrio, ciudad, area, m2_min, m2_max, tipo,
|
|
| 316 |
|
| 317 |
# --- INTERFAZ GRÁFICA ---
|
| 318 |
with gr.Blocks() as demo:
|
| 319 |
-
gr.Markdown("## 📸 TramitIA Pro: Generador Visual
|
| 320 |
|
| 321 |
with gr.Row():
|
| 322 |
with gr.Column(scale=1):
|
|
|
|
| 34 |
|
| 35 |
# --- FILTRO ANTI-EMOJIS PARA EL PDF ---
|
| 36 |
def sanear_texto(texto):
    """Return *texto* reduced to characters that Latin-1 can encode.

    Used as the anti-emoji filter for the PDF report: emojis and any other
    character outside Latin-1 are silently dropped.

    Args:
        texto: Value to clean. Anything that is not a ``str`` yields ``""``.

    Returns:
        The cleaned string (possibly empty).
    """
    import unicodedata

    if not isinstance(texto, str):
        return ""
    # Compose "letter + combining accent" pairs first (NFC). Without this, a
    # decomposed accented letter would lose its accent, because the combining
    # mark on its own is not encodable in Latin-1 and would be discarded.
    compuesto = unicodedata.normalize("NFC", texto)
    return compuesto.encode("latin-1", "ignore").decode("latin-1")
|
| 39 |
|
|
|
|
| 43 |
return None
|
| 44 |
try:
|
| 45 |
headers = {
|
| 46 |
+
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
|
| 47 |
"Accept": "image/*",
|
| 48 |
"Referer": "https://www.fincaraiz.com.co/"
|
| 49 |
}
|
| 50 |
r = requests.get(url, timeout=8, headers=headers)
|
| 51 |
if r.status_code == 200:
|
| 52 |
img = Image.open(io.BytesIO(r.content))
|
| 53 |
+
if img.mode != 'RGB': img = img.convert('RGB')
|
|
|
|
| 54 |
path = f"temp_img_{idx}.jpg"
|
| 55 |
img.save(path, format="JPEG")
|
| 56 |
return path
|
| 57 |
+
except: return None
|
|
|
|
| 58 |
return None
|
| 59 |
|
| 60 |
# --- 1. GENERADOR DE URLS ---
|
|
|
|
| 99 |
return linea[:60]
|
| 100 |
return "Ubicacion en zona solicitada"
|
| 101 |
|
| 102 |
+
# --- 3. MOTOR DE EXTRACCIÓN (NUEVO RADAR 360) ---
|
| 103 |
def motor_tramitia_visual(operacion, barrio, ciudad, area, m2_min, m2_max, tipo, hab, ban, park, antiguedad, ascensor, piscina):
|
| 104 |
resultados = []
|
| 105 |
log_visible = ""
|
|
|
|
| 118 |
page = context.new_page()
|
| 119 |
log_visible += "🔄 FR: Buscando inmuebles y fotos...\n"
|
| 120 |
page.goto(url_fr, wait_until="domcontentloaded", timeout=60000)
|
|
|
|
| 121 |
try: page.wait_for_load_state("networkidle", timeout=10000)
|
| 122 |
except: pass
|
| 123 |
|
|
|
|
| 130 |
for el in elementos:
|
| 131 |
if cont_fr >= 12: break
|
| 132 |
try:
|
| 133 |
+
href = el.get_attribute("href")
|
| 134 |
+
if not href or len(href) < 15 or "javascript" in href: continue
|
| 135 |
+
|
| 136 |
txt = el.inner_text()
|
| 137 |
precio = extraer_precio(txt, operacion)
|
| 138 |
|
| 139 |
+
# RADAR: Si el link no tiene precio, subimos 4 niveles en el HTML buscando el contenedor
|
| 140 |
+
padre = el
|
| 141 |
+
niveles = 0
|
| 142 |
+
while precio == 0 and niveles < 4:
|
| 143 |
+
padre = padre.evaluate_handle("el => el.parentElement")
|
| 144 |
+
if not padre: break
|
| 145 |
+
txt = padre.inner_text()
|
| 146 |
+
precio = extraer_precio(txt, operacion)
|
| 147 |
+
niveles += 1
|
| 148 |
+
|
| 149 |
if precio > 0:
|
| 150 |
+
full_url = f"https://www.fincaraiz.com.co{href}" if href.startswith("/") else href
|
| 151 |
+
# Evita duplicados en tiempo real
|
| 152 |
+
if any(r['URL'] == full_url for r in resultados): continue
|
| 153 |
+
|
| 154 |
img_url = ""
|
| 155 |
+
img_el = padre.query_selector("img")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
if img_el:
|
| 157 |
img_url = img_el.get_attribute("src") or img_el.get_attribute("data-src") or ""
|
| 158 |
if img_url.startswith("/"): img_url = "https://www.fincaraiz.com.co" + img_url
|
| 159 |
|
|
|
|
|
|
|
| 160 |
resultados.append({
|
| 161 |
"Portal": "Finca Raiz",
|
| 162 |
"Precio": precio,
|
|
|
|
| 172 |
log_visible += f"✅ FR: Encontrados {cont_fr} candidatos.\n"
|
| 173 |
except Exception as e: log_visible += f"⚠️ Error buscando en FR.\n"
|
| 174 |
|
| 175 |
+
# --- METROCUADRADO (CON RADAR) ---
|
| 176 |
try:
|
| 177 |
page = context.new_page()
|
| 178 |
log_visible += "🔄 MC: Buscando inmuebles y fotos...\n"
|
| 179 |
page.goto(url_mc, wait_until="domcontentloaded", timeout=60000)
|
|
|
|
| 180 |
try: page.wait_for_load_state("networkidle", timeout=10000)
|
| 181 |
except: pass
|
| 182 |
|
|
|
|
| 184 |
page.mouse.wheel(0, 1000)
|
| 185 |
page.wait_for_timeout(2000)
|
| 186 |
|
| 187 |
+
elementos = page.query_selector_all("a")
|
| 188 |
cont_mc = 0
|
| 189 |
+
for el in elementos:
|
| 190 |
if cont_mc >= 12: break
|
| 191 |
try:
|
| 192 |
+
href = el.get_attribute("href")
|
| 193 |
+
if not href or len(href) < 15 or "javascript" in href: continue
|
| 194 |
+
|
| 195 |
+
txt = el.inner_text()
|
| 196 |
precio = extraer_precio(txt, operacion)
|
| 197 |
+
|
| 198 |
+
# EL MISMO RADAR QUE SALVA A METROCUADRADO
|
| 199 |
+
padre = el
|
| 200 |
+
niveles = 0
|
| 201 |
+
while precio == 0 and niveles < 4:
|
| 202 |
+
padre = padre.evaluate_handle("el => el.parentElement")
|
| 203 |
+
if not padre: break
|
| 204 |
+
txt = padre.inner_text()
|
| 205 |
+
precio = extraer_precio(txt, operacion)
|
| 206 |
+
niveles += 1
|
| 207 |
+
|
| 208 |
if precio > 0:
|
| 209 |
+
full_url = f"https://www.metrocuadrado.com{href}" if href.startswith("/") else href
|
| 210 |
+
if any(r['URL'] == full_url for r in resultados): continue
|
| 211 |
+
|
|
|
|
| 212 |
img_url = ""
|
| 213 |
+
img_el = padre.query_selector("img")
|
| 214 |
if img_el:
|
| 215 |
img_url = img_el.get_attribute("src") or img_el.get_attribute("data-src") or ""
|
| 216 |
if img_url.startswith("/"): img_url = "https://www.metrocuadrado.com" + img_url
|
| 217 |
|
| 218 |
+
resultados.append({
|
| 219 |
+
"Portal": "Metrocuadrado",
|
| 220 |
+
"Precio": precio,
|
| 221 |
+
"Precio_M2": precio / area,
|
| 222 |
+
"Ubicacion": extraer_ubicacion(txt),
|
| 223 |
+
"Descripcion": txt.replace('\n', ' | ')[:120] + "...",
|
| 224 |
+
"URL": full_url,
|
| 225 |
+
"Imagen": img_url
|
| 226 |
+
})
|
| 227 |
+
cont_mc += 1
|
|
|
|
| 228 |
except: continue
|
| 229 |
page.close()
|
| 230 |
log_visible += f"✅ MC: Encontrados {cont_mc} candidatos.\n"
|
|
|
|
| 233 |
browser.close()
|
| 234 |
|
| 235 |
if not resultados:
|
| 236 |
+
return f"{log_visible}\n❌ NO SE ENCONTRARON DATOS.", pd.DataFrame(), None, "---"
|
| 237 |
|
| 238 |
+
# --- LIMPIEZA FINAL ---
|
| 239 |
df_crudo = pd.DataFrame(resultados).drop_duplicates(subset=['URL'])
|
| 240 |
df_fr = df_crudo[df_crudo['Portal'] == 'Finca Raiz'].head(6)
|
| 241 |
df_mc = df_crudo[df_crudo['Portal'] == 'Metrocuadrado'].head(6)
|
|
|
|
| 246 |
|
| 247 |
log_visible += f"\n✨ PROCESANDO PDF CON {len(df_final)} INMUEBLES..."
|
| 248 |
|
| 249 |
+
# --- PDF VISUAL Y SANEADO ---
|
| 250 |
pdf_path = f"Reporte_Visual_{int(time.time())}.pdf"
|
| 251 |
pdf = FPDF()
|
| 252 |
pdf.add_page()
|
|
|
|
| 254 |
pdf.set_font("Arial", 'B', 16)
|
| 255 |
pdf.set_fill_color(40, 53, 147)
|
| 256 |
pdf.set_text_color(255, 255, 255)
|
|
|
|
| 257 |
encabezado = sanear_texto(f" REPORTE INMOBILIARIO: {barrio.upper()} ({operacion.upper()})")
|
| 258 |
pdf.cell(0, 15, encabezado, ln=True, fill=True)
|
| 259 |
pdf.set_text_color(0, 0, 0)
|
|
|
|
| 280 |
pdf.set_font("Arial", 'B', 9)
|
| 281 |
pdf.set_text_color(100, 100, 100)
|
| 282 |
|
|
|
|
| 283 |
ubicacion_limpia = sanear_texto(r['Ubicacion'])
|
| 284 |
portal_limpio = sanear_texto(r['Portal'])
|
| 285 |
desc_limpia = sanear_texto(r['Descripcion'])
|
| 286 |
|
|
|
|
| 287 |
pdf.cell(0, 5, f"Ubicacion: {ubicacion_limpia} | Fuente: {portal_limpio}", ln=True)
|
| 288 |
|
| 289 |
pdf.set_x(text_x)
|
|
|
|
| 294 |
pdf.set_x(text_x)
|
| 295 |
pdf.set_font("Arial", 'U', 8)
|
| 296 |
pdf.set_text_color(0, 102, 204)
|
| 297 |
+
pdf.cell(0, 5, ">> Hacer clic aqui para ver publicacion original", link=r['URL'], ln=True)
|
| 298 |
pdf.set_text_color(0, 0, 0)
|
| 299 |
|
| 300 |
y_end = pdf.get_y()
|
|
|
|
| 329 |
|
| 330 |
# --- INTERFAZ GRÁFICA ---
|
| 331 |
with gr.Blocks() as demo:
|
| 332 |
+
gr.Markdown("## 📸 TramitIA Pro: Generador Visual (Radar 360)")
|
| 333 |
|
| 334 |
with gr.Row():
|
| 335 |
with gr.Column(scale=1):
|