Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -9,7 +9,7 @@ from playwright.sync_api import sync_playwright
|
|
| 9 |
import time
|
| 10 |
import random
|
| 11 |
|
| 12 |
-
# --- INSTALACIÓN ---
|
| 13 |
try:
|
| 14 |
subprocess.run(["playwright", "install", "chromium"], check=True)
|
| 15 |
except: pass
|
|
@@ -20,7 +20,7 @@ except ImportError:
|
|
| 20 |
subprocess.run(["pip", "install", "fake-useragent"], check=True)
|
| 21 |
from fake_useragent import UserAgent
|
| 22 |
|
| 23 |
-
# --- 1. GENERADOR DE URLS (CORREGIDO) ---
|
| 24 |
def construir_urls_final(zona, ciudad, tipo, hab, ban, park, antiguedad):
|
| 25 |
mapa_ant = {
|
| 26 |
"Menos de 1 año": "de-0-a-1-anos",
|
|
@@ -32,19 +32,21 @@ def construir_urls_final(zona, ciudad, tipo, hab, ban, park, antiguedad):
|
|
| 32 |
slug_ant = mapa_ant.get(antiguedad, "de-1-a-8-anios")
|
| 33 |
slug_park = f"{int(park)}-parqueadero" if int(park) == 1 else f"{int(park)}-parqueaderos"
|
| 34 |
|
|
|
|
| 35 |
z_slug = zona.lower().strip().replace(" ", "-")
|
| 36 |
c_slug = ciudad.lower().strip().replace(" ", "-")
|
| 37 |
|
| 38 |
-
#
|
| 39 |
-
# Ejemplo: /arriendo/usaquen/bogota/3-o-mas-habitaciones...
|
| 40 |
url_fr = f"https://www.fincaraiz.com.co/arriendo/{z_slug}/{c_slug}/{int(hab)}-o-mas-habitaciones/{int(ban)}-o-mas-banos/{slug_park}/{slug_ant}"
|
| 41 |
|
|
|
|
| 42 |
url_mc = f"https://www.metrocuadrado.com/{tipo.lower()}-casa-oficina/arriendo/{c_slug}/{int(ban)}-banos-{int(hab)}-habitaciones/?search=form"
|
| 43 |
|
| 44 |
return url_fr, url_mc
|
| 45 |
|
| 46 |
-
# --- 2. EXTRACTOR
|
| 47 |
def extraer_precio_regex(texto):
|
|
|
|
| 48 |
patron = r'\$\s?(\d{1,3}(?:[.,]\d{3})*)'
|
| 49 |
coincidencias = re.findall(patron, texto)
|
| 50 |
if coincidencias:
|
|
@@ -52,156 +54,165 @@ def extraer_precio_regex(texto):
|
|
| 52 |
return max(precios)
|
| 53 |
return 0
|
| 54 |
|
| 55 |
-
# --- 3. MOTOR DE EXTRACCIÓN
|
| 56 |
-
def
|
| 57 |
resultados = []
|
| 58 |
url_fr, url_mc = construir_urls_final(zona, ciudad, tipo, hab, ban, park, antiguedad)
|
| 59 |
-
|
| 60 |
-
|
| 61 |
ua = UserAgent()
|
| 62 |
|
| 63 |
with sync_playwright() as p:
|
|
|
|
| 64 |
browser = p.chromium.launch(
|
| 65 |
headless=True,
|
| 66 |
args=[
|
| 67 |
'--disable-blink-features=AutomationControlled',
|
| 68 |
'--no-sandbox',
|
| 69 |
-
'--disable-infobars'
|
|
|
|
|
|
|
| 70 |
]
|
| 71 |
)
|
| 72 |
-
context = browser.new_context(
|
| 73 |
-
user_agent=ua.random,
|
| 74 |
-
viewport={'width': 1366, 'height': 768},
|
| 75 |
-
locale='es-CO'
|
| 76 |
-
)
|
| 77 |
-
context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined});")
|
| 78 |
|
| 79 |
# --- FINCA RAÍZ ---
|
| 80 |
try:
|
| 81 |
page = context.new_page()
|
| 82 |
-
log_visible += "🔄
|
| 83 |
page.goto(url_fr, wait_until="domcontentloaded", timeout=60000)
|
| 84 |
|
| 85 |
-
|
|
|
|
| 86 |
for _ in range(4):
|
| 87 |
-
page.mouse.wheel(0,
|
| 88 |
-
time.sleep(1.5)
|
| 89 |
|
| 90 |
-
# Buscamos enlaces (
|
| 91 |
elementos = page.query_selector_all("a")
|
| 92 |
cont_fr = 0
|
|
|
|
|
|
|
| 93 |
for el in elementos:
|
| 94 |
-
if cont_fr >= 15: break
|
| 95 |
txt = el.inner_text()
|
| 96 |
|
| 97 |
if "$" in txt:
|
| 98 |
precio = extraer_precio_regex(txt)
|
| 99 |
-
if precio >
|
| 100 |
href = el.get_attribute("href")
|
| 101 |
-
full_url = f"https://www.fincaraiz.com.co{href}" if href.startswith("/") else href
|
| 102 |
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
|
|
|
|
|
|
| 111 |
page.close()
|
| 112 |
-
log_visible += f"✅ FR: {cont_fr} datos brutos extraídos.\n"
|
| 113 |
except Exception as e: log_visible += f"⚠️ Error FR: {e}\n"
|
| 114 |
|
| 115 |
# --- METROCUADRADO ---
|
| 116 |
try:
|
| 117 |
page = context.new_page()
|
| 118 |
-
log_visible += "🔄
|
| 119 |
page.goto(url_mc, wait_until="domcontentloaded", timeout=60000)
|
| 120 |
|
| 121 |
-
for _ in range(
|
| 122 |
page.mouse.wheel(0, 1000)
|
| 123 |
time.sleep(1.5)
|
| 124 |
|
| 125 |
cards = page.query_selector_all("li, div[class*='card']")
|
| 126 |
cont_mc = 0
|
|
|
|
| 127 |
for card in cards:
|
| 128 |
-
if cont_mc >= 15: break
|
| 129 |
txt = card.inner_text()
|
| 130 |
|
| 131 |
if "$" in txt:
|
| 132 |
precio = extraer_precio_regex(txt)
|
| 133 |
-
if precio >
|
| 134 |
enlace = card.query_selector("a")
|
| 135 |
if enlace:
|
| 136 |
href = enlace.get_attribute("href")
|
| 137 |
-
full_url = f"https://www.metrocuadrado.com{href}" if href.startswith("/") else href
|
| 138 |
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
|
|
|
|
|
|
| 147 |
page.close()
|
| 148 |
-
log_visible += f"✅ MC: {cont_mc} datos brutos extraídos.\n"
|
| 149 |
except Exception as e: log_visible += f"⚠️ Error MC: {e}\n"
|
| 150 |
|
| 151 |
browser.close()
|
| 152 |
|
| 153 |
if not resultados:
|
| 154 |
-
return f"{log_visible}\n❌ NO SE ENCONTRARON DATOS.", None, None, "---"
|
| 155 |
|
| 156 |
-
# --- LIMPIEZA DE
|
| 157 |
-
# 1. Convertimos a DataFrame
|
| 158 |
df_crudo = pd.DataFrame(resultados)
|
| 159 |
-
|
|
|
|
| 160 |
df_limpio = df_crudo.drop_duplicates(subset=['URL'])
|
| 161 |
|
| 162 |
-
#
|
| 163 |
-
|
| 164 |
-
|
| 165 |
|
| 166 |
-
#
|
| 167 |
-
df_final = pd.concat([
|
| 168 |
|
| 169 |
-
log_visible += f"\n
|
| 170 |
|
| 171 |
# --- PDF ---
|
| 172 |
pdf_path = f"Reporte_{int(time.time())}.pdf"
|
| 173 |
pdf = FPDF()
|
| 174 |
pdf.add_page()
|
| 175 |
pdf.set_font("Arial", 'B', 14)
|
| 176 |
-
pdf.cell(0, 10, f"ESTUDIO {zona.upper()}", ln=True)
|
| 177 |
pdf.ln(5)
|
|
|
|
| 178 |
for _, r in df_final.iterrows():
|
|
|
|
| 179 |
pdf.set_font("Arial", 'B', 10)
|
| 180 |
-
pdf.cell(0, 8, f"
|
| 181 |
-
pdf.set_font("Arial", '',
|
| 182 |
-
pdf.multi_cell(0,
|
| 183 |
pdf.set_font("Arial", 'U', 8); pdf.set_text_color(0,0,255)
|
| 184 |
-
pdf.cell(0, 6, "Ver Publicacion", link=r['URL'], ln=True)
|
| 185 |
pdf.set_text_color(0,0,0); pdf.ln(3)
|
| 186 |
pdf.output(pdf_path)
|
| 187 |
|
| 188 |
# --- CÁLCULOS ---
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
|
|
|
|
|
|
|
|
|
| 199 |
|
| 200 |
-
return f"{log_visible}\n✅
|
| 201 |
|
| 202 |
-
# --- INTERFAZ ---
|
| 203 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 204 |
-
gr.Markdown("## 🤖 TramitIA Pro:
|
| 205 |
|
| 206 |
with gr.Row():
|
| 207 |
with gr.Column(scale=1):
|
|
@@ -219,15 +230,15 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 219 |
["Menos de 1 año", "1 a 8 años", "9 a 15 años", "16 a 30 años", "Más de 30 años"],
|
| 220 |
label="Antigüedad", value="1 a 8 años"
|
| 221 |
)
|
| 222 |
-
btn = gr.Button("EJECUTAR
|
| 223 |
|
| 224 |
with gr.Column(scale=2):
|
| 225 |
-
res_fin = gr.Markdown("### 💰
|
| 226 |
with gr.Tabs():
|
| 227 |
-
with gr.TabItem("Auditoría"): msg = gr.Textbox(lines=
|
| 228 |
-
with gr.TabItem("
|
| 229 |
with gr.TabItem("Descargar PDF"): out_pdf = gr.File()
|
| 230 |
|
| 231 |
-
btn.click(
|
| 232 |
|
| 233 |
demo.launch()
|
|
|
|
| 9 |
import time
|
| 10 |
import random
|
| 11 |
|
| 12 |
+
# --- INSTALACIÓN DE DEPENDENCIAS ---
|
| 13 |
# Best-effort download of the Chromium build Playwright drives. The app is
# still allowed to start when this fails (the browser may already be present),
# but the reason is reported instead of being silently discarded.
try:
    subprocess.run(["playwright", "install", "chromium"], check=True)
except (subprocess.CalledProcessError, OSError) as exc:
    # CalledProcessError: the installer exited with a non-zero status.
    # OSError: the `playwright` executable could not be found or launched.
    print(f"[setup] 'playwright install chromium' failed: {exc}")
|
|
|
|
| 20 |
subprocess.run(["pip", "install", "fake-useragent"], check=True)
|
| 21 |
from fake_useragent import UserAgent
|
| 22 |
|
| 23 |
+
# --- 1. GENERADOR DE URLS (CORREGIDO CON RUTA COMPLETA) ---
|
| 24 |
def construir_urls_final(zona, ciudad, tipo, hab, ban, park, antiguedad):
|
| 25 |
mapa_ant = {
|
| 26 |
"Menos de 1 año": "de-0-a-1-anos",
|
|
|
|
| 32 |
slug_ant = mapa_ant.get(antiguedad, "de-1-a-8-anios")
|
| 33 |
slug_park = f"{int(park)}-parqueadero" if int(park) == 1 else f"{int(park)}-parqueaderos"
|
| 34 |
|
| 35 |
+
# Limpieza estricta de slugs
|
| 36 |
z_slug = zona.lower().strip().replace(" ", "-")
|
| 37 |
c_slug = ciudad.lower().strip().replace(" ", "-")
|
| 38 |
|
| 39 |
+
# URL FINCA RAÍZ: /arriendo/barrio/ciudad/filtros
|
|
|
|
| 40 |
url_fr = f"https://www.fincaraiz.com.co/arriendo/{z_slug}/{c_slug}/{int(hab)}-o-mas-habitaciones/{int(ban)}-o-mas-banos/{slug_park}/{slug_ant}"
|
| 41 |
|
| 42 |
+
# URL METROCUADRADO
|
| 43 |
url_mc = f"https://www.metrocuadrado.com/{tipo.lower()}-casa-oficina/arriendo/{c_slug}/{int(ban)}-banos-{int(hab)}-habitaciones/?search=form"
|
| 44 |
|
| 45 |
return url_fr, url_mc
|
| 46 |
|
| 47 |
+
# --- 2. EXTRACTOR DE PRECIO (REGEX) ---
|
| 48 |
def extraer_precio_regex(texto):
|
| 49 |
+
# Detecta $ 1.500.000 o $1500000
|
| 50 |
patron = r'\$\s?(\d{1,3}(?:[.,]\d{3})*)'
|
| 51 |
coincidencias = re.findall(patron, texto)
|
| 52 |
if coincidencias:
|
|
|
|
| 54 |
return max(precios)
|
| 55 |
return 0
|
| 56 |
|
| 57 |
+
# --- 3. MOTOR DE EXTRACCIÓN ROBUSTO ---
|
| 58 |
+
# Listings priced at or below this value are treated as scraping noise.
_PRECIO_MINIMO = 600000
# Raw candidates collected per portal; leaves margin for later de-duplication.
_MAX_CANDIDATOS = 15
# Listings per portal that make it into the final report.
_TOP_POR_PORTAL = 6


def _latin1_seguro(texto):
    """Return *texto* with every character outside Latin-1 replaced by ``?``.

    FPDF's built-in "Arial" font can only encode Latin-1, so scraped text
    (emoji, bullets, typographic dashes) must be sanitised before writing it.
    """
    return str(texto).encode("latin-1", "replace").decode("latin-1")


def _url_absoluta(base_url, href):
    """Prefix site-relative *href* values with *base_url*; pass others through."""
    if href and href.startswith("/"):
        return f"{base_url}{href}"
    return href


def _extraer_anuncios(page, *, selector, enlace_anidado, portal, base_url,
                      area, destino):
    """Append priced listings found on *page* to *destino*; return the count.

    Every node matching *selector* whose text contains a ``$`` price above
    ``_PRECIO_MINIMO`` becomes one record. When *enlace_anidado* is true the
    listing URL is read from the first ``<a>`` inside the node, otherwise from
    the node itself.
    """
    encontrados = 0
    for nodo in page.query_selector_all(selector):
        if encontrados >= _MAX_CANDIDATOS:
            break
        txt = nodo.inner_text()
        if "$" not in txt:
            continue
        precio = extraer_precio_regex(txt)
        if precio <= _PRECIO_MINIMO:
            continue
        enlace = nodo.query_selector("a") if enlace_anidado else nodo
        if not enlace:
            continue
        full_url = _url_absoluta(base_url, enlace.get_attribute("href"))
        if not full_url:
            continue
        destino.append({
            "Portal": portal,
            "Precio": precio,
            "Precio_M2": precio / area,
            "Descripcion": txt.replace('\n', ' ')[:90] + "...",
            "URL": full_url,
        })
        encontrados += 1
    return encontrados


def _escanear_portal(context, url, *, scrolls, mover_raton, **opciones):
    """Open *url* in a new page, scroll it, and harvest its listings.

    The page is always closed, even when navigation or extraction raises, so
    a failing portal does not leak a browser tab. Remaining keyword arguments
    are forwarded to ``_extraer_anuncios``. Returns the number of listings
    appended.
    """
    page = context.new_page()
    try:
        page.goto(url, wait_until="domcontentloaded", timeout=60000)
        if mover_raton:
            # Random pointer movement so the session looks less automated.
            page.mouse.move(random.randint(100, 500), random.randint(100, 500))
        for _ in range(scrolls):
            page.mouse.wheel(0, 1000)
            # Give lazily loaded cards time to render; wait_for_timeout is
            # the sync-API-safe replacement for time.sleep.
            page.wait_for_timeout(1500)
        return _extraer_anuncios(page, **opciones)
    finally:
        page.close()


def motor_tramitia_final(zona, ciudad, area, tipo, hab, ban, park, antiguedad):
    """Scrape rental listings from both portals and build the market report.

    Args:
        zona: Neighbourhood name; used in the search URL and the PDF title.
        ciudad: City name used in the search URLs.
        area: Property area used to derive ``Precio_M2`` (presumably square
            metres -- confirm against the UI). Must be a number > 0.
        tipo, hab, ban, park, antiguedad: Search filters, forwarded unchanged
            to ``construir_urls_final``.

    Returns:
        A 4-tuple ``(log, dataframe, pdf_path, summary)``. ``dataframe`` and
        ``pdf_path`` are ``None`` and ``summary`` is ``"---"`` when the area
        is invalid or no listing was found.
    """
    # Validate the divisor up front. Previously a zero/empty area raised
    # inside the scraping loop and was misreported as a portal error.
    try:
        area = float(area)
    except (TypeError, ValueError):
        area = 0.0
    if not area > 0:  # also rejects NaN
        return "❌ EL ÁREA DEBE SER UN NÚMERO MAYOR QUE CERO.", None, None, "---"

    resultados = []
    url_fr, url_mc = construir_urls_final(zona, ciudad, tipo, hab, ban, park, antiguedad)

    log_visible = f"✅ URLs GENERADAS (AUDITORÍA):\nFR: {url_fr}\nMC: {url_mc}\n\n"
    ua = UserAgent()

    portales = (
        {
            "etiqueta": "Finca Raíz", "sigla": "FR", "url": url_fr,
            "opciones": {
                "portal": "Finca Raiz",
                "base_url": "https://www.fincaraiz.com.co",
                "selector": "a",
                "enlace_anidado": False,
                "scrolls": 4,
                "mover_raton": True,
            },
        },
        {
            "etiqueta": "Metrocuadrado", "sigla": "MC", "url": url_mc,
            "opciones": {
                "portal": "Metrocuadrado",
                "base_url": "https://www.metrocuadrado.com",
                "selector": "li, div[class*='card']",
                "enlace_anidado": True,
                "scrolls": 5,  # this portal needs a little more scrolling
                "mover_raton": False,
            },
        },
    )

    with sync_playwright() as p:
        # Launch the browser with an evasion-oriented profile.
        browser = p.chromium.launch(
            headless=True,
            args=[
                '--disable-blink-features=AutomationControlled',
                '--no-sandbox',
                '--disable-infobars',
                '--window-position=0,0',
                f'--user-agent={ua.random}',  # random identity
            ]
        )
        try:
            context = browser.new_context(viewport={'width': 1366, 'height': 768})
            for cfg in portales:
                log_visible += f"🔄 Escaneando {cfg['etiqueta']}...\n"
                try:
                    total = _escanear_portal(
                        context, cfg["url"],
                        area=area, destino=resultados, **cfg["opciones"],
                    )
                    log_visible += f"✅ {cfg['sigla']}: Encontrados {total} candidatos brutos.\n"
                except Exception as e:
                    # One portal failing must not abort the other one.
                    log_visible += f"⚠️ Error {cfg['sigla']}: {e}\n"
        finally:
            browser.close()

    if not resultados:
        return f"{log_visible}\n❌ NO SE ENCONTRARON DATOS VÁLIDOS.", None, None, "---"

    # --- Clean-up and selection of the best listings per portal ---
    df_crudo = pd.DataFrame(resultados)
    # 1. Drop rows that point at the same listing URL.
    df_limpio = df_crudo.drop_duplicates(subset=['URL'])
    # 2. Keep the first listings of each portal.
    df_fr = df_limpio[df_limpio['Portal'] == 'Finca Raiz'].head(_TOP_POR_PORTAL)
    df_mc = df_limpio[df_limpio['Portal'] == 'Metrocuadrado'].head(_TOP_POR_PORTAL)
    # 3. Combine both selections.
    df_final = pd.concat([df_fr, df_mc]).reset_index(drop=True)

    log_visible += f"\n✨ PROCESADO FINAL: {len(df_final)} inmuebles únicos seleccionados para el reporte."

    # --- PDF ---
    pdf_path = f"Reporte_{int(time.time())}.pdf"
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", 'B', 14)
    pdf.cell(0, 10, _latin1_seguro(f"ESTUDIO DE MERCADO: {zona.upper()}"), ln=True)
    pdf.ln(5)

    for _, r in df_final.iterrows():
        pdf.set_fill_color(240, 240, 240)
        pdf.set_font("Arial", 'B', 10)
        pdf.cell(0, 8, f"{r['Portal']} - ${r['Precio']:,.0f}", ln=True, fill=True)
        pdf.set_font("Arial", '', 9)
        # Scraped text may hold characters the core font cannot encode.
        pdf.multi_cell(0, 5, _latin1_seguro(r['Descripcion']))
        pdf.set_font("Arial", 'U', 8)
        pdf.set_text_color(0, 0, 255)
        pdf.cell(0, 6, "Ver Publicacion Original", link=r['URL'], ln=True)
        pdf.set_text_color(0, 0, 0)
        pdf.ln(3)
    pdf.output(pdf_path)

    # --- Calculations ---
    # df_final cannot be empty here: resultados was non-empty and every row
    # belongs to one of the two portals selected above.
    promedio = df_final['Precio_M2'].mean() * area
    minimo = df_final['Precio'].min()
    maximo = df_final['Precio'].max()

    resumen = (
        f"💰 **ESTIMACIÓN DE RENTA**\n"
        f"🔹 **Canon Sugerido:** ${promedio:,.0f}\n"
        f"📉 Mínimo Zona: ${minimo:,.0f}\n"
        f"📈 Máximo Zona: ${maximo:,.0f}"
    )

    return f"{log_visible}\n✅ Tarea Completada.", df_final, pdf_path, resumen
|
| 212 |
|
| 213 |
+
# --- INTERFAZ GRÁFICA ---
|
| 214 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 215 |
+
gr.Markdown("## 🤖 TramitIA Pro: Analizador Inmobiliario (vFinal)")
|
| 216 |
|
| 217 |
with gr.Row():
|
| 218 |
with gr.Column(scale=1):
|
|
|
|
| 230 |
["Menos de 1 año", "1 a 8 años", "9 a 15 años", "16 a 30 años", "Más de 30 años"],
|
| 231 |
label="Antigüedad", value="1 a 8 años"
|
| 232 |
)
|
| 233 |
+
btn = gr.Button("EJECUTAR ANÁLISIS", variant="primary")
|
| 234 |
|
| 235 |
with gr.Column(scale=2):
|
| 236 |
+
res_fin = gr.Markdown("### 💰 El resultado aparecerá aquí...")
|
| 237 |
with gr.Tabs():
|
| 238 |
+
with gr.TabItem("Auditoría"): msg = gr.Textbox(lines=10, label="Log del Sistema")
|
| 239 |
+
with gr.TabItem("Tabla de Resultados"): out_df = gr.Dataframe()
|
| 240 |
with gr.TabItem("Descargar PDF"): out_pdf = gr.File()
|
| 241 |
|
| 242 |
+
btn.click(motor_tramitia_final, [z, c, a, t, h, b, p, e], [msg, out_df, out_pdf, res_fin])
|
| 243 |
|
| 244 |
demo.launch()
|