estamidaosderentav3

Running

App Files Files Community

jcalbornoz committed 14 days ago

Commit

2663862

verified ·

1 Parent(s): f61ba3c

Update app.py

Browse files

Files changed (1) hide show

app.py +91 -80

app.py CHANGED Viewed

@@ -9,7 +9,7 @@ from playwright.sync_api import sync_playwright
 import time
 import random
-# --- INSTALACIÓN ---
 try:
     subprocess.run(["playwright", "install", "chromium"], check=True)
 except: pass
@@ -20,7 +20,7 @@ except ImportError:
     subprocess.run(["pip", "install", "fake-useragent"], check=True)
     from fake_useragent import UserAgent
-# --- 1. GENERADOR DE URLS (CORREGIDO) ---
 def construir_urls_final(zona, ciudad, tipo, hab, ban, park, antiguedad):
     mapa_ant = {
         "Menos de 1 año": "de-0-a-1-anos",
@@ -32,19 +32,21 @@ def construir_urls_final(zona, ciudad, tipo, hab, ban, park, antiguedad):
     slug_ant = mapa_ant.get(antiguedad, "de-1-a-8-anios")
     slug_park = f"{int(park)}-parqueadero" if int(park) == 1 else f"{int(park)}-parqueaderos"
     z_slug = zona.lower().strip().replace(" ", "-")
     c_slug = ciudad.lower().strip().replace(" ", "-")
-    # FR CORREGIDO: Se reincorpora la zona y ciudad en el path principal
-    # Ejemplo: /arriendo/usaquen/bogota/3-o-mas-habitaciones...
     url_fr = f"https://www.fincaraiz.com.co/arriendo/{z_slug}/{c_slug}/{int(hab)}-o-mas-habitaciones/{int(ban)}-o-mas-banos/{slug_park}/{slug_ant}"
     url_mc = f"https://www.metrocuadrado.com/{tipo.lower()}-casa-oficina/arriendo/{c_slug}/{int(ban)}-banos-{int(hab)}-habitaciones/?search=form"
     return url_fr, url_mc
-# --- 2. EXTRACTOR GENÉRICO (Regex) ---
 def extraer_precio_regex(texto):
     patron = r'\$\s?(\d{1,3}(?:[.,]\d{3})*)'
     coincidencias = re.findall(patron, texto)
     if coincidencias:
@@ -52,156 +54,165 @@ def extraer_precio_regex(texto):
         return max(precios)
     return 0
-# --- 3. MOTOR DE EXTRACCIÓN CAMUFLADO ---
-def motor_tramitia_camuflado(zona, ciudad, area, tipo, hab, ban, park, antiguedad):
     resultados = []
     url_fr, url_mc = construir_urls_final(zona, ciudad, tipo, hab, ban, park, antiguedad)
-    log_visible = f"✅ URLs INICIADAS:\nFR: {url_fr}\nMC: {url_mc}\n\n"
     ua = UserAgent()
     with sync_playwright() as p:
         browser = p.chromium.launch(
             headless=True,
             args=[
                 '--disable-blink-features=AutomationControlled',
                 '--no-sandbox',
-                '--disable-infobars'
             ]
         )
-        context = browser.new_context(
-            user_agent=ua.random,
-            viewport={'width': 1366, 'height': 768},
-            locale='es-CO'
-        )
-        context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined});")
         # --- FINCA RAÍZ ---
         try:
             page = context.new_page()
-            log_visible += "🔄 Conectando a FR...\n"
             page.goto(url_fr, wait_until="domcontentloaded", timeout=60000)
-            page.mouse.move(200, 200)
             for _ in range(4):
-                page.mouse.wheel(0, 1500)
-                time.sleep(1.5)
-            # Buscamos enlaces (aumentamos a 20 para sobrevivir a la eliminación de duplicados)
             elementos = page.query_selector_all("a")
             cont_fr = 0
             for el in elementos:
-                if cont_fr >= 15: break # Extraemos de más a propósito
                 txt = el.inner_text()
                 if "$" in txt:
                     precio = extraer_precio_regex(txt)
-                    if precio > 500000:
                         href = el.get_attribute("href")
-                        full_url = f"https://www.fincaraiz.com.co{href}" if href.startswith("/") else href
-                        resultados.append({
-                            "Portal": "Finca Raiz",
-                            "Precio": precio,
-                            "Precio_M2": precio / area,
-                            "Descripcion": txt.replace('\n', ' ')[:80] + "...",
-                            "URL": full_url
-                        })
-                        cont_fr += 1
             page.close()
-            log_visible += f"✅ FR: {cont_fr} datos brutos extraídos.\n"
         except Exception as e: log_visible += f"⚠️ Error FR: {e}\n"
         # --- METROCUADRADO ---
         try:
             page = context.new_page()
-            log_visible += "🔄 Conectando a MC...\n"
             page.goto(url_mc, wait_until="domcontentloaded", timeout=60000)
-            for _ in range(4):
                 page.mouse.wheel(0, 1000)
                 time.sleep(1.5)
             cards = page.query_selector_all("li, div[class*='card']")
             cont_mc = 0
             for card in cards:
-                if cont_mc >= 15: break # Extraemos de más a propósito
                 txt = card.inner_text()
                 if "$" in txt:
                     precio = extraer_precio_regex(txt)
-                    if precio > 500000:
                         enlace = card.query_selector("a")
                         if enlace:
                             href = enlace.get_attribute("href")
-                            full_url = f"https://www.metrocuadrado.com{href}" if href.startswith("/") else href
-                            resultados.append({
-                                "Portal": "Metrocuadrado",
-                                "Precio": precio,
-                                "Precio_M2": precio / area,
-                                "Descripcion": txt.replace('\n', ' ')[:80] + "...",
-                                "URL": full_url
-                            })
-                            cont_mc += 1
             page.close()
-            log_visible += f"✅ MC: {cont_mc} datos brutos extraídos.\n"
         except Exception as e: log_visible += f"⚠️ Error MC: {e}\n"
         browser.close()
     if not resultados:
-        return f"{log_visible}\n❌ NO SE ENCONTRARON DATOS.", None, None, "---"
-    # --- LIMPIEZA DE DUPLICADOS Y LIMITACIÓN ---
-    # 1. Convertimos a DataFrame
     df_crudo = pd.DataFrame(resultados)
-    # 2. Borramos los que tienen la misma URL (los clones)
     df_limpio = df_crudo.drop_duplicates(subset=['URL'])
-    # 3. Forzamos a que sean máximo 6 únicos por cada portal
-    df_fr_final = df_limpio[df_limpio['Portal'] == 'Finca Raiz'].head(6)
-    df_mc_final = df_limpio[df_limpio['Portal'] == 'Metrocuadrado'].head(6)
-    # 4. Unimos todo de nuevo
-    df_final = pd.concat([df_fr_final, df_mc_final]).reset_index(drop=True)
-    log_visible += f"\n🧹 Limpieza final: Quedaron {len(df_final)} inmuebles únicos y reales."
     # --- PDF ---
     pdf_path = f"Reporte_{int(time.time())}.pdf"
     pdf = FPDF()
     pdf.add_page()
     pdf.set_font("Arial", 'B', 14)
-    pdf.cell(0, 10, f"ESTUDIO {zona.upper()}", ln=True)
     pdf.ln(5)
     for _, r in df_final.iterrows():
         pdf.set_font("Arial", 'B', 10)
-        pdf.cell(0, 8, f"${r['Precio']:,.0f} - {r['Portal']}", ln=True)
-        pdf.set_font("Arial", '', 8)
-        pdf.multi_cell(0, 4, f"{r['Descripcion']}")
         pdf.set_font("Arial", 'U', 8); pdf.set_text_color(0,0,255)
-        pdf.cell(0, 6, "Ver Publicacion", link=r['URL'], ln=True)
         pdf.set_text_color(0,0,0); pdf.ln(3)
     pdf.output(pdf_path)
     # --- CÁLCULOS ---
-    promedio = df_final['Precio_M2'].mean() * area
-    minimo = df_final['Precio'].min()
-    maximo = df_final['Precio'].max()
-    resumen = (
-        f"💰 **ESTIMACIÓN DE RENTA**\n"
-        f"🔹 **Canon Sugerido:** ${promedio:,.0f}\n"
-        f"📉 Mínimo Zona: ${minimo:,.0f}\n"
-        f"📈 Máximo Zona: ${maximo:,.0f}"
-    )
-    return f"{log_visible}\n✅ Proceso Terminado.", df_final, pdf_path, resumen
-# --- INTERFAZ ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("## 🤖 TramitIA Pro: Extracción Balanceada")
     with gr.Row():
         with gr.Column(scale=1):
@@ -219,15 +230,15 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                 ["Menos de 1 año", "1 a 8 años", "9 a 15 años", "16 a 30 años", "Más de 30 años"],
                 label="Antigüedad", value="1 a 8 años"
             )
-            btn = gr.Button("EJECUTAR ESCANEO", variant="primary")
         with gr.Column(scale=2):
-            res_fin = gr.Markdown("### 💰 Resultado...")
             with gr.Tabs():
-                with gr.TabItem("Auditoría"): msg = gr.Textbox(lines=8)
-                with gr.TabItem("Comparables (Únicos)"): out_df = gr.Dataframe()
                 with gr.TabItem("Descargar PDF"): out_pdf = gr.File()
-    btn.click(motor_tramitia_camuflado, [z, c, a, t, h, b, p, e], [msg, out_df, out_pdf, res_fin])
 demo.launch()

 import time
 import random
+# --- INSTALACIÓN DE DEPENDENCIAS ---
 try:
     subprocess.run(["playwright", "install", "chromium"], check=True)
 except: pass
     subprocess.run(["pip", "install", "fake-useragent"], check=True)
     from fake_useragent import UserAgent
+# --- 1. GENERADOR DE URLS (CORREGIDO CON RUTA COMPLETA) ---
 def construir_urls_final(zona, ciudad, tipo, hab, ban, park, antiguedad):
     mapa_ant = {
         "Menos de 1 año": "de-0-a-1-anos",
     slug_ant = mapa_ant.get(antiguedad, "de-1-a-8-anios")
     slug_park = f"{int(park)}-parqueadero" if int(park) == 1 else f"{int(park)}-parqueaderos"
+    # Limpieza estricta de slugs
     z_slug = zona.lower().strip().replace(" ", "-")
     c_slug = ciudad.lower().strip().replace(" ", "-")
+    # URL FINCA RAÍZ: /arriendo/barrio/ciudad/filtros
     url_fr = f"https://www.fincaraiz.com.co/arriendo/{z_slug}/{c_slug}/{int(hab)}-o-mas-habitaciones/{int(ban)}-o-mas-banos/{slug_park}/{slug_ant}"
+    # URL METROCUADRADO
     url_mc = f"https://www.metrocuadrado.com/{tipo.lower()}-casa-oficina/arriendo/{c_slug}/{int(ban)}-banos-{int(hab)}-habitaciones/?search=form"
     return url_fr, url_mc
+# --- 2. EXTRACTOR DE PRECIO (REGEX) ---
 def extraer_precio_regex(texto):
+    # Detecta $ 1.500.000 o $1500000
     patron = r'\$\s?(\d{1,3}(?:[.,]\d{3})*)'
     coincidencias = re.findall(patron, texto)
     if coincidencias:
         return max(precios)
     return 0
+# --- 3. MOTOR DE EXTRACCIÓN ROBUSTO ---
def _texto_latin1(texto):
    """Return ``texto`` as a string limited to Latin-1 characters.

    The PDF report is written with the built-in "Arial" core font, which
    cannot encode characters outside Latin-1. Scraped listing text often
    carries bullets, emoji or typographic quotes, so anything that cannot be
    encoded is replaced with ``?`` instead of aborting the whole report.
    """
    return str(texto).encode("latin-1", "replace").decode("latin-1")


def _escanear_portal(context, url, resultados, *, portal, base_url, selector,
                     enlace_anidado, area, scrolls, mover_mouse=False,
                     limite=15, precio_minimo=600000):
    """Scrape one listing portal and append its price candidates to ``resultados``.

    Args:
        context: Playwright browser context used to open the page.
        url: Search URL to load.
        resultados: List mutated in place, so candidates collected before a
            mid-scan failure are kept by the caller.
        portal: Value stored under the "Portal" key of each record.
        base_url: Prefix applied to hrefs that start with "/".
        selector: CSS selector for the elements to inspect.
        enlace_anidado: When True the href is read from the first ``<a>``
            inside each matched element; otherwise from the element itself.
        area: Positive surface used to derive "Precio_M2".
        scrolls: Number of wheel scrolls performed before reading the DOM.
        mover_mouse: Move the pointer to a random spot before scrolling.
        limite: Maximum number of candidates to collect from this portal.
        precio_minimo: Prices at or below this value are discarded as noise.

    Returns:
        The number of candidates appended.
    """
    page = context.new_page()
    try:
        page.goto(url, wait_until="domcontentloaded", timeout=60000)
        if mover_mouse:
            page.mouse.move(random.randint(100, 500), random.randint(100, 500))
        for _ in range(scrolls):
            page.mouse.wheel(0, 1000)
            time.sleep(1.5)  # give dynamically loaded listings time to render

        encontrados = 0
        for nodo in page.query_selector_all(selector):
            if encontrados >= limite:
                break
            txt = nodo.inner_text()
            if "$" not in txt:
                continue
            precio = extraer_precio_regex(txt)
            if precio <= precio_minimo:
                continue
            origen = nodo.query_selector("a") if enlace_anidado else nodo
            if origen is None:
                continue
            href = origen.get_attribute("href")
            if not href:
                continue
            full_url = f"{base_url}{href}" if href.startswith("/") else href
            resultados.append({
                "Portal": portal,
                "Precio": precio,
                "Precio_M2": precio / area,
                "Descripcion": txt.replace('\n', ' ')[:90] + "...",
                "URL": full_url,
            })
            encontrados += 1
        return encontrados
    finally:
        # Close the page even when navigation or extraction fails.
        page.close()


def motor_tramitia_final(zona, ciudad, area, tipo, hab, ban, park, antiguedad):
    """Scrape rental comparables from both portals and build the rent estimate.

    Args:
        zona, ciudad, tipo, hab, ban, park, antiguedad: Search filters,
            forwarded unchanged to ``construir_urls_final``.
        area: Surface of the property; must convert to a number greater
            than zero because it divides every price.

    Returns:
        A 4-tuple ``(log, dataframe, pdf_path, resumen)``. When the area is
        invalid or nothing is scraped, ``dataframe`` and ``pdf_path`` are
        ``None`` and ``resumen`` is ``"---"``.
    """
    # Validate the area before any network work: a zero or missing area used
    # to raise inside the scraping loop and surface as a misleading
    # "no data found" result.
    try:
        area_m2 = float(area)
    except (TypeError, ValueError):
        area_m2 = 0.0
    if not area_m2 > 0:  # written this way so NaN is rejected too
        return "❌ El área debe ser un número mayor que cero.", None, None, "---"

    resultados = []
    url_fr, url_mc = construir_urls_final(zona, ciudad, tipo, hab, ban, park, antiguedad)
    log_visible = f"✅ URLs GENERADAS (AUDITORÍA):\nFR: {url_fr}\nMC: {url_mc}\n\n"
    ua = UserAgent()

    portales = (
        {
            "sigla": "FR", "nombre": "Finca Raíz", "portal": "Finca Raiz",
            "url": url_fr, "base_url": "https://www.fincaraiz.com.co",
            "selector": "a", "enlace_anidado": False,
            "scrolls": 4, "mover_mouse": True,
        },
        {
            "sigla": "MC", "nombre": "Metrocuadrado", "portal": "Metrocuadrado",
            "url": url_mc, "base_url": "https://www.metrocuadrado.com",
            "selector": "li, div[class*='card']", "enlace_anidado": True,
            "scrolls": 5, "mover_mouse": False,
        },
    )

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            args=[
                '--disable-blink-features=AutomationControlled',
                '--no-sandbox',
                '--disable-infobars',
                '--window-position=0,0',
                f'--user-agent={ua.random}',  # random identity per run
            ]
        )
        try:
            context = browser.new_context(viewport={'width': 1366, 'height': 768})
            for cfg in portales:
                log_visible += f"🔄 Escaneando {cfg['nombre']}...\n"
                try:
                    hallados = _escanear_portal(
                        context, cfg["url"], resultados,
                        portal=cfg["portal"], base_url=cfg["base_url"],
                        selector=cfg["selector"],
                        enlace_anidado=cfg["enlace_anidado"],
                        area=area_m2, scrolls=cfg["scrolls"],
                        mover_mouse=cfg["mover_mouse"],
                    )
                    log_visible += f"✅ {cfg['sigla']}: Encontrados {hallados} candidatos brutos.\n"
                except Exception as e:
                    # Best effort: a failing portal must not stop the other one.
                    log_visible += f"⚠️ Error {cfg['sigla']}: {e}\n"
        finally:
            browser.close()

    if not resultados:
        return f"{log_visible}\n❌ NO SE ENCONTRARON DATOS VÁLIDOS.", None, None, "---"

    # --- Clean-up: drop duplicate URLs, keep at most 6 listings per portal ---
    df_crudo = pd.DataFrame(resultados)
    df_limpio = df_crudo.drop_duplicates(subset=['URL'])
    df_fr = df_limpio[df_limpio['Portal'] == 'Finca Raiz'].head(6)
    df_mc = df_limpio[df_limpio['Portal'] == 'Metrocuadrado'].head(6)
    df_final = pd.concat([df_fr, df_mc]).reset_index(drop=True)
    log_visible += f"\n✨ PROCESADO FINAL: {len(df_final)} inmuebles únicos seleccionados para el reporte."

    # --- PDF report (all free text is reduced to Latin-1 first) ---
    pdf_path = f"Reporte_{int(time.time())}.pdf"
    pdf = FPDF()
    pdf.add_page()
    pdf.set_font("Arial", 'B', 14)
    pdf.cell(0, 10, _texto_latin1(f"ESTUDIO DE MERCADO: {zona.upper()}"), ln=True)
    pdf.ln(5)
    for _, r in df_final.iterrows():
        pdf.set_fill_color(240, 240, 240)
        pdf.set_font("Arial", 'B', 10)
        pdf.cell(0, 8, _texto_latin1(f"{r['Portal']} - ${r['Precio']:,.0f}"), ln=True, fill=True)
        pdf.set_font("Arial", '', 9)
        pdf.multi_cell(0, 5, _texto_latin1(r['Descripcion']))
        pdf.set_font("Arial", 'U', 8)
        pdf.set_text_color(0, 0, 255)
        pdf.cell(0, 6, "Ver Publicacion Original", link=r['URL'], ln=True)
        pdf.set_text_color(0, 0, 0)
        pdf.ln(3)
    pdf.output(pdf_path)

    # --- Rent estimate: mean price per square metre scaled to the given area ---
    if not df_final.empty:
        promedio = df_final['Precio_M2'].mean() * area_m2
        minimo = df_final['Precio'].min()
        maximo = df_final['Precio'].max()
        resumen = (
            f"💰 **ESTIMACIÓN DE RENTA**\n"
            f"🔹 **Canon Sugerido:** ${promedio:,.0f}\n"
            f"📉 Mínimo Zona: ${minimo:,.0f}\n"
            f"📈 Máximo Zona: ${maximo:,.0f}"
        )
    else:
        resumen = "⚠️ No hay suficientes datos para calcular."

    return f"{log_visible}\n✅ Tarea Completada.", df_final, pdf_path, resumen
+# --- INTERFAZ GRÁFICA ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("## 🤖 TramitIA Pro: Analizador Inmobiliario (vFinal)")
     with gr.Row():
         with gr.Column(scale=1):
                 ["Menos de 1 año", "1 a 8 años", "9 a 15 años", "16 a 30 años", "Más de 30 años"],
                 label="Antigüedad", value="1 a 8 años"
             )
+            btn = gr.Button("EJECUTAR ANÁLISIS", variant="primary")
         with gr.Column(scale=2):
+            res_fin = gr.Markdown("### 💰 El resultado aparecerá aquí...")
             with gr.Tabs():
+                with gr.TabItem("Auditoría"): msg = gr.Textbox(lines=10, label="Log del Sistema")
+                with gr.TabItem("Tabla de Resultados"): out_df = gr.Dataframe()
                 with gr.TabItem("Descargar PDF"): out_pdf = gr.File()
+    btn.click(motor_tramitia_final, [z, c, a, t, h, b, p, e], [msg, out_df, out_pdf, res_fin])
 demo.launch()