Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -20,7 +20,7 @@ except ImportError:
|
|
| 20 |
subprocess.run(["pip", "install", "fake-useragent"], check=True)
|
| 21 |
from fake_useragent import UserAgent
|
| 22 |
|
| 23 |
-
# --- 1. GENERADOR DE URLS ---
|
| 24 |
def construir_urls_final(zona, ciudad, tipo, hab, ban, park, antiguedad):
|
| 25 |
mapa_ant = {
|
| 26 |
"Menos de 1 año": "de-0-a-1-anos",
|
|
@@ -31,24 +31,25 @@ def construir_urls_final(zona, ciudad, tipo, hab, ban, park, antiguedad):
|
|
| 31 |
}
|
| 32 |
slug_ant = mapa_ant.get(antiguedad, "de-1-a-8-anios")
|
| 33 |
slug_park = f"{int(park)}-parqueadero" if int(park) == 1 else f"{int(park)}-parqueaderos"
|
| 34 |
-
query_geo = f"{zona.lower().replace(' ', '-')}-{ciudad.lower().replace(' ', '-')}"
|
| 35 |
|
| 36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
|
| 38 |
-
c_slug = ciudad.lower().replace(" ", "-")
|
| 39 |
url_mc = f"https://www.metrocuadrado.com/{tipo.lower()}-casa-oficina/arriendo/{c_slug}/{int(ban)}-banos-{int(hab)}-habitaciones/?search=form"
|
| 40 |
|
| 41 |
return url_fr, url_mc
|
| 42 |
|
| 43 |
# --- 2. EXTRACTOR GENÉRICO (Regex) ---
|
| 44 |
def extraer_precio_regex(texto):
    """Return the largest peso amount found in ``texto``, or 0 if none.

    An amount is a ``$`` sign, an optional single whitespace character, and
    then either thousands-grouped digits (``1.500.000`` / ``1,500,000``) or
    a plain run of digits (``1500000``).

    Args:
        texto: Free text scraped from a listing card or link.

    Returns:
        The highest amount as an ``int`` with separators removed, or ``0``
        when no ``$`` amount is present.
    """
    # The grouped alternative needs at least one separator group; otherwise
    # the plain-digit alternative takes the whole run. A single
    # `\d{1,3}(...)*` pattern would stop after three digits on "$1500000"
    # and yield 150.
    patron = r'\$\s?(\d{1,3}(?:[.,]\d{3})+|\d+)'
    coincidencias = re.findall(patron, texto)
    if not coincidencias:
        return 0
    # Strip thousands separators before converting; keep the largest value
    # when a text contains several amounts.
    return max(int(p.replace('.', '').replace(',', '')) for p in coincidencias)
|
| 53 |
|
| 54 |
# --- 3. MOTOR DE EXTRACCIÓN CAMUFLADO ---
|
|
@@ -60,36 +61,20 @@ def motor_tramitia_camuflado(zona, ciudad, area, tipo, hab, ban, park, antigueda
|
|
| 60 |
ua = UserAgent()
|
| 61 |
|
| 62 |
with sync_playwright() as p:
|
| 63 |
-
# LANZAMIENTO EN MODO FURTIVO
|
| 64 |
browser = p.chromium.launch(
|
| 65 |
headless=True,
|
| 66 |
args=[
|
| 67 |
-
'--disable-blink-features=AutomationControlled',
|
| 68 |
'--no-sandbox',
|
| 69 |
-
'--disable-
|
| 70 |
-
'--disable-infobars',
|
| 71 |
-
'--window-position=0,0',
|
| 72 |
-
'--ignore-certifcate-errors',
|
| 73 |
-
'--ignore-certifcate-errors-spki-list',
|
| 74 |
-
'--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
|
| 75 |
]
|
| 76 |
)
|
| 77 |
-
|
| 78 |
-
# Contexto con Viewport aleatorio para parecer humano
|
| 79 |
context = browser.new_context(
|
| 80 |
user_agent=ua.random,
|
| 81 |
viewport={'width': 1366, 'height': 768},
|
| 82 |
-
locale='es-CO'
|
| 83 |
-
timezone_id='America/Bogota'
|
| 84 |
)
|
| 85 |
-
|
| 86 |
-
# Script para evasión de detección de WebDriver
|
| 87 |
-
init_script = """
|
| 88 |
-
Object.defineProperty(navigator, 'webdriver', {
|
| 89 |
-
get: () => undefined
|
| 90 |
-
});
|
| 91 |
-
"""
|
| 92 |
-
context.add_init_script(init_script)
|
| 93 |
|
| 94 |
# --- FINCA RAÍZ ---
|
| 95 |
try:
|
|
@@ -97,28 +82,21 @@ def motor_tramitia_camuflado(zona, ciudad, area, tipo, hab, ban, park, antigueda
|
|
| 97 |
log_visible += "🔄 Conectando a FR...\n"
|
| 98 |
page.goto(url_fr, wait_until="domcontentloaded", timeout=60000)
|
| 99 |
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
page.mouse.wheel(0, 500)
|
| 103 |
-
time.sleep(random.uniform(2, 4))
|
| 104 |
-
|
| 105 |
-
# Scroll lento
|
| 106 |
-
for _ in range(3):
|
| 107 |
page.mouse.wheel(0, 1500)
|
| 108 |
-
time.sleep(1)
|
| 109 |
|
| 110 |
-
#
|
| 111 |
-
elementos = page.query_selector_all("a")
|
| 112 |
-
|
| 113 |
cont_fr = 0
|
| 114 |
for el in elementos:
|
| 115 |
-
if cont_fr >=
|
| 116 |
txt = el.inner_text()
|
| 117 |
|
| 118 |
-
# Si tiene signo pesos, es un candidato
|
| 119 |
if "$" in txt:
|
| 120 |
precio = extraer_precio_regex(txt)
|
| 121 |
-
if precio > 500000:
|
| 122 |
href = el.get_attribute("href")
|
| 123 |
full_url = f"https://www.fincaraiz.com.co{href}" if href.startswith("/") else href
|
| 124 |
|
|
@@ -131,7 +109,7 @@ def motor_tramitia_camuflado(zona, ciudad, area, tipo, hab, ban, park, antigueda
|
|
| 131 |
})
|
| 132 |
cont_fr += 1
|
| 133 |
page.close()
|
| 134 |
-
log_visible += f"✅ FR: {cont_fr} datos extraídos.\n"
|
| 135 |
except Exception as e: log_visible += f"⚠️ Error FR: {e}\n"
|
| 136 |
|
| 137 |
# --- METROCUADRADO ---
|
|
@@ -140,25 +118,19 @@ def motor_tramitia_camuflado(zona, ciudad, area, tipo, hab, ban, park, antigueda
|
|
| 140 |
log_visible += "🔄 Conectando a MC...\n"
|
| 141 |
page.goto(url_mc, wait_until="domcontentloaded", timeout=60000)
|
| 142 |
|
| 143 |
-
page.mouse.move(random.randint(100, 500), random.randint(100, 500))
|
| 144 |
-
time.sleep(random.uniform(2, 4))
|
| 145 |
-
|
| 146 |
for _ in range(4):
|
| 147 |
page.mouse.wheel(0, 1000)
|
| 148 |
-
time.sleep(1)
|
| 149 |
|
| 150 |
-
# MC suele usar Li o Div cards. Buscamos contenedores genéricos
|
| 151 |
cards = page.query_selector_all("li, div[class*='card']")
|
| 152 |
-
|
| 153 |
cont_mc = 0
|
| 154 |
for card in cards:
|
| 155 |
-
if cont_mc >=
|
| 156 |
txt = card.inner_text()
|
| 157 |
|
| 158 |
if "$" in txt:
|
| 159 |
precio = extraer_precio_regex(txt)
|
| 160 |
if precio > 500000:
|
| 161 |
-
# Buscamos el link dentro de esa tarjeta
|
| 162 |
enlace = card.query_selector("a")
|
| 163 |
if enlace:
|
| 164 |
href = enlace.get_attribute("href")
|
|
@@ -173,24 +145,37 @@ def motor_tramitia_camuflado(zona, ciudad, area, tipo, hab, ban, park, antigueda
|
|
| 173 |
})
|
| 174 |
cont_mc += 1
|
| 175 |
page.close()
|
| 176 |
-
log_visible += f"✅ MC: {cont_mc} datos extraídos.\n"
|
| 177 |
except Exception as e: log_visible += f"⚠️ Error MC: {e}\n"
|
| 178 |
|
| 179 |
browser.close()
|
| 180 |
|
| 181 |
if not resultados:
|
| 182 |
-
return f"{log_visible}\n❌ NO SE ENCONTRARON DATOS.
|
| 183 |
|
| 184 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 185 |
|
| 186 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
pdf_path = f"Reporte_{int(time.time())}.pdf"
|
| 188 |
pdf = FPDF()
|
| 189 |
pdf.add_page()
|
| 190 |
pdf.set_font("Arial", 'B', 14)
|
| 191 |
pdf.cell(0, 10, f"ESTUDIO {zona.upper()}", ln=True)
|
| 192 |
pdf.ln(5)
|
| 193 |
-
for _, r in
|
| 194 |
pdf.set_font("Arial", 'B', 10)
|
| 195 |
pdf.cell(0, 8, f"${r['Precio']:,.0f} - {r['Portal']}", ln=True)
|
| 196 |
pdf.set_font("Arial", '', 8)
|
|
@@ -200,35 +185,35 @@ def motor_tramitia_camuflado(zona, ciudad, area, tipo, hab, ban, park, antigueda
|
|
| 200 |
pdf.set_text_color(0,0,0); pdf.ln(3)
|
| 201 |
pdf.output(pdf_path)
|
| 202 |
|
| 203 |
-
#
|
| 204 |
-
promedio =
|
| 205 |
-
minimo =
|
| 206 |
-
maximo =
|
| 207 |
|
| 208 |
resumen = (
|
| 209 |
f"💰 **ESTIMACIÓN DE RENTA**\n"
|
| 210 |
-
f"🔹 **
|
| 211 |
-
f"📉 Mínimo: ${minimo:,.0f}\n"
|
| 212 |
-
f"📈 Máximo: ${maximo:,.0f}"
|
| 213 |
)
|
| 214 |
|
| 215 |
-
return f"{log_visible}\n✅ Proceso Terminado.",
|
| 216 |
|
| 217 |
# --- INTERFAZ ---
|
| 218 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 219 |
-
gr.Markdown("## 🤖 TramitIA Pro:
|
| 220 |
|
| 221 |
with gr.Row():
|
| 222 |
with gr.Column(scale=1):
|
| 223 |
c = gr.Textbox(label="Ciudad", value="Bogota")
|
| 224 |
-
z = gr.Textbox(label="Zona", value="
|
| 225 |
a = gr.Number(label="Área M2", value=60)
|
| 226 |
t = gr.Dropdown(["Apartamento", "Casa"], label="Tipo", value="Apartamento")
|
| 227 |
|
| 228 |
with gr.Row():
|
| 229 |
-
h = gr.Number(label="
|
| 230 |
b = gr.Number(label="Baños", value=2)
|
| 231 |
-
p = gr.Number(label="
|
| 232 |
|
| 233 |
e = gr.Dropdown(
|
| 234 |
["Menos de 1 año", "1 a 8 años", "9 a 15 años", "16 a 30 años", "Más de 30 años"],
|
|
@@ -239,9 +224,9 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 239 |
with gr.Column(scale=2):
|
| 240 |
res_fin = gr.Markdown("### 💰 Resultado...")
|
| 241 |
with gr.Tabs():
|
| 242 |
-
with gr.TabItem("
|
| 243 |
-
with gr.TabItem("
|
| 244 |
-
with gr.TabItem("PDF"): out_pdf = gr.File()
|
| 245 |
|
| 246 |
btn.click(motor_tramitia_camuflado, [z, c, a, t, h, b, p, e], [msg, out_df, out_pdf, res_fin])
|
| 247 |
|
|
|
|
| 20 |
subprocess.run(["pip", "install", "fake-useragent"], check=True)
|
| 21 |
from fake_useragent import UserAgent
|
| 22 |
|
| 23 |
+
# --- 1. GENERADOR DE URLS (CORREGIDO) ---
|
| 24 |
def construir_urls_final(zona, ciudad, tipo, hab, ban, park, antiguedad):
|
| 25 |
mapa_ant = {
|
| 26 |
"Menos de 1 año": "de-0-a-1-anos",
|
|
|
|
| 31 |
}
|
| 32 |
slug_ant = mapa_ant.get(antiguedad, "de-1-a-8-anios")
|
| 33 |
slug_park = f"{int(park)}-parqueadero" if int(park) == 1 else f"{int(park)}-parqueaderos"
|
|
|
|
| 34 |
|
| 35 |
+
z_slug = zona.lower().strip().replace(" ", "-")
|
| 36 |
+
c_slug = ciudad.lower().strip().replace(" ", "-")
|
| 37 |
+
|
| 38 |
+
# FR CORREGIDO: Se reincorpora la zona y ciudad en el path principal
|
| 39 |
+
# Ejemplo: /arriendo/usaquen/bogota/3-o-mas-habitaciones...
|
| 40 |
+
url_fr = f"https://www.fincaraiz.com.co/arriendo/{z_slug}/{c_slug}/{int(hab)}-o-mas-habitaciones/{int(ban)}-o-mas-banos/{slug_park}/{slug_ant}"
|
| 41 |
|
|
|
|
| 42 |
url_mc = f"https://www.metrocuadrado.com/{tipo.lower()}-casa-oficina/arriendo/{c_slug}/{int(ban)}-banos-{int(hab)}-habitaciones/?search=form"
|
| 43 |
|
| 44 |
return url_fr, url_mc
|
| 45 |
|
| 46 |
# --- 2. EXTRACTOR GENÉRICO (Regex) ---
|
| 47 |
def extraer_precio_regex(texto):
    """Return the largest peso amount found in ``texto``, or 0 if none.

    An amount is a ``$`` sign, an optional single whitespace character, and
    then either thousands-grouped digits (``1.500.000`` / ``1,500,000``) or
    a plain run of digits (``1500000``).

    Args:
        texto: Free text scraped from a listing card or link.

    Returns:
        The highest amount as an ``int`` with separators removed, or ``0``
        when no ``$`` amount is present.
    """
    # The grouped alternative needs at least one separator group; otherwise
    # the plain-digit alternative takes the whole run. A single
    # `\d{1,3}(...)*` pattern would stop after three digits on "$1500000"
    # and yield 150.
    patron = r'\$\s?(\d{1,3}(?:[.,]\d{3})+|\d+)'
    coincidencias = re.findall(patron, texto)
    if not coincidencias:
        return 0
    # Strip thousands separators before converting; keep the largest value
    # when a text contains several amounts.
    return max(int(p.replace('.', '').replace(',', '')) for p in coincidencias)
|
| 54 |
|
| 55 |
# --- 3. MOTOR DE EXTRACCIÓN CAMUFLADO ---
|
|
|
|
| 61 |
ua = UserAgent()
|
| 62 |
|
| 63 |
with sync_playwright() as p:
|
|
|
|
| 64 |
browser = p.chromium.launch(
|
| 65 |
headless=True,
|
| 66 |
args=[
|
| 67 |
+
'--disable-blink-features=AutomationControlled',
|
| 68 |
'--no-sandbox',
|
| 69 |
+
'--disable-infobars'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
]
|
| 71 |
)
|
|
|
|
|
|
|
| 72 |
context = browser.new_context(
|
| 73 |
user_agent=ua.random,
|
| 74 |
viewport={'width': 1366, 'height': 768},
|
| 75 |
+
locale='es-CO'
|
|
|
|
| 76 |
)
|
| 77 |
+
context.add_init_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined});")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
# --- FINCA RAÍZ ---
|
| 80 |
try:
|
|
|
|
| 82 |
log_visible += "🔄 Conectando a FR...\n"
|
| 83 |
page.goto(url_fr, wait_until="domcontentloaded", timeout=60000)
|
| 84 |
|
| 85 |
+
page.mouse.move(200, 200)
|
| 86 |
+
for _ in range(4):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
page.mouse.wheel(0, 1500)
|
| 88 |
+
time.sleep(1.5)
|
| 89 |
|
| 90 |
+
# Buscamos enlaces (extraemos hasta 15 para sobrevivir a la eliminación de duplicados)
|
| 91 |
+
elementos = page.query_selector_all("a")
|
|
|
|
| 92 |
cont_fr = 0
|
| 93 |
for el in elementos:
|
| 94 |
+
if cont_fr >= 15: break # Extraemos de más a propósito
|
| 95 |
txt = el.inner_text()
|
| 96 |
|
|
|
|
| 97 |
if "$" in txt:
|
| 98 |
precio = extraer_precio_regex(txt)
|
| 99 |
+
if precio > 500000:
|
| 100 |
href = el.get_attribute("href")
|
| 101 |
full_url = f"https://www.fincaraiz.com.co{href}" if href.startswith("/") else href
|
| 102 |
|
|
|
|
| 109 |
})
|
| 110 |
cont_fr += 1
|
| 111 |
page.close()
|
| 112 |
+
log_visible += f"✅ FR: {cont_fr} datos brutos extraídos.\n"
|
| 113 |
except Exception as e: log_visible += f"⚠️ Error FR: {e}\n"
|
| 114 |
|
| 115 |
# --- METROCUADRADO ---
|
|
|
|
| 118 |
log_visible += "🔄 Conectando a MC...\n"
|
| 119 |
page.goto(url_mc, wait_until="domcontentloaded", timeout=60000)
|
| 120 |
|
|
|
|
|
|
|
|
|
|
| 121 |
for _ in range(4):
|
| 122 |
page.mouse.wheel(0, 1000)
|
| 123 |
+
time.sleep(1.5)
|
| 124 |
|
|
|
|
| 125 |
cards = page.query_selector_all("li, div[class*='card']")
|
|
|
|
| 126 |
cont_mc = 0
|
| 127 |
for card in cards:
|
| 128 |
+
if cont_mc >= 15: break # Extraemos de más a propósito
|
| 129 |
txt = card.inner_text()
|
| 130 |
|
| 131 |
if "$" in txt:
|
| 132 |
precio = extraer_precio_regex(txt)
|
| 133 |
if precio > 500000:
|
|
|
|
| 134 |
enlace = card.query_selector("a")
|
| 135 |
if enlace:
|
| 136 |
href = enlace.get_attribute("href")
|
|
|
|
| 145 |
})
|
| 146 |
cont_mc += 1
|
| 147 |
page.close()
|
| 148 |
+
log_visible += f"✅ MC: {cont_mc} datos brutos extraídos.\n"
|
| 149 |
except Exception as e: log_visible += f"⚠️ Error MC: {e}\n"
|
| 150 |
|
| 151 |
browser.close()
|
| 152 |
|
| 153 |
if not resultados:
|
| 154 |
+
return f"{log_visible}\n❌ NO SE ENCONTRARON DATOS.", None, None, "---"
|
| 155 |
|
| 156 |
+
# --- LIMPIEZA DE DUPLICADOS Y LIMITACIÓN ---
|
| 157 |
+
# 1. Convertimos a DataFrame
|
| 158 |
+
df_crudo = pd.DataFrame(resultados)
|
| 159 |
+
# 2. Borramos los que tienen la misma URL (los clones)
|
| 160 |
+
df_limpio = df_crudo.drop_duplicates(subset=['URL'])
|
| 161 |
|
| 162 |
+
# 3. Forzamos a que sean máximo 6 únicos por cada portal
|
| 163 |
+
df_fr_final = df_limpio[df_limpio['Portal'] == 'Finca Raiz'].head(6)
|
| 164 |
+
df_mc_final = df_limpio[df_limpio['Portal'] == 'Metrocuadrado'].head(6)
|
| 165 |
+
|
| 166 |
+
# 4. Unimos todo de nuevo
|
| 167 |
+
df_final = pd.concat([df_fr_final, df_mc_final]).reset_index(drop=True)
|
| 168 |
+
|
| 169 |
+
log_visible += f"\n🧹 Limpieza final: Quedaron {len(df_final)} inmuebles únicos y reales."
|
| 170 |
+
|
| 171 |
+
# --- PDF ---
|
| 172 |
pdf_path = f"Reporte_{int(time.time())}.pdf"
|
| 173 |
pdf = FPDF()
|
| 174 |
pdf.add_page()
|
| 175 |
pdf.set_font("Arial", 'B', 14)
|
| 176 |
pdf.cell(0, 10, f"ESTUDIO {zona.upper()}", ln=True)
|
| 177 |
pdf.ln(5)
|
| 178 |
+
for _, r in df_final.iterrows():
|
| 179 |
pdf.set_font("Arial", 'B', 10)
|
| 180 |
pdf.cell(0, 8, f"${r['Precio']:,.0f} - {r['Portal']}", ln=True)
|
| 181 |
pdf.set_font("Arial", '', 8)
|
|
|
|
| 185 |
pdf.set_text_color(0,0,0); pdf.ln(3)
|
| 186 |
pdf.output(pdf_path)
|
| 187 |
|
| 188 |
+
# --- CÁLCULOS ---
|
| 189 |
+
promedio = df_final['Precio_M2'].mean() * area
|
| 190 |
+
minimo = df_final['Precio'].min()
|
| 191 |
+
maximo = df_final['Precio'].max()
|
| 192 |
|
| 193 |
resumen = (
|
| 194 |
f"💰 **ESTIMACIÓN DE RENTA**\n"
|
| 195 |
+
f"🔹 **Canon Sugerido:** ${promedio:,.0f}\n"
|
| 196 |
+
f"📉 Mínimo Zona: ${minimo:,.0f}\n"
|
| 197 |
+
f"📈 Máximo Zona: ${maximo:,.0f}"
|
| 198 |
)
|
| 199 |
|
| 200 |
+
return f"{log_visible}\n✅ Proceso Terminado.", df_final, pdf_path, resumen
|
| 201 |
|
| 202 |
# --- INTERFAZ ---
|
| 203 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 204 |
+
gr.Markdown("## 🤖 TramitIA Pro: Extracción Balanceada")
|
| 205 |
|
| 206 |
with gr.Row():
|
| 207 |
with gr.Column(scale=1):
|
| 208 |
c = gr.Textbox(label="Ciudad", value="Bogota")
|
| 209 |
+
z = gr.Textbox(label="Zona (Ej: Usaquen)", value="Usaquen")
|
| 210 |
a = gr.Number(label="Área M2", value=60)
|
| 211 |
t = gr.Dropdown(["Apartamento", "Casa"], label="Tipo", value="Apartamento")
|
| 212 |
|
| 213 |
with gr.Row():
|
| 214 |
+
h = gr.Number(label="Habitaciones", value=3)
|
| 215 |
b = gr.Number(label="Baños", value=2)
|
| 216 |
+
p = gr.Number(label="Parqueaderos", value=1)
|
| 217 |
|
| 218 |
e = gr.Dropdown(
|
| 219 |
["Menos de 1 año", "1 a 8 años", "9 a 15 años", "16 a 30 años", "Más de 30 años"],
|
|
|
|
| 224 |
with gr.Column(scale=2):
|
| 225 |
res_fin = gr.Markdown("### 💰 Resultado...")
|
| 226 |
with gr.Tabs():
|
| 227 |
+
with gr.TabItem("Auditoría"): msg = gr.Textbox(lines=8)
|
| 228 |
+
with gr.TabItem("Comparables (Únicos)"): out_df = gr.Dataframe()
|
| 229 |
+
with gr.TabItem("Descargar PDF"): out_pdf = gr.File()
|
| 230 |
|
| 231 |
btn.click(motor_tramitia_camuflado, [z, c, a, t, h, b, p, e], [msg, out_df, out_pdf, res_fin])
|
| 232 |
|