ProyectoGao / src /analysis.py
barbaroyoel's picture
Upload 9 files
988a892 verified
import pandas as pd
import plotly.express as px
import json
from collections import Counter
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import unicodedata
# Two-colour palette reused by the chart functions in this module.
PRIMARY_COLOR_1 = "#e4ab0d"  # gold
PRIMARY_COLOR_2 = "#2A4A6B"  # dark blue
def plot_properties_by_municipality(dataframe: pd.DataFrame, top_n: int = 10):
    """Build a horizontal bar chart of the municipalities with the most listings.

    Args:
        dataframe: Listings table; the 'Municipio' column is counted.
        top_n: How many of the most frequent municipalities to keep.

    Returns:
        The plotly express bar figure.
    """
    ranking = (
        dataframe['Municipio']
        .value_counts()
        .head(top_n)
        .rename_axis('Municipio')
        .reset_index(name='Cantidad')
    )
    bar_options = {
        'x': 'Cantidad',
        'y': 'Municipio',
        'orientation': 'h',
        'title': f'Top {top_n} Municipios con más Propiedades',
        'labels': {'Cantidad': 'Número de Propiedades', 'Municipio': 'Municipio'},
        'color': 'Cantidad',
        'color_continuous_scale': [PRIMARY_COLOR_2, PRIMARY_COLOR_1],
    }
    return px.bar(ranking, **bar_options)
def plot_category_distribution(dataframe: pd.DataFrame):
    """Build a donut chart with the share of listings per 'Categoria' value.

    Args:
        dataframe: Listings table; the 'Categoria' column is counted.

    Returns:
        The plotly express pie figure.
    """
    category_counts = (
        dataframe['Categoria']
        .value_counts()
        .rename_axis('Categoria')
        .reset_index(name='Cantidad')
    )
    # Fixed colours for the two categories named in the chart title.
    palette = {'alquiler': PRIMARY_COLOR_1, 'venta': PRIMARY_COLOR_2}
    return px.pie(
        category_counts,
        values='Cantidad',
        names='Categoria',
        title='Proporción Alquiler vs Venta',
        hole=0.3,
        color='Categoria',
        color_discrete_map=palette,
    )
def plot_property_type_distribution(dataframe: pd.DataFrame):
    """Build a donut chart comparing how many listings are houses vs apartments.

    Rows whose 'Tipo' is neither 'casa' nor 'apartamento' are ignored.

    Args:
        dataframe: Listings table with a 'Tipo' column.

    Returns:
        The plotly express pie figure.
    """
    is_house_or_apartment = dataframe['Tipo'].isin(['casa', 'apartamento'])
    type_counts = (
        dataframe.loc[is_house_or_apartment, 'Tipo']
        .value_counts()
        .rename_axis('Tipo')
        .reset_index(name='Cantidad')
    )
    palette = {'casa': PRIMARY_COLOR_1, 'apartamento': PRIMARY_COLOR_2}
    return px.pie(
        type_counts,
        values='Cantidad',
        names='Tipo',
        title='Proporción Casas vs Apartamentos',
        hole=0.3,
        color='Tipo',
        color_discrete_map=palette,
    )
def plot_price_by_municipality(dataframe: pd.DataFrame):
    """Build a grouped bar chart of mean and median price per municipality.

    Municipalities are ordered from highest to lowest mean price.

    Args:
        dataframe: Listings table with 'Municipio' and 'Precio' columns.

    Returns:
        The plotly express bar figure.
    """
    price_stats = (
        dataframe.groupby('Municipio')['Precio']
        .agg(['mean', 'median'])
        .reset_index()
        .sort_values('mean', ascending=False)
    )
    return px.bar(
        price_stats,
        x='Municipio',
        y=['mean', 'median'],
        barmode='group',
        title='Precio Promedio y Mediano por Municipio',
        labels={'value': 'Precio (USD)', 'variable': 'Métrica'},
        color_discrete_sequence=[PRIMARY_COLOR_1, PRIMARY_COLOR_2],
    )
def plot_price_trend_by_property_type(dataframe: pd.DataFrame):
    """Build a line chart of the monthly median price for houses and apartments.

    Args:
        dataframe: Listings table with 'Tipo', 'Fecha' and 'Precio' columns.
            It is not modified.

    Returns:
        The plotly express line figure, one line per property type.
    """
    # Work on an explicit copy: the columns assigned below must not be
    # written onto a slice of the caller's frame (chained assignment).
    filtered_data = dataframe[dataframe['Tipo'].isin(['casa', 'apartamento'])].copy()
    # Unparseable dates become NaT and are dropped by the groupby below.
    filtered_data['Fecha'] = pd.to_datetime(filtered_data['Fecha'], errors='coerce')
    # Collapse every date to the first day of its month.
    filtered_data['Mes'] = filtered_data['Fecha'].dt.to_period('M').dt.to_timestamp()
    price_data = filtered_data.groupby(['Mes', 'Tipo'])['Precio'].median().reset_index()
    fig = px.line(
        price_data,
        x='Mes',
        y='Precio',
        color='Tipo',
        title='Evolución del Precio Mediano por Tipo de Propiedad',
        labels={'Precio': 'Precio Mediano (USD)', 'Mes': 'Fecha'},
        color_discrete_map={
            'casa': PRIMARY_COLOR_1,
            'apartamento': PRIMARY_COLOR_2,
        },
    )
    return fig
def plot_price_trend(dataframe: pd.DataFrame):
    """Build a line chart of the monthly median price of the given listings.

    Args:
        dataframe: Listings table with 'Fecha' and 'Precio' columns. It is
            left untouched: the month key is derived on the side instead of
            being written back as new columns.

    Returns:
        The plotly express line figure, or None when fewer than two months
        of data are available.
    """
    dates = dataframe['Fecha']
    if not pd.api.types.is_datetime64_any_dtype(dates):
        # Unparseable dates become NaT and are dropped by the groupby below.
        dates = pd.to_datetime(dates, errors='coerce')
    # First day of each month; shares the frame's index, so it can be used
    # directly as the grouping key.
    months = dates.dt.to_period('M').dt.to_timestamp().rename('Mes')
    price_data = dataframe.groupby(months)['Precio'].median().reset_index()
    if len(price_data) < 2:
        return None
    fig = px.line(
        price_data,
        x='Mes',
        y='Precio',
        markers=True,
        title='Evolución del Precio Mediano',
        labels={'Precio': 'Precio Mediano (USD)', 'Mes': 'Fecha'},
        color_discrete_sequence=[PRIMARY_COLOR_1],
    )
    fig.update_traces(
        text=price_data['Precio'].apply(lambda x: f"${x:,.0f}"),
        textposition="top center",
        hovertemplate="<b>%{x|%b %Y}</b><br>Precio: $%{y:,.0f} USD",
    )
    fig.update_layout(
        hovermode="x unified",
        xaxis=dict(
            tickformat="%b %Y",
            tickmode='auto',
            # Never request more ticks than there are data points.
            nticks=min(12, len(price_data)),
        ),
    )
    return fig
def plot_amenities_by_property_type(dataframe: pd.DataFrame, top_n: int = 10):
    """Build a grouped bar chart of the most common amenities for houses vs apartments.

    Amenities are ranked by their combined count across both property types,
    and each bar shows the real count for its type.

    Args:
        dataframe: Listings table with 'Tipo' and 'Amenidades' columns; only
            rows whose 'Amenidades' value is a list are counted.
        top_n: Number of amenities to display.

    Returns:
        The plotly express bar figure.
    """
    def count_amenities(property_type):
        """Count amenity occurrences over the listings of one property type."""
        counter = Counter()
        for amenities in dataframe.loc[dataframe['Tipo'] == property_type, 'Amenidades']:
            if isinstance(amenities, list):
                counter.update(amenities)
        return counter

    houses_counter = count_amenities('casa')
    apartments_counter = count_amenities('apartamento')

    # Rank on the full counters. Truncating each type to its own top-N first
    # would report 0 for an amenity that merely falls outside the other
    # type's top-N, and would skew the ranking by total.
    total_counter = houses_counter + apartments_counter
    comparison_data = pd.DataFrame(
        [
            (amenity, houses_counter[amenity], apartments_counter[amenity], total)
            for amenity, total in total_counter.most_common(top_n)
        ],
        columns=['Amenidad', 'Casas', 'Apartamentos', 'Total'],
    )

    fig = px.bar(
        comparison_data,
        x='Amenidad',
        y=['Casas', 'Apartamentos'],
        title=f'Top {top_n} Amenidades por Tipo de Propiedad',
        labels={'value': 'Número de Propiedades', 'Amenidad': 'Amenidad'},
        barmode='group',
        color_discrete_sequence=[PRIMARY_COLOR_1, PRIMARY_COLOR_2],
    )
    fig.update_layout(
        legend_title_text='Tipo de Propiedad',
        xaxis_tickangle=-45,
        height=500,
        margin=dict(l=50, r=50, t=80, b=150),
    )
    fig.update_traces(
        hovertemplate='<b>%{x}</b><br>Tipo: %{meta[0]}<br>Propiedades: %{y}',
        marker_line_color='white',
        marker_line_width=1,
    )
    # meta is per trace: give each trace its own name so %{meta[0]} in the
    # hover text resolves to 'Casas' or 'Apartamentos' respectively.
    fig.for_each_trace(lambda trace: trace.update(meta=[trace.name]))
    return fig
def plot_top_amenities_by_filters(dataframe: pd.DataFrame, top_n: int = 10) -> go.Figure:
    """Build a horizontal bar chart of the most common amenities in the given listings.

    The title adapts to the data: when every counted listing shares a single
    'Categoria' and/or a single 'Tipo', those values are named in it.

    Args:
        dataframe: Listings table with an 'Amenidades' column; rows whose
            value is not a non-empty list are ignored.
        top_n: Number of amenities to display.

    Returns:
        The plotly express bar figure.
    """
    has_amenities = dataframe['Amenidades'].apply(
        lambda x: isinstance(x, list) and len(x) > 0
    )
    filtered_data = dataframe[has_amenities].copy()

    amenities_counter = Counter()
    for amenities in filtered_data['Amenidades']:
        amenities_counter.update(amenities)
    amenities_data = pd.DataFrame(
        amenities_counter.most_common(top_n), columns=['Amenidad', 'Cantidad']
    ).sort_values('Cantidad', ascending=True)

    def single_value(column):
        """Return the column's value when it exists and holds one distinct value."""
        if column in filtered_data.columns and len(filtered_data[column].unique()) == 1:
            return filtered_data[column].iloc[0]
        return None

    title = "Top Amenidades"
    if len(filtered_data) > 0:
        category = single_value('Categoria')
        property_type = single_value('Tipo')
        if category and property_type:
            title = f"Amenidades más Comunes en {property_type.capitalize()}s para {'Venta' if category == 'venta' else 'Alquiler'}"
        elif category:
            title = f"Amenidades más Comunes en Propiedades para {'Venta' if category == 'venta' else 'Alquiler'}"
        elif property_type:
            title = f"Amenidades más Comunes en {property_type.capitalize()}s"

    chart = px.bar(
        amenities_data,
        x='Cantidad',
        y='Amenidad',
        orientation='h',
        title=title,
        labels={'Cantidad': 'Número de Propiedades', 'Amenidad': ''},
        color='Cantidad',
        color_continuous_scale=[PRIMARY_COLOR_2, PRIMARY_COLOR_1],
    )
    chart.update_layout(
        showlegend=False,
        height=500,
        margin=dict(l=150, r=50, t=80, b=50),
        yaxis={'categoryorder': 'total ascending'},
    )
    return chart
def get_top_amenities_description(dataframe: pd.DataFrame) -> str:
    """Return the three most frequent amenities as a comma-separated string.

    Args:
        dataframe: Listings table with an 'Amenidades' column; values that
            are not lists are skipped.

    Returns:
        "No hay datos disponibles" for an empty frame; otherwise up to three
        amenity names joined by ", " (an empty string when nothing was counted).
    """
    if dataframe.empty:
        return "No hay datos disponibles"
    frequency = Counter(
        amenity
        for entry in dataframe['Amenidades']
        if isinstance(entry, list)
        for amenity in entry
    )
    return ", ".join(name for name, _ in frequency.most_common(3))
def get_infrastructure_description(dataframe: pd.DataFrame) -> str:
    """Describe how often the infrastructure amenities appear in the listings.

    Delegates to describe_keywords_presence with a fixed keyword list.
    """
    return describe_keywords_presence(
        dataframe,
        keywords=['cisterna', 'tanque elevado', 'planta eléctrica', 'pozo'],
        category_name="infraestructura",
    )
def get_spaces_description(dataframe: pd.DataFrame) -> str:
    """Describe how often the outdoor-space amenities appear in the listings.

    Delegates to describe_keywords_presence with a fixed keyword list.
    """
    return describe_keywords_presence(
        dataframe,
        keywords=['jardín', 'patio', 'terraza', 'balcón'],
        category_name="espacios",
    )
def describe_keywords_presence(dataframe: pd.DataFrame, keywords: list, category_name: str) -> str:
    """Summarise which keywords appear in at least 10% of the listings.

    Args:
        dataframe: Listings table with an 'Amenidades' column; values that
            are not lists never match a keyword but still count towards
            the total number of listings.
        keywords: Amenity names looked up by exact membership in each list.
        category_name: Label inserted into the "nothing stands out" message.

    Returns:
        "No hay datos disponibles" for an empty frame; a "ninguna
        característica ..." message when no keyword reaches the threshold;
        otherwise "keyword (share%)" items joined by ", ", most frequent first.
    """
    if dataframe.empty:
        return "No hay datos disponibles"

    total_properties = len(dataframe)
    amenity_lists = [entry for entry in dataframe['Amenidades'] if isinstance(entry, list)]
    occurrences = {
        keyword: sum(1 for entry in amenity_lists if keyword in entry)
        for keyword in keywords
    }

    # Keep keywords present in at least 10% of listings, most frequent first
    # (the sort is stable, so ties keep the order of `keywords`).
    ranked = sorted(
        (
            (keyword, hits)
            for keyword, hits in occurrences.items()
            if hits / total_properties >= 0.1
        ),
        key=lambda pair: pair[1],
        reverse=True,
    )
    if not ranked:
        return f"ninguna característica de {category_name} destacada"
    return ", ".join(
        f"{keyword} ({hits/total_properties:.0%})" for keyword, hits in ranked
    )
def plot_amenities_distribution(dataframe: pd.DataFrame, top_n: int = 15) -> go.Figure:
    """Build a side-by-side bar and donut view of the most common amenities.

    Args:
        dataframe: Listings table with an 'Amenidades' column; values that
            are not lists are skipped.
        top_n: Number of amenities to include.

    Returns:
        A two-column subplot figure (bar on the left, pie on the right), or
        None when no amenities were counted.
    """
    frequency = Counter(
        amenity
        for entry in dataframe['Amenidades']
        if isinstance(entry, list)
        for amenity in entry
    )
    ranking = frequency.most_common(top_n)
    if not ranking:
        return None
    amenities_data = pd.DataFrame(ranking, columns=['Amenidad', 'Cantidad'])

    # Left panel: horizontal bars, smallest count first.
    bar_chart = px.bar(
        amenities_data.sort_values('Cantidad', ascending=True),
        x='Cantidad',
        y='Amenidad',
        orientation='h',
        title='Amenidades más Comunes',
        labels={'Cantidad': 'Número de Propiedades', 'Amenidad': ''},
        color_discrete_sequence=[PRIMARY_COLOR_1],
    )
    bar_chart.update_layout(
        showlegend=False,
        height=500,
        margin=dict(l=100, r=50, t=80, b=50),
    )

    # Right panel: donut with the share of each amenity.
    slice_colors = [PRIMARY_COLOR_1, PRIMARY_COLOR_2] + px.colors.sequential.Blues[2:]
    pie_chart = px.pie(
        amenities_data,
        names='Amenidad',
        values='Cantidad',
        title='Distribución de Amenidades',
        hole=0.3,
    )
    pie_chart.update_traces(
        textposition='inside',
        textinfo='percent+label',
        hovertemplate='<b>%{label}</b><br>%{value} propiedades (%{percent})',
        marker=dict(colors=slice_colors),
    )

    # Only the traces are carried over; the combined figure has its own layout.
    combined = make_subplots(
        rows=1,
        cols=2,
        specs=[[{"type": "bar"}, {"type": "pie"}]],
        subplot_titles=('Top Amenidades', 'Distribución Porcentual'),
        horizontal_spacing=0.1,
    )
    for bar_trace in bar_chart.data:
        combined.add_trace(bar_trace, row=1, col=1)
    combined.add_trace(pie_chart.data[0], row=1, col=2)
    combined.update_layout(
        title_text='Análisis de Amenidades',
        height=400,
        showlegend=False,
        margin=dict(t=100),
    )
    return combined
def plot_housing_construction(onei_data: dict) -> go.Figure:
    """Build a bar chart of finished dwellings per year.

    Args:
        onei_data: Mapping whose 'viviendas_terminadas' entry can be loaded
            into a DataFrame with 'año' and 'cantidad' columns (for example
            a list of records parsed from JSON).

    Returns:
        The plotly express bar figure.
    """
    construction_data = pd.DataFrame(onei_data['viviendas_terminadas'])
    fig = px.bar(
        construction_data,
        x='año',
        y='cantidad',
        title='Viviendas Terminadas en La Habana (2020-2024)',
        labels={'cantidad': 'Viviendas Terminadas', 'año': 'Año'},
        text='cantidad',
        color_discrete_sequence=[PRIMARY_COLOR_1],
    )
    fig.update_traces(
        textposition='outside',
        marker_line_color='black',
        marker_line_width=1,
    )
    fig.update_layout(
        # Headroom above the tallest bar so its outside label is not clipped.
        yaxis_range=[0, construction_data['cantidad'].max() + 1000],
        # One tick per year instead of automatically spaced ticks.
        xaxis=dict(tickmode='linear'),
    )
    return fig
def remove_accents(input_str):
    """Return input_str with combining marks (accents, tildes) stripped.

    The text is decomposed with NFKD so each accented letter becomes a base
    character followed by combining marks, and those marks are then dropped.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    return "".join(
        char for char in decomposed if unicodedata.combining(char) == 0
    )
def plot_habana_map(dataframe: pd.DataFrame, geojson_path: str, category: str) -> go.Figure:
    """Build a choropleth of the median listing price per municipality.

    Municipality names from the listings and from the GeoJSON are matched
    after stripping accents, lower-casing and trimming whitespace.

    Args:
        dataframe: Listings table with 'Categoria', 'Municipio' and 'Precio'
            columns. Rows without a municipality are ignored.
        geojson_path: Path to a GeoJSON file whose features carry a
            'municipality' property.
        category: Either "Alquileres" or "Ventas".

    Returns:
        The choropleth figure, or a placeholder figure carrying a message
        when fewer than three usable listings exist or the GeoJSON file
        cannot be read or parsed.

    Raises:
        KeyError: If category is not one of the two supported labels.
    """
    PRIMARY_COLOR = "#1b4a92"
    SECONDARY_COLOR = "#e4ab0d"
    BACKGROUND_COLOR = "#091b3f"
    TEXT_COLOR = "#e4ab0d"
    map_title = f"Precio Mediano de {category} por Municipio"

    def message_figure(message, font_size):
        """Return an empty themed figure showing only a centred message."""
        placeholder = go.Figure()
        placeholder.add_annotation(
            text=message,
            xref="paper", yref="paper",
            x=0.5, y=0.5, showarrow=False,
            font=dict(size=font_size, color=TEXT_COLOR),
        )
        placeholder.update_layout(
            title=map_title,
            paper_bgcolor=BACKGROUND_COLOR,
            plot_bgcolor=BACKGROUND_COLOR,
            font=dict(color=TEXT_COLOR),
        )
        return placeholder

    cat_map = {"Alquileres": "alquiler", "Ventas": "venta"}
    # Rows with no municipality cannot be placed on the map and would make
    # the accent-stripping step fail, so they are discarded up front.
    filtered_data = (
        dataframe[dataframe["Categoria"] == cat_map[category]]
        .dropna(subset=["Municipio"])
        .copy()
    )
    if len(filtered_data) < 3:
        return message_figure("⚠️ No hay suficientes datos para mostrar este mapa", 20)

    filtered_data["Municipio"] = filtered_data["Municipio"].apply(
        lambda x: remove_accents(x).lower().strip()
    )
    median_price = filtered_data.groupby("Municipio", as_index=False)["Precio"].median()

    try:
        with open(geojson_path, encoding="utf-8") as f:
            geojson = json.load(f)
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: invalid JSON or bad
        # encoding (JSONDecodeError and UnicodeDecodeError are subclasses).
        print(f"Error cargando GeoJSON: {e}")
        return message_figure(f"Error cargando GeoJSON: {str(e)}", 15)

    # Add a normalised name to every feature so it matches the listings' key.
    for feature in geojson['features']:
        municipio_name = feature['properties']['municipality']
        feature['properties']['municipality_clean'] = remove_accents(municipio_name).lower().strip()

    fig = px.choropleth(
        median_price,
        geojson=geojson,
        locations="Municipio",
        featureidkey="properties.municipality_clean",
        color="Precio",
        color_continuous_scale=[PRIMARY_COLOR, SECONDARY_COLOR],  # blue-to-gold scale
        range_color=(median_price["Precio"].min(), median_price["Precio"].max()),
        labels={"Precio": "Precio Mediano (USD)"},
        title=map_title,
        hover_data={"Municipio": True, "Precio": ":.0f"},
    )
    fig.update_traces(
        hovertemplate="<b>%{location}</b><br>Precio: $%{z:,.0f} USD<extra></extra>"
    )
    fig.update_geos(
        visible=False,
        center={"lat": 23.1136, "lon": -82.3666},
        projection_scale=9,
        fitbounds="locations",
        bgcolor=BACKGROUND_COLOR,
    )
    fig.update_layout(
        margin={"r": 0, "t": 60, "l": 0, "b": 0},
        height=550,
        coloraxis_colorbar=dict(
            title="USD",
            thickness=15,
            len=0.75,
            tickformat=",",
            tickprefix="$",
            yanchor="middle",
            y=0.5,
        ),
        paper_bgcolor=BACKGROUND_COLOR,
        plot_bgcolor=BACKGROUND_COLOR,
        font=dict(color=TEXT_COLOR),
        title_font=dict(size=20, color=SECONDARY_COLOR),
        coloraxis_colorbar_title_side="right",
        annotations=[
            dict(
                x=0.5,
                y=-0.1,
                showarrow=False,
                text="Fuente: Análisis GAO | Datos 2024-2025",
                xref="paper",
                yref="paper",
                font=dict(size=12, color=TEXT_COLOR),
            )
        ],
    )
    return fig