# Source: app/utils.py (commit 5343ac6)
# app/utils.py
from __future__ import annotations
from typing import List, Any, Tuple
import ast
import json
import pandas as pd
import streamlit as st
# Columns the app's export is expected to contain. Adjust if your export
# adds or removes columns.
EXPECTED_COLS = [
    "uri", "autor", "fecha", "texto_raw", "texto_clean",
    "sentiment", "sent_desc", "p_neg", "p_neu", "p_pos", "sarcasm_score",
    "n_palabras", "has_url", "hashtags", "mentions",
]
# Columns whose cells must be lists (string representations get converted).
LIST_COLUMNS = ["hashtags", "mentions"]
def validate_export_df(df: pd.DataFrame) -> Tuple[bool, List[str]]:
    """
    Check that the DataFrame has at least the minimum expected columns.

    Returns (is_valid, missing_cols_list).
    """
    if df is None:
        return False, EXPECTED_COLS.copy()
    present = set(df.columns)
    missing = [col for col in EXPECTED_COLS if col not in present]
    return not missing, missing
def _to_list_safe(val: Any) -> list:
"""
Convierte de forma segura valores variados a lista:
- Si ya es lista -> la devuelve
- Si es string representando lista -> ast.literal_eval
- Si es string vacío / NaN / None -> []
- Si es otro tipo (ej str normal) -> []
"""
if val is None:
return []
if isinstance(val, list):
return val
if isinstance(val, (tuple, set)):
return list(val)
try:
if pd.isna(val):
return []
except Exception:
pass
if isinstance(val, str):
s = val.strip()
if s == "":
return []
if s.startswith("[") and s.endswith("]"):
try:
parsed = ast.literal_eval(s)
if isinstance(parsed, list):
return parsed
return [parsed]
except Exception:
inner = s[1:-1].strip()
if not inner:
return []
parts = [p.strip().strip("'\"") for p in inner.split(",") if p.strip()]
return parts
if "," in s:
parts = [p.strip() for p in s.split(",") if p.strip()]
return parts
return []
return []
def _ensure_list_columns_in_df(df: pd.DataFrame, cols: List[str]) -> pd.DataFrame:
    """
    Guarantee that every column in `cols` exists in `df` and that all of its
    cells are lists. Note: existing columns are normalized in place on `df`.
    """
    # If no frame was given, start from an empty one with the expected schema.
    if df is None:
        df = pd.DataFrame(columns=EXPECTED_COLS)
    for col in cols:
        if col in df.columns:
            df[col] = df[col].apply(_to_list_safe)
        else:
            df[col] = [[] for _ in range(len(df))]
    return df
def load_csv_to_df(uploaded) -> pd.DataFrame:
    """
    Read a CSV from a Streamlit upload (or a filesystem path) and return a
    normalized DataFrame.

    - Tries a small list of encodings heuristically (utf-8, latin1, iso-8859-1).
      The previous encoding-less first attempt was redundant: pandas defaults
      to utf-8, which the fallback loop already tries.
    - Backfills 'texto_clean' from 'texto_raw' for older exports.
    - Ensures every expected column exists and converts 'hashtags'/'mentions'
      into real lists.

    Raises the last read error if no encoding succeeds.
    """
    df = _read_csv_with_fallback(uploaded)
    # Derive texto_clean from texto_raw when the export predates cleaning.
    if "texto_raw" in df.columns and "texto_clean" not in df.columns:
        df["texto_clean"] = df["texto_raw"]
    if "fecha" in df.columns:
        try:
            df["fecha"] = pd.to_datetime(df["fecha"], errors="coerce")
        except Exception:
            # Keep the raw values if datetime parsing fails outright.
            pass
    # Make sure every expected column exists; list columns start as [] cells.
    for c in EXPECTED_COLS:
        if c not in df.columns:
            df[c] = [[] for _ in range(len(df))] if c in LIST_COLUMNS else None
    df = _ensure_list_columns_in_df(df, LIST_COLUMNS)
    # Expected columns first, any extra columns preserved afterwards.
    cols_final = [c for c in EXPECTED_COLS if c in df.columns]
    cols_final += [c for c in df.columns if c not in EXPECTED_COLS]
    return df[cols_final]


def _read_csv_with_fallback(uploaded) -> pd.DataFrame:
    """Read a CSV from a path or file-like object, trying several encodings."""
    last_err = None
    for enc in ("utf-8", "latin1", "iso-8859-1"):
        try:
            if isinstance(uploaded, str):
                return pd.read_csv(uploaded, encoding=enc)
            uploaded.seek(0)  # file-like upload: rewind before each attempt
            return pd.read_csv(uploaded, encoding=enc)
        except Exception as exc:
            last_err = exc
    raise last_err
def merge_exported_with_new(existing: pd.DataFrame, new: pd.DataFrame, dedup_on: str = "uri") -> pd.DataFrame:
    """
    Concatenate existing + new and drop duplicates on `dedup_on`, keeping the
    newest row (by 'fecha') when a date column is present.

    Also homogenizes the list columns (hashtags/mentions) and serializes them
    to JSON strings to avoid pyarrow/Streamlit conversion errors.
    """
    # Normalize inputs: treat None as an empty frame, never mutate callers.
    frames = [
        existing.copy() if existing is not None else pd.DataFrame(),
        new.copy() if new is not None else pd.DataFrame(),
    ]
    # Make sure both frames carry every expected column.
    for frame in frames:
        for col in EXPECTED_COLS:
            if col not in frame.columns:
                frame[col] = [[] for _ in range(len(frame))] if col in LIST_COLUMNS else None
    frames = [_ensure_list_columns_in_df(f, LIST_COLUMNS) for f in frames]
    combined = pd.concat(frames, ignore_index=True, sort=False)
    # Newest first so drop_duplicates(keep="first") keeps the latest row.
    if "fecha" in combined.columns:
        try:
            combined = combined.sort_values(by="fecha", ascending=False, na_position="last")
        except Exception:
            pass
    subset = [dedup_on] if dedup_on in combined.columns else None
    combined = combined.drop_duplicates(subset=subset, keep="first").reset_index(drop=True)
    combined = _ensure_list_columns_in_df(combined, LIST_COLUMNS)
    # JSON-encode list cells for Streamlit / pyarrow compatibility.
    for col in LIST_COLUMNS:
        if col in combined.columns:
            combined[col] = combined[col].apply(
                lambda cell: json.dumps(cell) if isinstance(cell, (list, tuple, set)) else cell
            )
    # Expected columns first, extras appended in their original order.
    ordered = [c for c in EXPECTED_COLS if c in combined.columns]
    ordered += [c for c in combined.columns if c not in EXPECTED_COLS]
    return combined[ordered]
def preview_df(df: pd.DataFrame, n: int = 5):
    """Render a quick preview of `df` in the Streamlit UI (shape + head)."""
    no_data = df is None or df.empty
    if no_data:
        st.info("DataFrame vacío.")
        return
    st.write(f"Dimensiones: {df.shape}")
    st.dataframe(df.head(n), use_container_width=True)