# Source: app/utils.py (commit 5343ac6)
# app/utils.py
from __future__ import annotations
from typing import List, Any, Tuple
import ast
import json
import pandas as pd
import streamlit as st
# Columns the app's export is expected to contain. Adjust if your export
# adds or removes columns.
EXPECTED_COLS = [
    "uri", "autor", "fecha", "texto_raw", "texto_clean",
    "sentiment", "sent_desc", "p_neg", "p_neu", "p_pos", "sarcasm_score",
    "n_palabras", "has_url", "hashtags", "mentions",
]
# Columns whose cells must be lists (string representations get converted).
LIST_COLUMNS = ["hashtags", "mentions"]
def validate_export_df(df: pd.DataFrame) -> Tuple[bool, List[str]]:
    """
    Check that the DataFrame has at least the minimum expected columns.

    Returns (is_valid, missing_cols_list).
    """
    if df is None:
        return False, EXPECTED_COLS.copy()
    present = set(df.columns)
    missing = [col for col in EXPECTED_COLS if col not in present]
    return not missing, missing
def _to_list_safe(val: Any) -> list:
"""
Convierte de forma segura valores variados a lista:
- Si ya es lista -> la devuelve
- Si es string representando lista -> ast.literal_eval
- Si es string vacío / NaN / None -> []
- Si es otro tipo (ej str normal) -> []
"""
if val is None:
return []
if isinstance(val, list):
return val
if isinstance(val, (tuple, set)):
return list(val)
try:
if pd.isna(val):
return []
except Exception:
pass
if isinstance(val, str):
s = val.strip()
if s == "":
return []
if s.startswith("[") and s.endswith("]"):
try:
parsed = ast.literal_eval(s)
if isinstance(parsed, list):
return parsed
return [parsed]
except Exception:
inner = s[1:-1].strip()
if not inner:
return []
parts = [p.strip().strip("'\"") for p in inner.split(",") if p.strip()]
return parts
if "," in s:
parts = [p.strip() for p in s.split(",") if p.strip()]
return parts
return []
return []
def _ensure_list_columns_in_df(df: pd.DataFrame, cols: List[str]) -> pd.DataFrame:
    """
    Guarantee that every column in `cols` exists in `df` and that all of its
    cells are lists. Note: existing columns are normalized in place on `df`.
    """
    # If no frame was given, start from an empty one with the expected schema.
    if df is None:
        df = pd.DataFrame(columns=EXPECTED_COLS)
    for col in cols:
        if col in df.columns:
            df[col] = df[col].apply(_to_list_safe)
        else:
            df[col] = [[] for _ in range(len(df))]
    return df
def load_csv_to_df(uploaded) -> pd.DataFrame:
    """
    Read a CSV from a Streamlit upload (or a filesystem path) and return a
    normalized DataFrame.

    - Tries a small list of encodings heuristically (utf-8, latin1, iso-8859-1).
      The previous encoding-less first attempt was redundant: pandas defaults
      to utf-8, which the fallback loop already tries.
    - Backfills 'texto_clean' from 'texto_raw' for older exports.
    - Ensures every expected column exists and converts 'hashtags'/'mentions'
      into real lists.

    Raises the last read error if no encoding succeeds.
    """
    df = _read_csv_with_fallback(uploaded)
    # Derive texto_clean from texto_raw when the export predates cleaning.
    if "texto_raw" in df.columns and "texto_clean" not in df.columns:
        df["texto_clean"] = df["texto_raw"]
    if "fecha" in df.columns:
        try:
            df["fecha"] = pd.to_datetime(df["fecha"], errors="coerce")
        except Exception:
            # Keep the raw values if datetime parsing fails outright.
            pass
    # Make sure every expected column exists; list columns start as [] cells.
    for c in EXPECTED_COLS:
        if c not in df.columns:
            df[c] = [[] for _ in range(len(df))] if c in LIST_COLUMNS else None
    df = _ensure_list_columns_in_df(df, LIST_COLUMNS)
    # Expected columns first, any extra columns preserved afterwards.
    cols_final = [c for c in EXPECTED_COLS if c in df.columns]
    cols_final += [c for c in df.columns if c not in EXPECTED_COLS]
    return df[cols_final]


def _read_csv_with_fallback(uploaded) -> pd.DataFrame:
    """Read a CSV from a path or file-like object, trying several encodings."""
    last_err = None
    for enc in ("utf-8", "latin1", "iso-8859-1"):
        try:
            if isinstance(uploaded, str):
                return pd.read_csv(uploaded, encoding=enc)
            uploaded.seek(0)  # file-like upload: rewind before each attempt
            return pd.read_csv(uploaded, encoding=enc)
        except Exception as exc:
            last_err = exc
    raise last_err
def merge_exported_with_new(existing: pd.DataFrame, new: pd.DataFrame, dedup_on: str = "uri") -> pd.DataFrame:
    """
    Concatenate existing + new and drop duplicates on `dedup_on`, keeping the
    newest row (by 'fecha') when a date column is present.

    Also homogenizes the list columns (hashtags/mentions) and serializes them
    to JSON strings to avoid pyarrow/Streamlit conversion errors.
    """
    # Normalize inputs: treat None as an empty frame, never mutate callers.
    frames = [
        existing.copy() if existing is not None else pd.DataFrame(),
        new.copy() if new is not None else pd.DataFrame(),
    ]
    # Make sure both frames carry every expected column.
    for frame in frames:
        for col in EXPECTED_COLS:
            if col not in frame.columns:
                frame[col] = [[] for _ in range(len(frame))] if col in LIST_COLUMNS else None
    frames = [_ensure_list_columns_in_df(f, LIST_COLUMNS) for f in frames]
    combined = pd.concat(frames, ignore_index=True, sort=False)
    # Newest first so drop_duplicates(keep="first") keeps the latest row.
    if "fecha" in combined.columns:
        try:
            combined = combined.sort_values(by="fecha", ascending=False, na_position="last")
        except Exception:
            pass
    subset = [dedup_on] if dedup_on in combined.columns else None
    combined = combined.drop_duplicates(subset=subset, keep="first").reset_index(drop=True)
    combined = _ensure_list_columns_in_df(combined, LIST_COLUMNS)
    # JSON-encode list cells for Streamlit / pyarrow compatibility.
    for col in LIST_COLUMNS:
        if col in combined.columns:
            combined[col] = combined[col].apply(
                lambda cell: json.dumps(cell) if isinstance(cell, (list, tuple, set)) else cell
            )
    # Expected columns first, extras appended in their original order.
    ordered = [c for c in EXPECTED_COLS if c in combined.columns]
    ordered += [c for c in combined.columns if c not in EXPECTED_COLS]
    return combined[ordered]
def preview_df(df: pd.DataFrame, n: int = 5):
    """Render a quick preview of `df` in the Streamlit UI (shape + head)."""
    no_data = df is None or df.empty
    if no_data:
        st.info("DataFrame vacío.")
        return
    st.write(f"Dimensiones: {df.shape}")
    st.dataframe(df.head(n), use_container_width=True)