jccolon committed on
Commit
ffac8cc
·
verified ·
1 Parent(s): 8c270b9

Upload 21 files

Browse files
app/__init__.py ADDED
File without changes
app/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (153 Bytes). View file
 
app/__pycache__/analyzer.cpython-311.pyc ADDED
Binary file (11.9 kB). View file
 
app/__pycache__/charts.cpython-311.pyc ADDED
Binary file (18.8 kB). View file
 
app/__pycache__/client_manager.cpython-311.pyc ADDED
Binary file (2.66 kB). View file
 
app/__pycache__/fetcher.cpython-311.pyc ADDED
Binary file (6.17 kB). View file
 
app/__pycache__/reporting.cpython-311.pyc ADDED
Binary file (3.66 kB). View file
 
app/analyzer.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# app/analyzer.py
from __future__ import annotations

import re
from typing import List, Tuple

import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import streamlit as st


# ======================================================================
# Config
# ======================================================================
# Model with safetensors weights (avoids the torch.load pickle risk of .bin files)
MODEL_ID = "nlptown/bert-base-multilingual-uncased-sentiment"
# Alternative model: "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# Recommended batch sizes (tunable)
BATCH_SIZE_CPU = 32
BATCH_SIZE_GPU = 64

# Mapping from raw model labels to Spanish labels (kept for UI compatibility)
LABEL_MAP_ES = {
    "1 star": "muy negativo",
    "2 stars": "negativo",
    "3 stars": "neutral",
    "4 stars": "positivo",
    "5 stars": "muy positivo",
    # defensive variants in case the model returns a different singular/plural
    "1 stars": "muy negativo",
    "2 star": "negativo",
    "3 star": "neutral",
    "4 star": "positivo",
    "5 star": "muy positivo",
}

# Label groups used by the sarcasm adjustment below
NEG_SET = {"muy negativo", "negativo"}
POS_SET = {"positivo", "muy positivo"}
# ======================================================================
# Model/tokenizer loading (cached) + device selection
# ======================================================================
@st.cache_resource(show_spinner=False)
def load_sentiment_components():
    """Load the tokenizer and model once per process and pick a device.

    Cached with st.cache_resource so Streamlit reruns reuse the same
    objects. Returns a (tokenizer, model, device) triple with the model
    in eval mode and moved to CUDA when available, else CPU.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    return tokenizer, model, device
# ======================================================================
# Basic (fast) text cleanup
# ======================================================================
_url = re.compile(r"https?://\S+")
_mention = re.compile(r"@\w+")
_ws = re.compile(r"\s+")

def clean_text_basic(t: str) -> str:
    """Light cleanup: strip URLs and @mentions, collapse whitespace.

    Hashtags and punctuation are kept on purpose — they carry signal in
    social-media text. Empty/None-ish input yields "".
    """
    if not t:
        return ""
    # URL removal must run before mention removal (a mention inside a URL
    # is swallowed by the URL pattern first).
    for pattern in (_url, _mention):
        t = pattern.sub(" ", t)
    return _ws.sub(" ", t).strip()
# ======================================================================
# Sarcasm/irony heuristic (fast and transparent)
# ======================================================================
SARC_HASHTAGS = {
    "#sarcasmo", "#sarcasm", "#ironia", "#irony", "#sarc", "#irónica", "#irónico"
}
SARC_MARKERS = {
    "/s",  # Reddit/forum convention
    "sí claro", "claro que sí", "yeah right", "ajá",
    "gracias por nada", "qué podría salir mal", "buenísimo...", "genial...", "perfecto...",
}
SARC_EMOJIS = {"🙃", "😒", "🙄"}

def sarcasm_score(t: str) -> int:
    """Return a sarcasm signal strength in the range 0..3 (capped).

    (Fixes the previous docstring, which claimed 0/1/2.)

    Scoring: +2 per matching hashtag, +1 per textual marker, +1 if any
    sarcastic emoji appears, +1 for a quoted positive adjective.
    NOTE: hashtag matching is by substring, so e.g. "#sarcasmo" also
    matches "#sarcasm" and "#sarc" and counts several times — the final
    cap at 3 hides most of that in practice.
    """
    if not t:
        return 0
    tl = t.lower()
    score = 0
    # hashtags (strong signal)
    for tag in SARC_HASHTAGS:
        if tag in tl:
            score += 2
    # textual markers
    for m in SARC_MARKERS:
        if m in tl:
            score += 1
    # emojis (checked on the original text: emojis are case-insensitive anyway)
    if any(e in t for e in SARC_EMOJIS):
        score += 1
    # quotes around a positive adjective — very rough irony cue
    if ('"' in t or "“" in t or "”" in t) and any(p in tl for p in ("genial", "perfecto", "maravilloso")):
        score += 1
    return min(score, 3)
def adjust_with_sarcasm(label_es: str, score: int) -> str:
    """Demote a Spanish sentiment label when sarcasm cues are present.

    Conservative policy: sarcasm flips apparent positivity downward, and
    very strong sarcasm on an already-negative label emphasizes it.
    A score of 0 (or less) leaves the label untouched.
    """
    if score <= 0:
        return label_es
    if label_es in {"positivo", "muy positivo"}:
        # Model says positive but sarcasm was detected: demote.
        return "negativo" if score >= 2 else "neutral"
    if label_es == "neutral":
        # Only strong sarcasm turns neutral into negative.
        return "negativo" if score >= 2 else "neutral"
    if label_es in {"muy negativo", "negativo"} and score >= 3:
        # Already negative + maximal sarcasm: emphasize.
        return "muy negativo"
    return label_es
# ======================================================================
# Pure PyTorch inference (no numpy)
# ======================================================================
def _predict_batch(texts: List[str], max_length: int = 256) -> Tuple[List[str], List[Tuple[float, float, float]], List[int]]:
    """
    Run batched sentiment inference over *texts*.

    Returns:
    - labels_raw: raw model labels ("1 star" ... "5 stars")
    - probs_agg: per-text aggregated (p_neg, p_neu, p_pos) probabilities
    - sarc_scores: sarcasm score per text (kept for traceability)
    """
    tokenizer, model, device = load_sentiment_components()

    # NOTE(review): re-checks CUDA availability instead of reusing `device`;
    # result is the same since `device` was chosen the same way.
    if torch.cuda.is_available():
        bs = BATCH_SIZE_GPU
    else:
        bs = BATCH_SIZE_CPU

    labels_raw: List[str] = []
    probs_agg: List[Tuple[float, float, float]] = []
    sarc_scores: List[int] = []

    with torch.inference_mode():
        for i in range(0, len(texts), bs):
            chunk = texts[i:i+bs]
            enc = tokenizer(
                chunk,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            )
            enc = {k: v.to(device) for k, v in enc.items()}
            out = model(**enc)  # logits shape: [B, 5] for this model
            probs = F.softmax(out.logits, dim=-1)  # [B, 5]

            # aggregate star buckets: neg = 1+2 stars, neu = 3, pos = 4+5
            # index order for nlptown: 0..4 = '1 star'..'5 stars'
            p_neg = probs[:, 0] + probs[:, 1]
            p_neu = probs[:, 2]
            p_pos = probs[:, 3] + probs[:, 4]

            top_idx = torch.argmax(probs, dim=-1).tolist()
            labels_raw.extend([model.config.id2label[int(j)] for j in top_idx])

            probs_agg.extend([(float(a), float(b), float(c)) for a, b, c in zip(p_neg, p_neu, p_pos)])

            # sarcasm score for each text of the batch
            sarc_scores.extend([sarcasm_score(t) for t in chunk])

    return labels_raw, probs_agg, sarc_scores
# ======================================================================
# Main API for the app
# ======================================================================
@st.cache_data(show_spinner=False)
def clean_and_analyze(
    df: pd.DataFrame,
    min_chars: int = 20,
    dedup_cols: List[str] | None = None,
    use_clean_text: bool = True,
) -> pd.DataFrame:
    """
    Clean, deduplicate and enrich the DataFrame, adding sentiment columns.

    - min_chars=0 and dedup_cols=['uri'] → "maximum volume" mode.
    - use_clean_text=True → run the model on cleaned text (more stable output).

    Cached with st.cache_data, so identical inputs skip re-inference.
    Expects a 'texto' column; returns the input unchanged when empty/None.
    """
    if df is None or df.empty:
        return df

    d = df.copy()

    # --- Deduplication ---
    if dedup_cols:
        d = d.drop_duplicates(subset=dedup_cols)

    # --- Basic text handling + length filter ---
    d["texto"] = d["texto"].fillna("")
    if min_chars and min_chars > 0:
        d = d[d["texto"].str.len() >= min_chars]
    if d.empty:
        return d

    # Keep the original text and build a cleaned version for the model
    d["texto_raw"] = d["texto"]
    d["texto_clean"] = d["texto_raw"].map(clean_text_basic) if use_clean_text else d["texto_raw"]

    # --- Quick enrichment (word count, URL flag, hashtags, mentions) ---
    d["n_palabras"] = d["texto_raw"].str.split().str.len()
    d["has_url"] = d["texto_raw"].str.contains(r"https?://", na=False)
    d["hashtags"] = d["texto_raw"].str.findall(r"#\w+")
    d["mentions"] = d["texto_raw"].str.findall(r"@\w+")

    # --- Inference ---
    texts_for_model = d["texto_clean"].astype(str).tolist()
    labels_raw, probs_agg, sarc_scores = _predict_batch(texts_for_model, max_length=256)
    d["sentiment"] = labels_raw
    d["p_neg"], d["p_neu"], d["p_pos"] = zip(*probs_agg)
    d["sarcasm_score"] = sarc_scores

    # --- Spanish labels + sarcasm adjustment ---
    d["sent_desc"] = d["sentiment"].map(LABEL_MAP_ES).fillna("neutral")
    d["sent_desc_adj"] = [adjust_with_sarcasm(lbl, sc) for lbl, sc in zip(d["sent_desc"], d["sarcasm_score"])]

    # For compatibility with the rest of the app, expose 'sent_desc' as the final label:
    d["sent_desc"] = d["sent_desc_adj"]
    d = d.drop(columns=["sent_desc_adj"], errors="ignore")

    # Suggested column order
    cols_order = [
        "uri", "autor", "fecha", "texto_raw", "texto_clean",
        "sentiment", "sent_desc", "p_neg", "p_neu", "p_pos", "sarcasm_score",
        "n_palabras", "has_url", "hashtags", "mentions",
    ]
    # Also keep any original columns not listed above
    cols_final = [c for c in cols_order if c in d.columns] + [c for c in d.columns if c not in cols_order]
    d = d[cols_final]

    return d
app/charts.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import altair as alt
3
+ from io import BytesIO
4
+ from typing import Optional
5
+ import re
6
+ from collections import Counter
7
+ from io import BytesIO
8
+ from wordcloud import WordCloud
9
# =========================
# Helpers
# =========================
def preprocess_dates(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize the 'fecha' column.

    - parses to UTC datetimes and strips the timezone (naive UTC)
    - drops rows whose date could not be parsed
    - adds 'date_day' (midnight-normalized datetime64[ns])
    - adds 'date_iso' (YYYY-MM-DD string)
    """
    out = df.copy()
    parsed = pd.to_datetime(out["fecha"], utc=True, errors="coerce")
    out["fecha"] = parsed.dt.tz_convert("UTC").dt.tz_localize(None)
    out = out.dropna(subset=["fecha"])
    out["date_day"] = out["fecha"].dt.normalize()
    out["date_iso"] = out["date_day"].dt.strftime("%Y-%m-%d")
    return out
def export_chart_png(chart: alt.Chart, scale: int = 2) -> Optional[BytesIO]:
    """Render an Altair chart to an in-memory PNG.

    Returns None on any failure (e.g. vl-convert-python not installed).
    """
    out = BytesIO()
    try:
        chart.save(out, format="png", scale=scale)
        out.seek(0)
    except Exception:
        return None
    return out
# =========================
# Charts
# =========================
def chart_posts_diario_ma(dfin: pd.DataFrame, window: int = 7) -> alt.Chart:
    """Daily post counts with a rolling-mean overlay (dashed line)."""
    daily = dfin[["date_day"]].groupby("date_day").size().reset_index(name="posts")
    daily["MA"] = daily["posts"].rolling(window, min_periods=1).mean()

    line_posts = (
        alt.Chart(daily)
        .mark_line(point=False)
        .encode(
            x=alt.X("date_day:T", title="Fecha"),
            y=alt.Y("posts:Q", title="Posts"),
            tooltip=[alt.Tooltip("date_day:T", title="Fecha"), "posts:Q"],
        )
    )
    line_ma = (
        alt.Chart(daily)
        .mark_line(strokeDash=[4, 3])
        .encode(
            x="date_day:T",
            y=alt.Y("MA:Q", title="Media móvil"),
            tooltip=[alt.Tooltip("date_day:T", title="Fecha"), "MA:Q"],
        )
    )
    return (line_posts + line_ma).properties(height=260)
def chart_sentimiento_barras(dfin: pd.DataFrame) -> alt.Chart:
    """
    Bar chart of post counts per sentiment category.

    Fix: the preferred ordering used capitalized labels ("Muy negativo")
    while the analyzer emits lowercase ones ("muy negativo"), so the
    semantic ordering never actually applied. Label matching is now
    case-insensitive; the original label spelling is kept for display.
    """
    if "sent_desc" not in dfin.columns:
        # Nothing to plot
        return alt.Chart(pd.DataFrame({"sentimiento": [], "posts": []})).mark_bar()

    # Preferred semantic order (both 'neutral' variants included)
    order_pref = ["Muy negativo", "Negativo", "Neutral o mixto", "Neutral", "Positivo", "Muy positivo"]

    # Count per label, stable order
    vc = (
        dfin["sent_desc"]
        .fillna("Desconocido")
        .value_counts()
        .rename_axis("sentimiento")
        .reset_index(name="posts")
    )

    # Case-insensitive match so lowercase labels from the analyzer still
    # pick up the preferred order; values keep their original spelling.
    by_lower = {str(s).lower(): s for s in vc["sentimiento"].values}
    present = [by_lower[c.lower()] for c in order_pref if c.lower() in by_lower]
    if not present:
        # No known label at all: show whatever exists, in count order
        present = list(vc["sentimiento"].values)

    vc["sentimiento"] = pd.Categorical(vc["sentimiento"], categories=present, ordered=True)
    vc = vc.sort_values("sentimiento")

    return (
        alt.Chart(vc)
        .mark_bar()
        .encode(
            x=alt.X("sentimiento:N", sort=present, title="Sentimiento"),
            y=alt.Y("posts:Q", title="Posts"),
            tooltip=["sentimiento:N", "posts:Q"],
        )
        .properties(height=260)
    )
def chart_sentimiento_apilado(dfin: pd.DataFrame) -> alt.Chart:
    """Stacked area of daily post counts, split by sentiment label."""
    counts = (
        dfin[["date_day", "sent_desc"]]
        .groupby(["date_day", "sent_desc"])
        .size()
        .reset_index(name="posts")
    )
    return (
        alt.Chart(counts)
        .mark_area()
        .encode(
            x=alt.X("date_day:T", title="Fecha"),
            y=alt.Y("posts:Q", stack="zero", title="Posts"),
            color=alt.Color("sent_desc:N", title="Sentimiento"),
            tooltip=[alt.Tooltip("date_day:T", title="Fecha"), "sent_desc:N", "posts:Q"],
        )
        .properties(height=260)
    )
def chart_heatmap_dia_hora(dfin: pd.DataFrame) -> alt.Chart:
    """Weekday × hour heatmap of posting activity."""
    weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    frame = pd.DataFrame(
        {"dow": dfin["fecha"].dt.day_name(), "hour": dfin["fecha"].dt.hour}
    )
    frame["dow"] = pd.Categorical(frame["dow"], categories=weekday_order, ordered=True)
    counts = frame.groupby(["dow", "hour"]).size().reset_index(name="posts")

    return (
        alt.Chart(counts)
        .mark_rect()
        .encode(
            x=alt.X("hour:O", title="Hora"),
            y=alt.Y("dow:N", title="Día de semana", sort=weekday_order),
            color=alt.Color("posts:Q", title="Posts"),
            tooltip=["dow:N", "hour:O", "posts:Q"],
        )
        .properties(height=260)
    )
def chart_top_hashtags(dfin: pd.DataFrame, topn: int = 20):
    """Horizontal bars of the most used hashtags; None when unavailable."""
    if "hashtags" not in dfin.columns:
        return None
    exploded = dfin[["hashtags"]].explode("hashtags").dropna(subset=["hashtags"])
    if exploded.empty:
        return None
    counts = (
        exploded["hashtags"]
        .value_counts()
        .head(topn)
        .rename_axis("hashtag")
        .reset_index(name="conteo")
    )
    return (
        alt.Chart(counts)
        .mark_bar()
        .encode(
            x="conteo:Q",
            y=alt.Y("hashtag:N", sort="-x"),
            tooltip=["hashtag:N", "conteo:Q"],
        )
        .properties(height=max(260, 24 * len(counts) + 20))
    )
def chart_sent_por_hashtag(dfin: pd.DataFrame, topn: int = 15):
    """Sentiment breakdown for the top-N hashtags; None when unavailable."""
    if "hashtags" not in dfin.columns:
        return None
    exploded = dfin[["hashtags", "sent_desc"]].explode("hashtags").dropna(subset=["hashtags"])
    if exploded.empty:
        return None
    top_tags = exploded["hashtags"].value_counts().head(topn).index
    grouped = (
        exploded[exploded["hashtags"].isin(top_tags)]
        .groupby(["hashtags", "sent_desc"])
        .size()
        .reset_index(name="posts")
    )
    return (
        alt.Chart(grouped)
        .mark_bar()
        .encode(
            y=alt.Y("hashtags:N", title="Hashtag", sort="-x"),
            x=alt.X("posts:Q", title="Posts"),
            color=alt.Color("sent_desc:N", title="Sentimiento"),
            tooltip=["hashtags:N", "sent_desc:N", "posts:Q"],
        )
        .properties(height=28 * len(top_tags) + 20)
    )
def chart_top_menciones(dfin: pd.DataFrame, topn: int = 20):
    """Horizontal bars of the most mentioned handles; None when unavailable."""
    if "mentions" not in dfin.columns:
        return None
    exploded = dfin[["mentions"]].explode("mentions").dropna(subset=["mentions"])
    if exploded.empty:
        return None
    counts = (
        exploded["mentions"]
        .value_counts()
        .head(topn)
        .rename_axis("mención")
        .reset_index(name="conteo")
    )
    return (
        alt.Chart(counts)
        .mark_bar()
        .encode(
            x="conteo:Q",
            y=alt.Y("mención:N", sort="-x"),
            tooltip=["mención:N", "conteo:Q"],
        )
        .properties(height=28 * len(counts) + 20)
    )
def chart_hist_longitud(dfin: pd.DataFrame):
    """Histogram of post length in words; None when 'n_palabras' is missing."""
    if "n_palabras" not in dfin.columns:
        return None
    data = dfin[["n_palabras"]].copy()
    return (
        alt.Chart(data)
        .mark_bar()
        .encode(
            x=alt.X("n_palabras:Q", bin=alt.Bin(maxbins=30), title="Número de palabras"),
            y=alt.Y("count():Q", title="Posts"),
            tooltip=[alt.Tooltip("count():Q", title="Posts")],
        )
        .properties(height=260)
    )
def chart_top_dominios(dfin: pd.DataFrame, topn: int = 20):
    """Horizontal bars of the most frequent URL domains found in 'texto'.

    Fix: unlike its sibling charts, this one crashed with a KeyError when
    the 'texto' column was missing — it now returns None in that case,
    consistent with the other optional charts in this module.
    """
    if "texto" not in dfin.columns:
        return None
    s = dfin["texto"].str.extractall(r"https?://([^/\s]+)")[0]
    if s.empty:
        return None
    vc = s.value_counts().head(topn).rename_axis("dominio").reset_index(name="conteo")
    return (
        alt.Chart(vc)
        .mark_bar()
        .encode(
            x="conteo:Q",
            y=alt.Y("dominio:N", sort="-x"),
            tooltip=["dominio:N", "conteo:Q"],
        )
        .properties(height=28 * len(vc) + 20)
    )
def chart_nube_palabras(dfin: pd.DataFrame, max_words: int = 150) -> BytesIO | None:
    """
    Build a word cloud (PNG in memory) from dfin['texto'].

    - Strips URLs; drops '#'/'@' sigils but keeps the word itself.
    - Filters tokens shorter than 3 chars and basic ES/EN stopwords.
    - Returns a BytesIO holding the PNG, or None when no usable text remains.
    """
    if "texto" not in dfin.columns or dfin.empty:
        return None

    # --- gather raw text ---
    raw_texts = dfin["texto"].dropna().astype(str).tolist()
    if not raw_texts:
        return None

    corpus = " ".join(raw_texts)

    # remove URLs; keep hashtag/mention words, just drop the sigil
    corpus = re.sub(r"https?://\S+", " ", corpus)
    corpus = re.sub(r"[@#]", " ", corpus)

    # basic tokenization (letters incl. Spanish accents, digits)
    tokens = re.findall(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ0-9]+", corpus)

    # simple ES/EN stopword lists (extend as needed)
    stop_es = {
        "de","la","que","el","en","y","a","los","del","se","las","por","un","para","con","no",
        "una","su","al","lo","como","más","pero","sus","le","ya","o","fue","este","ha","sí",
        "porque","esta","son","entre","cuando","muy","sin","sobre","también","me","hasta",
        "hay","donde","quien","desde","todo","nos","durante","todos","uno","les","ni","contra",
        "otros","ese","eso","ante","ellos","e","esto","mí","antes","algunos","qué","unos",
        "yo","otro","otras","otra","él","tanto","esa","estos","mucho","quienes","nada","muchos",
        "cual","poco","ella","estar","estas","algunas","algo","nosotros","mi","mis","tú","te",
        "ti","tu","tus","ellas","nosotras","vosotros","vosotras","os","mío","mía","míos","mías",
        "tuyo","tuya","tuyos","tuyas","suyo","suya","suyos","suyas","nuestro","nuestra","nuestros",
        "nuestras","vuestro","vuestra","vuestros","vuestras","esos","esas","estoy","estás","está",
        "estamos","estáis","están","esté","estés","estemos","estéis","estén","estaré","estarás",
        "estará","estaremos","estaréis","estarán"
    }
    stop_en = {
        "the","a","an","and","or","but","to","of","for","in","on","at","by","with","from","as",
        "is","are","was","were","be","been","being","it","its","this","that","these","those",
        "i","you","he","she","we","they","me","him","her","us","them","my","your","his","their",
        "our","mine","yours","hers","theirs","ours","not","no","so","if","than","then","too",
        "very","can","could","should","would","will","just","also","into","over","under",
    }
    stops = {w.lower() for w in (stop_es | stop_en)}

    # keep only meaningful tokens
    kept = [tok for tok in tokens if len(tok) >= 3 and tok.lower() not in stops]
    if not kept:
        return None

    # frequencies
    freqs = Counter(tok.lower() for tok in kept)

    # build the cloud
    cloud = WordCloud(
        width=1400,
        height=800,
        background_color="white",
        prefer_horizontal=0.9,
        collocations=False,  # no bi/trigram grouping — better token control
        max_words=max_words,
    ).generate_from_frequencies(freqs)

    # serialize to PNG in memory
    png = BytesIO()
    cloud.to_image().save(png, format="PNG")
    png.seek(0)
    return png
# Registry of chart builders shown in the UI: display label -> callable(dfin).
# Most entries return an alt.Chart (or None when a required column is missing);
# the word cloud returns a PNG BytesIO instead.
CHARTS = {
    "📈 Posts por día + media móvil": chart_posts_diario_ma,
    "📊 Recuento por sentimiento": chart_sentimiento_barras,
    "🧭 Sentimiento (área apilada)": chart_sentimiento_apilado,
    "☁️ Nube de palabras": chart_nube_palabras,  # returns PNG bytes, not a Chart
    "🗓️ Calor Día×Hora": chart_heatmap_dia_hora,
    "🏷️ Top hashtags": chart_top_hashtags,
    "🏷️ Hashtag × Sentimiento": chart_sent_por_hashtag,
    "👤 Top menciones": chart_top_menciones,
    "📏 Longitud del texto": chart_hist_longitud,
    "🔗 Top dominios (URLs)": chart_top_dominios,
}
app/client_manager.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from atproto import Client
3
# ========= Session management (per-user login) =========

def _get_client():
    """Return the Bluesky client stored in the Streamlit session, or None."""
    return st.session_state.get("bsky_client")
def _set_client(c, handle):
    """Store the client and its handle in the Streamlit session."""
    st.session_state["bsky_client"] = c
    st.session_state["bsky_handle"] = handle
def _is_logged_in():
    """Return True when a Bluesky client is present in the session."""
    return st.session_state.get("bsky_client") is not None
def _logout():
    """Log out by removing the client and handle from the session."""
    for key in ("bsky_client", "bsky_handle"):
        st.session_state.pop(key, None)
def login_bsky(handle: str, app_password: str):
    """
    Log in to Bluesky with the user's handle and App Password.

    Uses the default service endpoint (https://bsky.social). Raises
    whatever the atproto client raises on bad credentials.
    """
    client = Client()  # default service: https://bsky.social
    client.login(handle, app_password)
    return client
# ---- Public aliases for compatibility with the rest of the code ----
def get_client():
    """Public alias of _get_client()."""
    return _get_client()
def set_client(c, handle):
    """Public alias of _set_client()."""
    return _set_client(c, handle)
def is_logged_in():
    """Public alias of _is_logged_in()."""
    return _is_logged_in()
def logout():
    """Public alias of _logout()."""
    return _logout()
def login(handle: str, app_password: str):
    """Public alias of login_bsky()."""
    return login_bsky(handle, app_password)
app/fetcher.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from datetime import datetime, timedelta, timezone
3
+ from typing import Iterable, Optional, List
4
+
5
+ import pandas as pd
6
+ from atproto import models
7
+
8
+ from app.client_manager import get_client
9
+
10
+
11
+ def _iso_to_dt_utc_naive(iso: str) -> Optional[datetime]:
12
+ """Convierte ISO (posible 'Z') a datetime naive en UTC."""
13
+ if not iso:
14
+ return None
15
+ try:
16
+ iso = iso.replace("Z", "+00:00")
17
+ aware = datetime.fromisoformat(iso)
18
+ return aware.astimezone(timezone.utc).replace(tzinfo=None)
19
+ except Exception:
20
+ return None
def _search_one_term(
    term: str,
    days_back: int,
    max_posts: Optional[int],
) -> pd.DataFrame:
    """
    Search posts for a single term via the official API (app.bsky.feed.search_posts).

    Returns a DataFrame with columns: texto, autor, fecha (naive UTC datetime), uri.
    Honors the age cutoff (days_back) and the max_posts cap; paginates with
    the API cursor until one of the stop conditions hits.

    Raises:
        RuntimeError: when no Bluesky session is active.
    """
    client = get_client()
    if client is None:
        raise RuntimeError("No hay sesión de Bluesky.")

    cutoff_aware = datetime.now(timezone.utc) - timedelta(days=days_back)

    rows: List[dict] = []
    cursor = None

    while True:
        remaining = None if max_posts is None else max(max_posts - len(rows), 0)
        if remaining == 0:
            break
        # API page size caps at 100
        limit = 100 if remaining is None else max(1, min(100, remaining))

        params = models.AppBskyFeedSearchPosts.Params(q=term, limit=limit, cursor=cursor)
        resp = client.app.bsky.feed.search_posts(params=params)

        posts = resp.posts or []
        if not posts:
            break

        # Stop paginating this term once a post older than the cutoff shows up
        stop_for_age = False

        for p in posts:
            created_raw = getattr(p.record, "created_at", "") or ""
            # compare against the cutoff using an AWARE datetime
            try:
                aware = datetime.fromisoformat(created_raw.replace("Z", "+00:00"))
                if aware < cutoff_aware:
                    stop_for_age = True
                    break
            except Exception:
                # unparseable timestamp: keep the post, skip the age check
                pass

            created_dt = _iso_to_dt_utc_naive(created_raw)

            rows.append(
                {
                    "texto": getattr(p.record, "text", "") or "",
                    "autor": getattr(p.author, "handle", "") or "",
                    "fecha": created_dt,
                    "uri": getattr(p, "uri", "") or "",
                }
            )

            if max_posts is not None and len(rows) >= max_posts:
                stop_for_age = True
                break

        if stop_for_age:
            break

        cursor = resp.cursor
        if not cursor:
            break

    return pd.DataFrame(rows, columns=["texto", "autor", "fecha", "uri"])
def fetch_posts(topic: str, days_back: int, user_handle: str, max_posts: Optional[int] = None) -> pd.DataFrame:
    """
    AND-style search (the API's 'q' matching already handles the term).

    user_handle is unused here; it is kept for signature compatibility
    with fetch_posts_or and the calling UI.
    """
    return _search_one_term(topic, days_back, max_posts)
def fetch_posts_or(terms: Iterable[str], days_back: int, user_handle: str, max_posts: Optional[int] = None) -> pd.DataFrame:
    """
    OR search: splits the quota across terms, merges results and drops duplicates.

    Fix: removed the unused exception binding (`except Exception as e`).

    Args:
        terms: search terms; blank entries are ignored.
        days_back: age cutoff in days, applied per term.
        user_handle: unused; kept for signature compatibility with callers.
        max_posts: overall cap across all terms (None = unlimited).

    Returns:
        DataFrame with columns texto/autor/fecha/uri. Per-term result counts
        are exposed in df.attrs["aportes"] so the UI can display them.
    """
    terms = [t.strip() for t in terms if t and t.strip()]
    if not terms:
        return pd.DataFrame(columns=["texto", "autor", "fecha", "uri"])

    remaining = max_posts
    frames: List[pd.DataFrame] = []
    aportes = {}

    for i, term in enumerate(terms):
        limit_i = None
        if remaining is not None:
            # spread what is left over the terms still to run (ceiling division)
            limit_i = math.ceil(remaining / (len(terms) - i))

        try:
            df_i = _search_one_term(term, days_back, limit_i)
        except Exception:
            # best-effort: a failing term contributes an empty frame; the app
            # surfaces the shortfall through the "aportes" counts below
            df_i = pd.DataFrame(columns=["texto", "autor", "fecha", "uri"])

        aportes[term] = len(df_i)
        frames.append(df_i)

        if remaining is not None:
            remaining = max(0, remaining - len(df_i))

    df = pd.concat(frames, ignore_index=True)
    df = df.drop_duplicates(subset=["uri", "texto", "autor"])

    if max_posts is not None:
        df = df.head(max_posts)

    # per-term contribution counts, stored as a DataFrame attribute for the UI
    df.attrs["aportes"] = aportes
    return df
app/reporting.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/reporting.py
2
+ from __future__ import annotations
3
+ from io import BytesIO
4
+ from typing import List, Tuple
5
+ import altair as alt
6
+ import pandas as pd
7
+ from reportlab.lib.pagesizes import A4
8
+ from reportlab.pdfgen import canvas
9
+ from reportlab.lib.utils import ImageReader
10
def altair_to_png_bytes(chart: alt.Chart, scale: int = 2) -> BytesIO:
    """Render an Altair chart into an in-memory PNG (needs vl-convert-python)."""
    out = BytesIO()
    chart.save(out, format="png", scale=scale)
    out.seek(0)
    return out
def build_pdf_with_images(df: pd.DataFrame, images: List[Tuple[str, BytesIO]]) -> bytes:
    """Assemble an A4 PDF report: summary header followed by titled chart images.

    Args:
        df: analyzed posts; only len(df) is used (for the summary line).
        images: (title, png_bytes) pairs, drawn in order and scaled down to
            fit the page width (never upscaled).

    Returns:
        The finished PDF as raw bytes.
    """
    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=A4)
    page_w, page_h = A4
    margin = 36
    y = page_h - margin  # cursor moves downward as content is drawn

    # Report header
    c.setFont("Helvetica-Bold", 18)
    c.drawString(margin, y, "Informe de Análisis - Bluesky Explorer")
    y -= 22
    c.setFont("Helvetica", 11)
    c.drawString(margin, y, f"Total de publicaciones analizadas: {len(df)}")
    y -= 8
    c.drawString(margin, y, f"Gráficos incluidos: {len(images)}")
    y -= 20

    max_w = page_w - 2 * margin

    for idx, (title, png_bytes) in enumerate(images, start=1):
        # Start a new page when too little room is left for title + image
        if y < 140:
            c.showPage()
            y = page_h - margin

        c.setFont("Helvetica-Bold", 12)
        c.drawString(margin, y, f"{idx}. {title}")
        y -= 12

        img_reader = ImageReader(png_bytes)
        iw, ih = img_reader.getSize()
        scale = min(max_w / iw, 1)  # shrink to page width, never enlarge
        w = iw * scale
        h = ih * scale

        # If the image would overflow, repeat the title on a fresh page
        if y - h < margin:
            c.showPage()
            y = page_h - margin - 12
            c.setFont("Helvetica-Bold", 12)
            c.drawString(margin, y, f"{idx}. {title}")
            y -= 12

        c.drawImage(img_reader, margin, y - h, width=w, height=h, preserveAspectRatio=True)
        y -= h + 18

    c.showPage()
    c.save()
    buffer.seek(0)
    return buffer.getvalue()
app/ui/__init__.py ADDED
File without changes
app/ui/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (156 Bytes). View file
 
app/ui/__pycache__/components.cpython-311.pyc ADDED
Binary file (4.26 kB). View file
 
app/ui/__pycache__/main_app.cpython-311.pyc ADDED
Binary file (4.7 kB). View file
 
app/ui/__pycache__/panel.cpython-311.pyc ADDED
Binary file (4.73 kB). View file
 
app/ui/components.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import streamlit as st
3
+ from config.settings import MAX_POSTS_WARNING
4
+ from app.client_manager import is_logged_in, login, logout, set_client
5
+
6
def render_login_sidebar():
    """Sidebar login/logout widget for the Bluesky client.

    On successful login the client is stored via ``set_client`` and the app
    is re-run; on failure a generic error is shown without credential detail.
    """
    st.sidebar.header("Acceso a Bluesky")
    if not is_logged_in():
        handle = st.sidebar.text_input("Handle", "", key="login_handle")
        app_password = st.sidebar.text_input("App Password", type="password", key="login_password")
        if st.sidebar.button("Iniciar sesión", key="btn_login"):
            try:
                client = login(handle, app_password)
            except Exception:
                st.sidebar.error("Usuario o contraseña incorrectos.")
            else:
                # BUGFIX: the success path (and especially st.rerun(), which
                # raises a control-flow exception) is kept OUT of the try block
                # so the broad `except Exception` can never swallow it and
                # mislabel a successful login as bad credentials.
                set_client(client, handle)
                st.sidebar.success("Autenticado correctamente.")
                st.rerun()
    else:
        st.sidebar.success(f"Sesión iniciada como {st.session_state.get('bsky_handle')}")
        if st.sidebar.button("Cerrar sesión", key="btn_logout"):
            logout()
            st.rerun()
24
+
25
def render_search_form():
    """Render the sidebar search form and return the user's parameters.

    Returns:
        Tuple of (submitted, topic, days_back, max_posts, operator,
        require_confirm, confirm_heavy).
    """
    st.sidebar.header("Configuración de búsqueda")

    with st.sidebar.form("search_form", clear_on_submit=False):
        topic = st.text_input("Término", value="apagón", key="search_topic")
        days_back = st.slider("Días atrás", 7, 365, 30, key="search_days_back")
        max_posts = st.number_input("Máximo de posts", 1, 30000, 1000, key="search_max_posts")
        operator = st.radio("Operador", ["AND", "OR"], horizontal=True, key="search_operator")

        # Heavy requests need an explicit opt-in before they are allowed.
        require_confirm = max_posts > MAX_POSTS_WARNING
        confirm_heavy = True
        if require_confirm:
            st.warning(f"🚨 Has solicitado {int(max_posts)} publicaciones. Puede ralentizar el proceso.", icon="⚠️")
            confirm_heavy = st.checkbox("Entiendo el aviso y deseo continuar", key="search_confirm_heavy")

        submitted = st.form_submit_button("Buscar", use_container_width=True, type="primary")

    return (
        submitted,
        str(topic).strip(),
        int(days_back),
        int(max_posts),
        operator,
        bool(require_confirm),
        bool(confirm_heavy),
    )
app/ui/main_app.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# --- ensure project root on sys.path ---
# Makes the project root (two levels up, .../TFM) importable when Streamlit
# runs this module directly as a script.
import sys
from pathlib import Path

ROOT = Path(__file__).resolve().parents[2]  # .../TFM
_root_str = str(ROOT)
if _root_str not in sys.path:
    sys.path.insert(0, _root_str)
# ---------------------------------------
9
+
10
+ import json
11
+ import hashlib
12
+ from io import BytesIO
13
+
14
+ import altair as alt
15
+ import pandas as pd
16
+ import streamlit as st
17
+ from reportlab.lib.pagesizes import A4
18
+ from reportlab.pdfgen import canvas
19
+ from reportlab.lib.utils import ImageReader
20
+
21
+ from config.settings import MAX_POSTS_WARNING
22
+ from app.client_manager import login, set_client, logout, is_logged_in
23
+ from app.fetcher import fetch_posts, fetch_posts_or
24
+ from app.analyzer import clean_and_analyze
25
+ from app import charts # <- nuestras funciones de gráficos
26
+
27
+
28
st.set_page_config(page_title="Bluesky Explorer", page_icon="🔎", layout="wide")
st.title("🔎 Bluesky Explorer")


# -------------------------------
# Helpers
# -------------------------------
def _result_key(handle, topic, days_back, max_posts, operator):
    """Deterministic key for one (user, search-parameters) combination.

    The parameters are serialized to canonical JSON (sorted keys) and hashed
    with md5, so identical searches always map to the same session-state slot.
    """
    canonical = {
        "h": handle,
        "t": topic,
        "d": int(days_back),
        "m": int(max_posts),
        "o": operator,
    }
    payload = json.dumps(canonical, ensure_ascii=False, sort_keys=True)
    return hashlib.md5(payload.encode()).hexdigest()
42
+
43
+
44
# -------------------------------
# Login
# -------------------------------
st.sidebar.header("Acceso a Bluesky")

if not is_logged_in():
    handle = st.sidebar.text_input("Handle", "", key="login_handle")
    app_password = st.sidebar.text_input("App Password", type="password", key="login_pwd")
    if st.sidebar.button("Iniciar sesión", key="btn_login"):
        try:
            client = login(handle, app_password)
        except Exception:
            st.sidebar.error("Usuario o contraseña incorrectos.")
        else:
            # BUGFIX: st.rerun() raises a control-flow exception; it is kept
            # out of the try block so the broad `except Exception` can never
            # swallow it and report a successful login as a credential error.
            set_client(client, handle)
            st.sidebar.success("Autenticado correctamente.")
            st.rerun()
else:
    st.sidebar.success(f"Sesión iniciada como {st.session_state.get('bsky_handle')}")
    if st.sidebar.button("Cerrar sesión", key="btn_logout"):
        logout()
        st.rerun()

# Nothing below works without an authenticated client.
if not is_logged_in():
    st.stop()
68
+
69
# -------------------------------
# Parámetros
# -------------------------------
st.sidebar.header("Configuración de búsqueda")

with st.sidebar.form("search_form", clear_on_submit=False):
    topic = st.text_input("Término", value="apagón", key="param_topic")
    days_back = st.slider("Días atrás", 7, 365, 30, key="param_days")
    max_posts = st.number_input("Máximo de posts", 1, 30000, 1000, key="param_max")
    operator = st.radio("Operador", ["AND", "OR"], horizontal=True, key="param_op")

    # Heavy requests require an explicit confirmation checkbox.
    require_confirm = max_posts > MAX_POSTS_WARNING
    confirm_heavy = True
    if require_confirm:
        st.warning(
            f"🚨 Has solicitado {int(max_posts)} publicaciones. Puede ralentizar el proceso.",
            icon="⚠️",
        )
        confirm_heavy = st.checkbox("Entiendo el aviso y deseo continuar", key="param_confirm")

    submitted = st.form_submit_button("Buscar", use_container_width=True)
90
+
91
+
92
# -------------------------------
# Buscar + Analizar
# -------------------------------
if submitted:
    topic = topic.strip()

    # Validate parameters before touching the network.
    if not topic:
        st.error("Debes introducir un término de búsqueda.")
        st.stop()
    if days_back <= 0 or max_posts <= 0:
        st.error("Parámetros inválidos.")
        st.stop()
    if require_confirm and not confirm_heavy:
        st.error("Debes marcar la casilla para continuar.")
        st.stop()

    # Fetch from Bluesky (AND: one combined query; OR: one query per word).
    try:
        with st.spinner("🔎 Buscando publicaciones en Bluesky..."):
            if operator == "AND":
                df_raw = fetch_posts(topic, days_back, st.session_state["bsky_handle"], int(max_posts))
            else:
                df_raw = fetch_posts_or(topic.split(), days_back, st.session_state["bsky_handle"], int(max_posts))
    except Exception as e:
        st.error(f"Error al recuperar publicaciones: {e}")
        st.stop()

    # Per-term contribution info (attached by the fetcher, when available).
    # BUGFIX: guard against a None result — previously df_raw.attrs was read
    # before the None check below, crashing with AttributeError.
    aportes = df_raw.attrs.get("aportes") if df_raw is not None else None
    if aportes:
        st.caption("📊 Posts por término: " + ", ".join(f"{k}: {v}" for k, v in aportes.items()))

    if df_raw is None or df_raw.empty:
        st.warning("No se encontraron publicaciones.")
        st.stop()

    # Sentiment analysis + cleanup (cached downstream).
    try:
        with st.spinner("🧠 Analizando sentimiento y limpiando datos..."):
            df = clean_and_analyze(df_raw)
    except Exception as e:
        st.error(f"Error durante el análisis: {e}")
        st.stop()

    if df is None or df.empty:
        st.info("No se encontraron publicaciones válidas tras limpieza.")
        st.stop()

    # Normalize dates so charts/exports don't hit parsing issues.
    df = charts.preprocess_dates(df)

    # Persist the result under a key derived from the search parameters.
    key = _result_key(st.session_state["bsky_handle"], topic, days_back, int(max_posts), operator)
    st.session_state["current_key"] = key
    st.session_state.setdefault("results", {})[key] = df
145
+
146
+
147
# -------------------------------
# Recuperar último resultado
# -------------------------------
key = st.session_state.get("current_key")
df = st.session_state.get("results", {}).get(key) if key else None

if df is None or df.empty:
    st.info("Realiza una búsqueda para construir el panel de gráficos.")
    st.stop()

# -------------------------------
# Métricas + Datos
# -------------------------------
st.metric("Posts totales", len(df))
st.dataframe(df, use_container_width=True)
164
+
165
# -------------------------------
# Panel de gráficos
# -------------------------------
st.sidebar.header("Panel de gráficos")

# CSS tweak: keep sidebar button labels on a single line.
st.sidebar.markdown(
    """
    <style>
    section[data-testid="stSidebar"] button { white-space: nowrap; }
    </style>
    """,
    unsafe_allow_html=True,
)

# One chart list per search result, keyed by the current result key.
panels = st.session_state.setdefault("panels", {}).setdefault(key, [])

choice = st.sidebar.selectbox("Añadir gráfico", list(charts.CHARTS.keys()), key="chart_select")

col_add, col_undo, col_clear = st.sidebar.columns(3, gap="small")
add = col_add.button(" ➕ ", use_container_width=True, key="btn_add_chart")
undo = col_undo.button(" ↩️ ", use_container_width=True, key="btn_undo_chart")
clear = col_clear.button(" 🗑 ", use_container_width=True, key="btn_clear_chart")

if add:
    panels.append(choice)
if undo and panels:
    panels.pop()
if clear:
    panels.clear()
200
+
201
if panels:
    st.subheader("📊 Panel de gráficos")

    chart_pngs: list[tuple[str, BytesIO]] = []

    for i, name in enumerate(panels, start=1):
        st.markdown(f"**{i}. {name}**")
        chart_func = charts.CHARTS[name]
        chart_obj = chart_func(df)

        # 1) Any Altair spec (Chart, LayerChart, Facet, Concat, ...) —
        #    detected by duck-typing on to_dict().
        if hasattr(chart_obj, "to_dict"):
            st.altair_chart(chart_obj, use_container_width=True)
            # PNG export requires vl-convert-python.
            png = charts.export_chart_png(chart_obj, scale=2)
            if png:
                chart_pngs.append((name, png))
            else:
                st.warning(f"No se pudo exportar '{name}' como imagen (Altair).")

        # 2) Pre-rendered image (e.g. the word cloud returns BytesIO).
        elif isinstance(chart_obj, BytesIO):
            st.image(chart_obj, use_container_width=True)
            # BUGFIX: rewind the buffer — st.image may leave the read cursor
            # at EOF, which would hand ImageReader an empty stream below.
            chart_obj.seek(0)
            chart_pngs.append((name, chart_obj))

        # 3) No data for this chart.
        elif chart_obj is None:
            st.info("No hay datos suficientes para este gráfico.")

        # 4) Unexpected builder output.
        else:
            st.warning(f"Tipo de salida no soportado para '{name}'.")

    # PDF report builder (local to this section).
    def _pdf_from_images(df_data: pd.DataFrame, images: list[tuple[str, BytesIO]]) -> bytes:
        """Compose a one-column A4 PDF with a header and all chart PNGs."""
        buf = BytesIO()
        c = canvas.Canvas(buf, pagesize=A4)
        page_w, page_h = A4
        margin = 36
        y = page_h - margin

        c.setFont("Helvetica-Bold", 18)
        c.drawString(margin, y, "Informe de Análisis - Bluesky Explorer")
        y -= 22
        c.setFont("Helvetica", 11)
        c.drawString(margin, y, f"Total de publicaciones analizadas: {len(df_data)}")
        y -= 8
        c.drawString(margin, y, f"Gráficos incluidos: {len(images)}")
        y -= 20

        max_w = page_w - 2 * margin
        # BUGFIX: also bound image height; previously scale was capped only by
        # width, so a chart taller than the page overflowed the bottom margin.
        max_h = page_h - 2 * margin - 24

        for idx, (title, png_bytes) in enumerate(images, start=1):
            if y < 140:
                c.showPage()
                y = page_h - margin

            c.setFont("Helvetica-Bold", 12)
            c.drawString(margin, y, f"{idx}. {title}")
            y -= 12

            img = ImageReader(png_bytes)
            iw, ih = img.getSize()
            scale = min(max_w / iw, max_h / ih, 1.0)
            w = iw * scale
            h = ih * scale

            # Repeat the title on a fresh page when the image doesn't fit.
            if y - h < margin:
                c.showPage()
                y = page_h - margin - 12
                c.setFont("Helvetica-Bold", 12)
                c.drawString(margin, y, f"{idx}. {title}")
                y -= 12

            c.drawImage(img, margin, y - h, width=w, height=h, preserveAspectRatio=True)
            y -= h + 18

        c.showPage()
        c.save()
        buf.seek(0)
        return buf.getvalue()

    if chart_pngs:
        pdf_bytes = _pdf_from_images(df, chart_pngs)
        st.download_button(
            label="📄 Descargar informe en PDF",
            data=pdf_bytes,
            file_name="informe_bluesky.pdf",
            mime="application/pdf",
            key="btn_pdf",
        )
else:
    st.info("Selecciona un tipo de gráfico en la barra lateral y pulsa **Añadir** para construir tu panel.")
app/ui/panel.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from io import BytesIO
3
+ from typing import List, Tuple
4
+ import streamlit as st
5
+ import pandas as pd
6
+ from app.charts import CHARTS
7
+ from app.reporting import altair_to_png_bytes, build_pdf_with_images
8
+
9
def render_panel(df: pd.DataFrame, state_key: str):
    """Render the chart-panel sidebar controls and the selected charts.

    Panels are stored per ``state_key`` in ``st.session_state`` so each search
    result keeps its own chart list. Exportable charts are collected as PNG
    buffers and offered as a single PDF report.
    """
    st.sidebar.header("Panel de gráficos")

    # CSS tweak: keep sidebar button labels on a single line.
    st.sidebar.markdown(
        """
        <style>
        section[data-testid="stSidebar"] button { white-space: nowrap; }
        </style>
        """,
        unsafe_allow_html=True,
    )

    # Per-result panel state.
    st.session_state.setdefault("panels", {})
    st.session_state["panels"].setdefault(state_key, [])
    panels: list[str] = st.session_state["panels"][state_key]

    # Widget keys are suffixed with state_key so results don't collide.
    choice = st.sidebar.selectbox(
        "Añadir gráfico",
        list(CHARTS.keys()),
        key=f"chart_select_{state_key}",
    )
    col_add, col_undo, col_clear = st.sidebar.columns(3, gap="small")
    add = col_add.button(" ➕ ", use_container_width=True, key=f"btn_add_{state_key}")
    undo = col_undo.button(" ↩️ ", use_container_width=True, key=f"btn_undo_{state_key}")
    clear = col_clear.button(" 🗑 ", use_container_width=True, key=f"btn_clear_{state_key}")

    if add:
        panels.append(choice)
    if undo and panels:
        panels.pop()
    if clear:
        panels.clear()

    if not panels:
        st.info("Selecciona un tipo de gráfico en la barra lateral y pulsa **Añadir** para construir tu panel.")
        return

    st.subheader("📊 Panel de gráficos")

    chart_pngs: List[Tuple[str, BytesIO]] = []

    for i, name in enumerate(panels, start=1):
        st.markdown(f"**{i}. {name}**")
        chart_func = CHARTS[name]
        chart_obj = chart_func(df)

        if chart_obj is None:
            st.info("No hay datos suficientes para este gráfico.")
            continue

        # BUGFIX (consistency with main_app): some builders return a ready
        # PNG as BytesIO (e.g. the word cloud); previously it was passed to
        # st.altair_chart, which cannot render it.
        if isinstance(chart_obj, BytesIO):
            st.image(chart_obj, use_container_width=True)
            chart_obj.seek(0)  # rewind so the PDF builder can read it again
            chart_pngs.append((name, chart_obj))
            continue

        if not hasattr(chart_obj, "to_dict"):
            st.warning(f"Tipo de salida no soportado para '{name}'.")
            continue

        st.altair_chart(chart_obj, use_container_width=True)

        # Exportación a PNG (necesita vl-convert-python instalado)
        try:
            png_buf = altair_to_png_bytes(chart_obj, scale=2)
            chart_pngs.append((name, png_buf))
        except Exception as e:
            st.warning(f"No se pudo exportar '{name}' como imagen: {e}")

    if chart_pngs:
        pdf_bytes = build_pdf_with_images(df, chart_pngs)
        st.download_button(
            label="📄 Descargar informe en PDF",
            data=pdf_bytes,
            file_name="informe_bluesky.pdf",
            mime="application/pdf",
            key=f"btn_pdf_{state_key}",
        )
    else:
        st.info("No se pudieron exportar imágenes para el PDF.")
app/utils.py ADDED
File without changes