jccolon committed on
Commit
ffac8cc
·
verified ·
1 Parent(s): 8c270b9

Upload 21 files

Browse files
app/__init__.py ADDED
File without changes
app/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (153 Bytes). View file
 
app/__pycache__/analyzer.cpython-311.pyc ADDED
Binary file (11.9 kB). View file
 
app/__pycache__/charts.cpython-311.pyc ADDED
Binary file (18.8 kB). View file
 
app/__pycache__/client_manager.cpython-311.pyc ADDED
Binary file (2.66 kB). View file
 
app/__pycache__/fetcher.cpython-311.pyc ADDED
Binary file (6.17 kB). View file
 
app/__pycache__/reporting.cpython-311.pyc ADDED
Binary file (3.66 kB). View file
 
app/analyzer.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# app/analyzer.py
from __future__ import annotations

import re
from typing import List, Tuple

import pandas as pd
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import streamlit as st


# ======================================================================
# Config
# ======================================================================
# Model with safetensors weights (avoids the torch.load pickle risk of .bin files)
MODEL_ID = "nlptown/bert-base-multilingual-uncased-sentiment"
# Alternative model: "cardiffnlp/twitter-xlm-roberta-base-sentiment"

# Recommended batch sizes (tunable)
BATCH_SIZE_CPU = 32
BATCH_SIZE_GPU = 64

# Mapping from raw model labels to Spanish labels (kept for UI compatibility)
LABEL_MAP_ES = {
    "1 star": "muy negativo",
    "2 stars": "negativo",
    "3 stars": "neutral",
    "4 stars": "positivo",
    "5 stars": "muy positivo",
    # defensive variants in case the model returns a different singular/plural
    "1 stars": "muy negativo",
    "2 star": "negativo",
    "3 star": "neutral",
    "4 star": "positivo",
    "5 star": "muy positivo",
}

# Label groups used by the sarcasm adjustment below
NEG_SET = {"muy negativo", "negativo"}
POS_SET = {"positivo", "muy positivo"}
# ======================================================================
# Model/tokenizer loading (cached) + device selection
# ======================================================================
@st.cache_resource(show_spinner=False)
def load_sentiment_components():
    """Load the tokenizer and model once per process and pick a device.

    Cached with st.cache_resource so Streamlit reruns reuse the same
    objects. Returns a (tokenizer, model, device) triple with the model
    in eval mode and moved to CUDA when available, else CPU.
    """
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID)
    model.eval()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    return tokenizer, model, device
# ======================================================================
# Basic (fast) text cleanup
# ======================================================================
_url = re.compile(r"https?://\S+")
_mention = re.compile(r"@\w+")
_ws = re.compile(r"\s+")

def clean_text_basic(t: str) -> str:
    """Light cleanup: strip URLs and @mentions, collapse whitespace.

    Hashtags and punctuation are kept on purpose — they carry signal in
    social-media text. Empty/None-ish input yields "".
    """
    if not t:
        return ""
    # URL removal must run before mention removal (a mention inside a URL
    # is swallowed by the URL pattern first).
    for pattern in (_url, _mention):
        t = pattern.sub(" ", t)
    return _ws.sub(" ", t).strip()
# ======================================================================
# Sarcasm/irony heuristic (fast and transparent)
# ======================================================================
SARC_HASHTAGS = {
    "#sarcasmo", "#sarcasm", "#ironia", "#irony", "#sarc", "#irónica", "#irónico"
}
SARC_MARKERS = {
    "/s",  # Reddit/forum convention
    "sí claro", "claro que sí", "yeah right", "ajá",
    "gracias por nada", "qué podría salir mal", "buenísimo...", "genial...", "perfecto...",
}
SARC_EMOJIS = {"🙃", "😒", "🙄"}

def sarcasm_score(t: str) -> int:
    """Return a sarcasm signal strength in the range 0..3 (capped).

    (Fixes the previous docstring, which claimed 0/1/2.)

    Scoring: +2 per matching hashtag, +1 per textual marker, +1 if any
    sarcastic emoji appears, +1 for a quoted positive adjective.
    NOTE: hashtag matching is by substring, so e.g. "#sarcasmo" also
    matches "#sarcasm" and "#sarc" and counts several times — the final
    cap at 3 hides most of that in practice.
    """
    if not t:
        return 0
    tl = t.lower()
    score = 0
    # hashtags (strong signal)
    for tag in SARC_HASHTAGS:
        if tag in tl:
            score += 2
    # textual markers
    for m in SARC_MARKERS:
        if m in tl:
            score += 1
    # emojis (checked on the original text: emojis are case-insensitive anyway)
    if any(e in t for e in SARC_EMOJIS):
        score += 1
    # quotes around a positive adjective — very rough irony cue
    if ('"' in t or "“" in t or "”" in t) and any(p in tl for p in ("genial", "perfecto", "maravilloso")):
        score += 1
    return min(score, 3)
def adjust_with_sarcasm(label_es: str, score: int) -> str:
    """Demote a Spanish sentiment label when sarcasm cues are present.

    Conservative policy: sarcasm flips apparent positivity downward, and
    very strong sarcasm on an already-negative label emphasizes it.
    A score of 0 (or less) leaves the label untouched.
    """
    if score <= 0:
        return label_es
    if label_es in {"positivo", "muy positivo"}:
        # Model says positive but sarcasm was detected: demote.
        return "negativo" if score >= 2 else "neutral"
    if label_es == "neutral":
        # Only strong sarcasm turns neutral into negative.
        return "negativo" if score >= 2 else "neutral"
    if label_es in {"muy negativo", "negativo"} and score >= 3:
        # Already negative + maximal sarcasm: emphasize.
        return "muy negativo"
    return label_es
# ======================================================================
# Pure PyTorch inference (no numpy)
# ======================================================================
def _predict_batch(texts: List[str], max_length: int = 256) -> Tuple[List[str], List[Tuple[float, float, float]], List[int]]:
    """
    Run batched sentiment inference over *texts*.

    Returns:
    - labels_raw: raw model labels ("1 star" ... "5 stars")
    - probs_agg: per-text aggregated (p_neg, p_neu, p_pos) probabilities
    - sarc_scores: sarcasm score per text (kept for traceability)
    """
    tokenizer, model, device = load_sentiment_components()

    # NOTE(review): re-checks CUDA availability instead of reusing `device`;
    # result is the same since `device` was chosen the same way.
    if torch.cuda.is_available():
        bs = BATCH_SIZE_GPU
    else:
        bs = BATCH_SIZE_CPU

    labels_raw: List[str] = []
    probs_agg: List[Tuple[float, float, float]] = []
    sarc_scores: List[int] = []

    with torch.inference_mode():
        for i in range(0, len(texts), bs):
            chunk = texts[i:i+bs]
            enc = tokenizer(
                chunk,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt",
            )
            enc = {k: v.to(device) for k, v in enc.items()}
            out = model(**enc)  # logits shape: [B, 5] for this model
            probs = F.softmax(out.logits, dim=-1)  # [B, 5]

            # aggregate star buckets: neg = 1+2 stars, neu = 3, pos = 4+5
            # index order for nlptown: 0..4 = '1 star'..'5 stars'
            p_neg = probs[:, 0] + probs[:, 1]
            p_neu = probs[:, 2]
            p_pos = probs[:, 3] + probs[:, 4]

            top_idx = torch.argmax(probs, dim=-1).tolist()
            labels_raw.extend([model.config.id2label[int(j)] for j in top_idx])

            probs_agg.extend([(float(a), float(b), float(c)) for a, b, c in zip(p_neg, p_neu, p_pos)])

            # sarcasm score for each text of the batch
            sarc_scores.extend([sarcasm_score(t) for t in chunk])

    return labels_raw, probs_agg, sarc_scores
# ======================================================================
# Main API for the app
# ======================================================================
@st.cache_data(show_spinner=False)
def clean_and_analyze(
    df: pd.DataFrame,
    min_chars: int = 20,
    dedup_cols: List[str] | None = None,
    use_clean_text: bool = True,
) -> pd.DataFrame:
    """
    Clean, deduplicate and enrich the DataFrame, adding sentiment columns.

    - min_chars=0 and dedup_cols=['uri'] → "maximum volume" mode.
    - use_clean_text=True → run the model on cleaned text (more stable output).

    Cached with st.cache_data, so identical inputs skip re-inference.
    Expects a 'texto' column; returns the input unchanged when empty/None.
    """
    if df is None or df.empty:
        return df

    d = df.copy()

    # --- Deduplication ---
    if dedup_cols:
        d = d.drop_duplicates(subset=dedup_cols)

    # --- Basic text handling + length filter ---
    d["texto"] = d["texto"].fillna("")
    if min_chars and min_chars > 0:
        d = d[d["texto"].str.len() >= min_chars]
    if d.empty:
        return d

    # Keep the original text and build a cleaned version for the model
    d["texto_raw"] = d["texto"]
    d["texto_clean"] = d["texto_raw"].map(clean_text_basic) if use_clean_text else d["texto_raw"]

    # --- Quick enrichment (word count, URL flag, hashtags, mentions) ---
    d["n_palabras"] = d["texto_raw"].str.split().str.len()
    d["has_url"] = d["texto_raw"].str.contains(r"https?://", na=False)
    d["hashtags"] = d["texto_raw"].str.findall(r"#\w+")
    d["mentions"] = d["texto_raw"].str.findall(r"@\w+")

    # --- Inference ---
    texts_for_model = d["texto_clean"].astype(str).tolist()
    labels_raw, probs_agg, sarc_scores = _predict_batch(texts_for_model, max_length=256)
    d["sentiment"] = labels_raw
    d["p_neg"], d["p_neu"], d["p_pos"] = zip(*probs_agg)
    d["sarcasm_score"] = sarc_scores

    # --- Spanish labels + sarcasm adjustment ---
    d["sent_desc"] = d["sentiment"].map(LABEL_MAP_ES).fillna("neutral")
    d["sent_desc_adj"] = [adjust_with_sarcasm(lbl, sc) for lbl, sc in zip(d["sent_desc"], d["sarcasm_score"])]

    # For compatibility with the rest of the app, expose 'sent_desc' as the final label:
    d["sent_desc"] = d["sent_desc_adj"]
    d = d.drop(columns=["sent_desc_adj"], errors="ignore")

    # Suggested column order
    cols_order = [
        "uri", "autor", "fecha", "texto_raw", "texto_clean",
        "sentiment", "sent_desc", "p_neg", "p_neu", "p_pos", "sarcasm_score",
        "n_palabras", "has_url", "hashtags", "mentions",
    ]
    # Also keep any original columns not listed above
    cols_final = [c for c in cols_order if c in d.columns] + [c for c in d.columns if c not in cols_order]
    d = d[cols_final]

    return d
app/charts.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import altair as alt
3
+ from io import BytesIO
4
+ from typing import Optional
5
+ import re
6
+ from collections import Counter
7
+ from io import BytesIO
8
+ from wordcloud import WordCloud
9
# =========================
# Helpers
# =========================
def preprocess_dates(df: pd.DataFrame) -> pd.DataFrame:
    """Normalize the 'fecha' column.

    - parses to UTC datetimes and strips the timezone (naive UTC)
    - drops rows whose date could not be parsed
    - adds 'date_day' (midnight-normalized datetime64[ns])
    - adds 'date_iso' (YYYY-MM-DD string)
    """
    out = df.copy()
    parsed = pd.to_datetime(out["fecha"], utc=True, errors="coerce")
    out["fecha"] = parsed.dt.tz_convert("UTC").dt.tz_localize(None)
    out = out.dropna(subset=["fecha"])
    out["date_day"] = out["fecha"].dt.normalize()
    out["date_iso"] = out["date_day"].dt.strftime("%Y-%m-%d")
    return out
def export_chart_png(chart: alt.Chart, scale: int = 2) -> Optional[BytesIO]:
    """Render an Altair chart to an in-memory PNG.

    Returns None on any failure (e.g. vl-convert-python not installed).
    """
    out = BytesIO()
    try:
        chart.save(out, format="png", scale=scale)
        out.seek(0)
    except Exception:
        return None
    return out
# =========================
# Charts
# =========================
def chart_posts_diario_ma(dfin: pd.DataFrame, window: int = 7) -> alt.Chart:
    """Daily post counts with a rolling-mean overlay (dashed line)."""
    daily = dfin[["date_day"]].groupby("date_day").size().reset_index(name="posts")
    daily["MA"] = daily["posts"].rolling(window, min_periods=1).mean()

    line_posts = (
        alt.Chart(daily)
        .mark_line(point=False)
        .encode(
            x=alt.X("date_day:T", title="Fecha"),
            y=alt.Y("posts:Q", title="Posts"),
            tooltip=[alt.Tooltip("date_day:T", title="Fecha"), "posts:Q"],
        )
    )
    line_ma = (
        alt.Chart(daily)
        .mark_line(strokeDash=[4, 3])
        .encode(
            x="date_day:T",
            y=alt.Y("MA:Q", title="Media móvil"),
            tooltip=[alt.Tooltip("date_day:T", title="Fecha"), "MA:Q"],
        )
    )
    return (line_posts + line_ma).properties(height=260)
def chart_sentimiento_barras(dfin: pd.DataFrame) -> alt.Chart:
    """
    Bar chart of post counts per sentiment category.

    Fix: the preferred ordering used capitalized labels ("Muy negativo")
    while the analyzer emits lowercase ones ("muy negativo"), so the
    semantic ordering never actually applied. Label matching is now
    case-insensitive; the original label spelling is kept for display.
    """
    if "sent_desc" not in dfin.columns:
        # Nothing to plot
        return alt.Chart(pd.DataFrame({"sentimiento": [], "posts": []})).mark_bar()

    # Preferred semantic order (both 'neutral' variants included)
    order_pref = ["Muy negativo", "Negativo", "Neutral o mixto", "Neutral", "Positivo", "Muy positivo"]

    # Count per label, stable order
    vc = (
        dfin["sent_desc"]
        .fillna("Desconocido")
        .value_counts()
        .rename_axis("sentimiento")
        .reset_index(name="posts")
    )

    # Case-insensitive match so lowercase labels from the analyzer still
    # pick up the preferred order; values keep their original spelling.
    by_lower = {str(s).lower(): s for s in vc["sentimiento"].values}
    present = [by_lower[c.lower()] for c in order_pref if c.lower() in by_lower]
    if not present:
        # No known label at all: show whatever exists, in count order
        present = list(vc["sentimiento"].values)

    vc["sentimiento"] = pd.Categorical(vc["sentimiento"], categories=present, ordered=True)
    vc = vc.sort_values("sentimiento")

    return (
        alt.Chart(vc)
        .mark_bar()
        .encode(
            x=alt.X("sentimiento:N", sort=present, title="Sentimiento"),
            y=alt.Y("posts:Q", title="Posts"),
            tooltip=["sentimiento:N", "posts:Q"],
        )
        .properties(height=260)
    )
def chart_sentimiento_apilado(dfin: pd.DataFrame) -> alt.Chart:
    """Stacked area of daily post counts, split by sentiment label."""
    counts = (
        dfin[["date_day", "sent_desc"]]
        .groupby(["date_day", "sent_desc"])
        .size()
        .reset_index(name="posts")
    )
    return (
        alt.Chart(counts)
        .mark_area()
        .encode(
            x=alt.X("date_day:T", title="Fecha"),
            y=alt.Y("posts:Q", stack="zero", title="Posts"),
            color=alt.Color("sent_desc:N", title="Sentimiento"),
            tooltip=[alt.Tooltip("date_day:T", title="Fecha"), "sent_desc:N", "posts:Q"],
        )
        .properties(height=260)
    )
def chart_heatmap_dia_hora(dfin: pd.DataFrame) -> alt.Chart:
    """Weekday × hour heatmap of posting activity."""
    weekday_order = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    frame = pd.DataFrame(
        {"dow": dfin["fecha"].dt.day_name(), "hour": dfin["fecha"].dt.hour}
    )
    frame["dow"] = pd.Categorical(frame["dow"], categories=weekday_order, ordered=True)
    counts = frame.groupby(["dow", "hour"]).size().reset_index(name="posts")

    return (
        alt.Chart(counts)
        .mark_rect()
        .encode(
            x=alt.X("hour:O", title="Hora"),
            y=alt.Y("dow:N", title="Día de semana", sort=weekday_order),
            color=alt.Color("posts:Q", title="Posts"),
            tooltip=["dow:N", "hour:O", "posts:Q"],
        )
        .properties(height=260)
    )
def chart_top_hashtags(dfin: pd.DataFrame, topn: int = 20):
    """Horizontal bars of the most used hashtags; None when unavailable."""
    if "hashtags" not in dfin.columns:
        return None
    exploded = dfin[["hashtags"]].explode("hashtags").dropna(subset=["hashtags"])
    if exploded.empty:
        return None
    counts = (
        exploded["hashtags"]
        .value_counts()
        .head(topn)
        .rename_axis("hashtag")
        .reset_index(name="conteo")
    )
    return (
        alt.Chart(counts)
        .mark_bar()
        .encode(
            x="conteo:Q",
            y=alt.Y("hashtag:N", sort="-x"),
            tooltip=["hashtag:N", "conteo:Q"],
        )
        .properties(height=max(260, 24 * len(counts) + 20))
    )
def chart_sent_por_hashtag(dfin: pd.DataFrame, topn: int = 15):
    """Sentiment breakdown for the top-N hashtags; None when unavailable."""
    if "hashtags" not in dfin.columns:
        return None
    exploded = dfin[["hashtags", "sent_desc"]].explode("hashtags").dropna(subset=["hashtags"])
    if exploded.empty:
        return None
    top_tags = exploded["hashtags"].value_counts().head(topn).index
    grouped = (
        exploded[exploded["hashtags"].isin(top_tags)]
        .groupby(["hashtags", "sent_desc"])
        .size()
        .reset_index(name="posts")
    )
    return (
        alt.Chart(grouped)
        .mark_bar()
        .encode(
            y=alt.Y("hashtags:N", title="Hashtag", sort="-x"),
            x=alt.X("posts:Q", title="Posts"),
            color=alt.Color("sent_desc:N", title="Sentimiento"),
            tooltip=["hashtags:N", "sent_desc:N", "posts:Q"],
        )
        .properties(height=28 * len(top_tags) + 20)
    )
def chart_top_menciones(dfin: pd.DataFrame, topn: int = 20):
    """Horizontal bars of the most mentioned handles; None when unavailable."""
    if "mentions" not in dfin.columns:
        return None
    exploded = dfin[["mentions"]].explode("mentions").dropna(subset=["mentions"])
    if exploded.empty:
        return None
    counts = (
        exploded["mentions"]
        .value_counts()
        .head(topn)
        .rename_axis("mención")
        .reset_index(name="conteo")
    )
    return (
        alt.Chart(counts)
        .mark_bar()
        .encode(
            x="conteo:Q",
            y=alt.Y("mención:N", sort="-x"),
            tooltip=["mención:N", "conteo:Q"],
        )
        .properties(height=28 * len(counts) + 20)
    )
def chart_hist_longitud(dfin: pd.DataFrame):
    """Histogram of post length in words; None when 'n_palabras' is missing."""
    if "n_palabras" not in dfin.columns:
        return None
    data = dfin[["n_palabras"]].copy()
    return (
        alt.Chart(data)
        .mark_bar()
        .encode(
            x=alt.X("n_palabras:Q", bin=alt.Bin(maxbins=30), title="Número de palabras"),
            y=alt.Y("count():Q", title="Posts"),
            tooltip=[alt.Tooltip("count():Q", title="Posts")],
        )
        .properties(height=260)
    )
def chart_top_dominios(dfin: pd.DataFrame, topn: int = 20):
    """Horizontal bars of the most frequent URL domains found in 'texto'.

    Fix: unlike its sibling charts, this one crashed with a KeyError when
    the 'texto' column was missing — it now returns None in that case,
    consistent with the other optional charts in this module.
    """
    if "texto" not in dfin.columns:
        return None
    s = dfin["texto"].str.extractall(r"https?://([^/\s]+)")[0]
    if s.empty:
        return None
    vc = s.value_counts().head(topn).rename_axis("dominio").reset_index(name="conteo")
    return (
        alt.Chart(vc)
        .mark_bar()
        .encode(
            x="conteo:Q",
            y=alt.Y("dominio:N", sort="-x"),
            tooltip=["dominio:N", "conteo:Q"],
        )
        .properties(height=28 * len(vc) + 20)
    )
def chart_nube_palabras(dfin: pd.DataFrame, max_words: int = 150) -> BytesIO | None:
    """
    Build a word cloud (PNG in memory) from dfin['texto'].

    - Strips URLs; drops '#'/'@' sigils but keeps the word itself.
    - Filters tokens shorter than 3 chars and basic ES/EN stopwords.
    - Returns a BytesIO holding the PNG, or None when no usable text remains.
    """
    if "texto" not in dfin.columns or dfin.empty:
        return None

    # --- gather raw text ---
    raw_texts = dfin["texto"].dropna().astype(str).tolist()
    if not raw_texts:
        return None

    corpus = " ".join(raw_texts)

    # remove URLs; keep hashtag/mention words, just drop the sigil
    corpus = re.sub(r"https?://\S+", " ", corpus)
    corpus = re.sub(r"[@#]", " ", corpus)

    # basic tokenization (letters incl. Spanish accents, digits)
    tokens = re.findall(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ0-9]+", corpus)

    # simple ES/EN stopword lists (extend as needed)
    stop_es = {
        "de","la","que","el","en","y","a","los","del","se","las","por","un","para","con","no",
        "una","su","al","lo","como","más","pero","sus","le","ya","o","fue","este","ha","sí",
        "porque","esta","son","entre","cuando","muy","sin","sobre","también","me","hasta",
        "hay","donde","quien","desde","todo","nos","durante","todos","uno","les","ni","contra",
        "otros","ese","eso","ante","ellos","e","esto","mí","antes","algunos","qué","unos",
        "yo","otro","otras","otra","él","tanto","esa","estos","mucho","quienes","nada","muchos",
        "cual","poco","ella","estar","estas","algunas","algo","nosotros","mi","mis","tú","te",
        "ti","tu","tus","ellas","nosotras","vosotros","vosotras","os","mío","mía","míos","mías",
        "tuyo","tuya","tuyos","tuyas","suyo","suya","suyos","suyas","nuestro","nuestra","nuestros",
        "nuestras","vuestro","vuestra","vuestros","vuestras","esos","esas","estoy","estás","está",
        "estamos","estáis","están","esté","estés","estemos","estéis","estén","estaré","estarás",
        "estará","estaremos","estaréis","estarán"
    }
    stop_en = {
        "the","a","an","and","or","but","to","of","for","in","on","at","by","with","from","as",
        "is","are","was","were","be","been","being","it","its","this","that","these","those",
        "i","you","he","she","we","they","me","him","her","us","them","my","your","his","their",
        "our","mine","yours","hers","theirs","ours","not","no","so","if","than","then","too",
        "very","can","could","should","would","will","just","also","into","over","under",
    }
    stops = {w.lower() for w in (stop_es | stop_en)}

    # keep only meaningful tokens
    kept = [tok for tok in tokens if len(tok) >= 3 and tok.lower() not in stops]
    if not kept:
        return None

    # frequencies
    freqs = Counter(tok.lower() for tok in kept)

    # build the cloud
    cloud = WordCloud(
        width=1400,
        height=800,
        background_color="white",
        prefer_horizontal=0.9,
        collocations=False,  # no bi/trigram grouping — better token control
        max_words=max_words,
    ).generate_from_frequencies(freqs)

    # serialize to PNG in memory
    png = BytesIO()
    cloud.to_image().save(png, format="PNG")
    png.seek(0)
    return png
# Registry of chart builders shown in the UI: display label -> callable(dfin).
# Most entries return an alt.Chart (or None when a required column is missing);
# the word cloud returns a PNG BytesIO instead.
CHARTS = {
    "📈 Posts por día + media móvil": chart_posts_diario_ma,
    "📊 Recuento por sentimiento": chart_sentimiento_barras,
    "🧭 Sentimiento (área apilada)": chart_sentimiento_apilado,
    "☁️ Nube de palabras": chart_nube_palabras,  # returns PNG bytes, not a Chart
    "🗓️ Calor Día×Hora": chart_heatmap_dia_hora,
    "🏷️ Top hashtags": chart_top_hashtags,
    "🏷️ Hashtag × Sentimiento": chart_sent_por_hashtag,
    "👤 Top menciones": chart_top_menciones,
    "📏 Longitud del texto": chart_hist_longitud,
    "🔗 Top dominios (URLs)": chart_top_dominios,
}
app/client_manager.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from atproto import Client
3
# ========= Session management (per-user login) =========

def _get_client():
    """Return the Bluesky client stored in the Streamlit session, or None."""
    return st.session_state.get("bsky_client")
def _set_client(c, handle):
    """Store the client and its handle in the Streamlit session."""
    st.session_state["bsky_client"] = c
    st.session_state["bsky_handle"] = handle
def _is_logged_in():
    """Return True when a Bluesky client is present in the session."""
    return st.session_state.get("bsky_client") is not None
def _logout():
    """Log out by removing the client and handle from the session."""
    for key in ("bsky_client", "bsky_handle"):
        st.session_state.pop(key, None)
def login_bsky(handle: str, app_password: str):
    """
    Log in to Bluesky with the user's handle and App Password.

    Uses the default service endpoint (https://bsky.social). Raises
    whatever the atproto client raises on bad credentials.
    """
    client = Client()  # default service: https://bsky.social
    client.login(handle, app_password)
    return client
# ---- Public aliases for compatibility with the rest of the code ----
def get_client():
    """Public alias of _get_client()."""
    return _get_client()
def set_client(c, handle):
    """Public alias of _set_client()."""
    return _set_client(c, handle)
def is_logged_in():
    """Public alias of _is_logged_in()."""
    return _is_logged_in()
def logout():
    """Public alias of _logout()."""
    return _logout()
def login(handle: str, app_password: str):
    """Public alias of login_bsky()."""
    return login_bsky(handle, app_password)
app/fetcher.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from datetime import datetime, timedelta, timezone
3
+ from typing import Iterable, Optional, List
4
+
5
+ import pandas as pd
6
+ from atproto import models
7
+
8
+ from app.client_manager import get_client
9
+
10
+
11
+ def _iso_to_dt_utc_naive(iso: str) -> Optional[datetime]:
12
+ """Convierte ISO (posible 'Z') a datetime naive en UTC."""
13
+ if not iso:
14
+ return None
15
+ try:
16
+ iso = iso.replace("Z", "+00:00")
17
+ aware = datetime.fromisoformat(iso)
18
+ return aware.astimezone(timezone.utc).replace(tzinfo=None)
19
+ except Exception:
20
+ return None
def _search_one_term(
    term: str,
    days_back: int,
    max_posts: Optional[int],
) -> pd.DataFrame:
    """
    Search posts for a single term via the official API (app.bsky.feed.search_posts).

    Returns a DataFrame with columns: texto, autor, fecha (naive UTC datetime), uri.
    Honors the age cutoff (days_back) and the max_posts cap; paginates with
    the API cursor until one of the stop conditions hits.

    Raises:
        RuntimeError: when no Bluesky session is active.
    """
    client = get_client()
    if client is None:
        raise RuntimeError("No hay sesión de Bluesky.")

    cutoff_aware = datetime.now(timezone.utc) - timedelta(days=days_back)

    rows: List[dict] = []
    cursor = None

    while True:
        remaining = None if max_posts is None else max(max_posts - len(rows), 0)
        if remaining == 0:
            break
        # API page size caps at 100
        limit = 100 if remaining is None else max(1, min(100, remaining))

        params = models.AppBskyFeedSearchPosts.Params(q=term, limit=limit, cursor=cursor)
        resp = client.app.bsky.feed.search_posts(params=params)

        posts = resp.posts or []
        if not posts:
            break

        # Stop paginating this term once a post older than the cutoff shows up
        stop_for_age = False

        for p in posts:
            created_raw = getattr(p.record, "created_at", "") or ""
            # compare against the cutoff using an AWARE datetime
            try:
                aware = datetime.fromisoformat(created_raw.replace("Z", "+00:00"))
                if aware < cutoff_aware:
                    stop_for_age = True
                    break
            except Exception:
                # unparseable timestamp: keep the post, skip the age check
                pass

            created_dt = _iso_to_dt_utc_naive(created_raw)

            rows.append(
                {
                    "texto": getattr(p.record, "text", "") or "",
                    "autor": getattr(p.author, "handle", "") or "",
                    "fecha": created_dt,
                    "uri": getattr(p, "uri", "") or "",
                }
            )

            if max_posts is not None and len(rows) >= max_posts:
                stop_for_age = True
                break

        if stop_for_age:
            break

        cursor = resp.cursor
        if not cursor:
            break

    return pd.DataFrame(rows, columns=["texto", "autor", "fecha", "uri"])
def fetch_posts(topic: str, days_back: int, user_handle: str, max_posts: Optional[int] = None) -> pd.DataFrame:
    """
    AND-style search (the API's 'q' matching already handles the term).

    user_handle is unused here; it is kept for signature compatibility
    with fetch_posts_or and the calling UI.
    """
    return _search_one_term(topic, days_back, max_posts)
def fetch_posts_or(terms: Iterable[str], days_back: int, user_handle: str, max_posts: Optional[int] = None) -> pd.DataFrame:
    """
    OR search: splits the quota across terms, merges results and drops duplicates.

    Fix: removed the unused exception binding (`except Exception as e`).

    Args:
        terms: search terms; blank entries are ignored.
        days_back: age cutoff in days, applied per term.
        user_handle: unused; kept for signature compatibility with callers.
        max_posts: overall cap across all terms (None = unlimited).

    Returns:
        DataFrame with columns texto/autor/fecha/uri. Per-term result counts
        are exposed in df.attrs["aportes"] so the UI can display them.
    """
    terms = [t.strip() for t in terms if t and t.strip()]
    if not terms:
        return pd.DataFrame(columns=["texto", "autor", "fecha", "uri"])

    remaining = max_posts
    frames: List[pd.DataFrame] = []
    aportes = {}

    for i, term in enumerate(terms):
        limit_i = None
        if remaining is not None:
            # spread what is left over the terms still to run (ceiling division)
            limit_i = math.ceil(remaining / (len(terms) - i))

        try:
            df_i = _search_one_term(term, days_back, limit_i)
        except Exception:
            # best-effort: a failing term contributes an empty frame; the app
            # surfaces the shortfall through the "aportes" counts below
            df_i = pd.DataFrame(columns=["texto", "autor", "fecha", "uri"])

        aportes[term] = len(df_i)
        frames.append(df_i)

        if remaining is not None:
            remaining = max(0, remaining - len(df_i))

    df = pd.concat(frames, ignore_index=True)
    df = df.drop_duplicates(subset=["uri", "texto", "autor"])

    if max_posts is not None:
        df = df.head(max_posts)

    # per-term contribution counts, stored as a DataFrame attribute for the UI
    df.attrs["aportes"] = aportes
    return df
app/reporting.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app/reporting.py
2
+ from __future__ import annotations
3
+ from io import BytesIO
4
+ from typing import List, Tuple
5
+ import altair as alt
6
+ import pandas as pd
7
+ from reportlab.lib.pagesizes import A4
8
+ from reportlab.pdfgen import canvas
9
+ from reportlab.lib.utils import ImageReader
10
def altair_to_png_bytes(chart: alt.Chart, scale: int = 2) -> BytesIO:
    """Render an Altair chart into an in-memory PNG (needs vl-convert-python)."""
    out = BytesIO()
    chart.save(out, format="png", scale=scale)
    out.seek(0)
    return out
def build_pdf_with_images(df: pd.DataFrame, images: List[Tuple[str, BytesIO]]) -> bytes:
    """Assemble an A4 PDF report: summary header followed by titled chart images.

    Args:
        df: analyzed posts; only len(df) is used (for the summary line).
        images: (title, png_bytes) pairs, drawn in order and scaled down to
            fit the page width (never upscaled).

    Returns:
        The finished PDF as raw bytes.
    """
    buffer = BytesIO()
    c = canvas.Canvas(buffer, pagesize=A4)
    page_w, page_h = A4
    margin = 36
    y = page_h - margin  # cursor moves downward as content is drawn

    # Report header
    c.setFont("Helvetica-Bold", 18)
    c.drawString(margin, y, "Informe de Análisis - Bluesky Explorer")
    y -= 22
    c.setFont("Helvetica", 11)
    c.drawString(margin, y, f"Total de publicaciones analizadas: {len(df)}")
    y -= 8
    c.drawString(margin, y, f"Gráficos incluidos: {len(images)}")
    y -= 20

    max_w = page_w - 2 * margin

    for idx, (title, png_bytes) in enumerate(images, start=1):
        # Start a new page when too little room is left for title + image
        if y < 140:
            c.showPage()
            y = page_h - margin

        c.setFont("Helvetica-Bold", 12)
        c.drawString(margin, y, f"{idx}. {title}")
        y -= 12

        img_reader = ImageReader(png_bytes)
        iw, ih = img_reader.getSize()
        scale = min(max_w / iw, 1)  # shrink to page width, never enlarge
        w = iw * scale
        h = ih * scale

        # If the image would overflow, repeat the title on a fresh page
        if y - h < margin:
            c.showPage()
            y = page_h - margin - 12
            c.setFont("Helvetica-Bold", 12)
            c.drawString(margin, y, f"{idx}. {title}")
            y -= 12

        c.drawImage(img_reader, margin, y - h, width=w, height=h, preserveAspectRatio=True)
        y -= h + 18

    c.showPage()
    c.save()
    buffer.seek(0)
    return buffer.getvalue()
app/ui/__init__.py ADDED
File without changes
app/ui/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (156 Bytes). View file
 
app/ui/__pycache__/components.cpython-311.pyc ADDED
Binary file (4.26 kB). View file
 
app/ui/__pycache__/main_app.cpython-311.pyc ADDED
Binary file (4.7 kB). View file
 
app/ui/__pycache__/panel.cpython-311.pyc ADDED
Binary file (4.73 kB). View file
 
app/ui/components.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import streamlit as st
3
+ from config.settings import MAX_POSTS_WARNING
4
+ from app.client_manager import is_logged_in, login, logout, set_client
5
+
6
def render_login_sidebar():
    """Sidebar login/logout widget for the Bluesky client.

    On successful login the client is stored via ``set_client`` and the app
    is re-run; on failure a generic error is shown without credential detail.
    """
    st.sidebar.header("Acceso a Bluesky")
    if not is_logged_in():
        handle = st.sidebar.text_input("Handle", "", key="login_handle")
        app_password = st.sidebar.text_input("App Password", type="password", key="login_password")
        if st.sidebar.button("Iniciar sesión", key="btn_login"):
            try:
                client = login(handle, app_password)
            except Exception:
                st.sidebar.error("Usuario o contraseña incorrectos.")
            else:
                # BUGFIX: the success path (and especially st.rerun(), which
                # raises a control-flow exception) is kept OUT of the try block
                # so the broad `except Exception` can never swallow it and
                # mislabel a successful login as bad credentials.
                set_client(client, handle)
                st.sidebar.success("Autenticado correctamente.")
                st.rerun()
    else:
        st.sidebar.success(f"Sesión iniciada como {st.session_state.get('bsky_handle')}")
        if st.sidebar.button("Cerrar sesión", key="btn_logout"):
            logout()
            st.rerun()
24
+
25
def render_search_form():
    """Render the sidebar search form and return the user's parameters.

    Returns:
        Tuple of (submitted, topic, days_back, max_posts, operator,
        require_confirm, confirm_heavy).
    """
    st.sidebar.header("Configuración de búsqueda")

    with st.sidebar.form("search_form", clear_on_submit=False):
        topic = st.text_input("Término", value="apagón", key="search_topic")
        days_back = st.slider("Días atrás", 7, 365, 30, key="search_days_back")
        max_posts = st.number_input("Máximo de posts", 1, 30000, 1000, key="search_max_posts")
        operator = st.radio("Operador", ["AND", "OR"], horizontal=True, key="search_operator")

        # Heavy requests need an explicit opt-in before they are allowed.
        require_confirm = max_posts > MAX_POSTS_WARNING
        confirm_heavy = True
        if require_confirm:
            st.warning(f"🚨 Has solicitado {int(max_posts)} publicaciones. Puede ralentizar el proceso.", icon="⚠️")
            confirm_heavy = st.checkbox("Entiendo el aviso y deseo continuar", key="search_confirm_heavy")

        submitted = st.form_submit_button("Buscar", use_container_width=True, type="primary")

    return (
        submitted,
        str(topic).strip(),
        int(days_back),
        int(max_posts),
        operator,
        bool(require_confirm),
        bool(confirm_heavy),
    )
app/ui/main_app.py ADDED
@@ -0,0 +1,294 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# --- ensure project root on sys.path ---
# Makes the project root (two levels up, .../TFM) importable when Streamlit
# runs this module directly as a script.
import sys
from pathlib import Path

ROOT = Path(__file__).resolve().parents[2]  # .../TFM
_root_str = str(ROOT)
if _root_str not in sys.path:
    sys.path.insert(0, _root_str)
# ---------------------------------------
9
+
10
+ import json
11
+ import hashlib
12
+ from io import BytesIO
13
+
14
+ import altair as alt
15
+ import pandas as pd
16
+ import streamlit as st
17
+ from reportlab.lib.pagesizes import A4
18
+ from reportlab.pdfgen import canvas
19
+ from reportlab.lib.utils import ImageReader
20
+
21
+ from config.settings import MAX_POSTS_WARNING
22
+ from app.client_manager import login, set_client, logout, is_logged_in
23
+ from app.fetcher import fetch_posts, fetch_posts_or
24
+ from app.analyzer import clean_and_analyze
25
+ from app import charts # <- nuestras funciones de gráficos
26
+
27
+
28
st.set_page_config(page_title="Bluesky Explorer", page_icon="🔎", layout="wide")
st.title("🔎 Bluesky Explorer")


# -------------------------------
# Helpers
# -------------------------------
def _result_key(handle, topic, days_back, max_posts, operator):
    """Deterministic key for one (user, search-parameters) combination.

    The parameters are serialized to canonical JSON (sorted keys) and hashed
    with md5, so identical searches always map to the same session-state slot.
    """
    canonical = {
        "h": handle,
        "t": topic,
        "d": int(days_back),
        "m": int(max_posts),
        "o": operator,
    }
    payload = json.dumps(canonical, ensure_ascii=False, sort_keys=True)
    return hashlib.md5(payload.encode()).hexdigest()
42
+
43
+
44
# -------------------------------
# Login
# -------------------------------
st.sidebar.header("Acceso a Bluesky")

if not is_logged_in():
    handle = st.sidebar.text_input("Handle", "", key="login_handle")
    app_password = st.sidebar.text_input("App Password", type="password", key="login_pwd")
    if st.sidebar.button("Iniciar sesión", key="btn_login"):
        try:
            client = login(handle, app_password)
        except Exception:
            st.sidebar.error("Usuario o contraseña incorrectos.")
        else:
            # BUGFIX: st.rerun() raises a control-flow exception; it is kept
            # out of the try block so the broad `except Exception` can never
            # swallow it and report a successful login as a credential error.
            set_client(client, handle)
            st.sidebar.success("Autenticado correctamente.")
            st.rerun()
else:
    st.sidebar.success(f"Sesión iniciada como {st.session_state.get('bsky_handle')}")
    if st.sidebar.button("Cerrar sesión", key="btn_logout"):
        logout()
        st.rerun()

# Nothing below works without an authenticated client.
if not is_logged_in():
    st.stop()
68
+
69
# -------------------------------
# Parámetros
# -------------------------------
st.sidebar.header("Configuración de búsqueda")

with st.sidebar.form("search_form", clear_on_submit=False):
    topic = st.text_input("Término", value="apagón", key="param_topic")
    days_back = st.slider("Días atrás", 7, 365, 30, key="param_days")
    max_posts = st.number_input("Máximo de posts", 1, 30000, 1000, key="param_max")
    operator = st.radio("Operador", ["AND", "OR"], horizontal=True, key="param_op")

    # Heavy requests require an explicit confirmation checkbox.
    require_confirm = max_posts > MAX_POSTS_WARNING
    confirm_heavy = True
    if require_confirm:
        st.warning(
            f"🚨 Has solicitado {int(max_posts)} publicaciones. Puede ralentizar el proceso.",
            icon="⚠️",
        )
        confirm_heavy = st.checkbox("Entiendo el aviso y deseo continuar", key="param_confirm")

    submitted = st.form_submit_button("Buscar", use_container_width=True)
90
+
91
+
92
# -------------------------------
# Buscar + Analizar
# -------------------------------
if submitted:
    topic = topic.strip()

    # Validate parameters before touching the network.
    if not topic:
        st.error("Debes introducir un término de búsqueda.")
        st.stop()
    if days_back <= 0 or max_posts <= 0:
        st.error("Parámetros inválidos.")
        st.stop()
    if require_confirm and not confirm_heavy:
        st.error("Debes marcar la casilla para continuar.")
        st.stop()

    # Fetch from Bluesky (AND: one combined query; OR: one query per word).
    try:
        with st.spinner("🔎 Buscando publicaciones en Bluesky..."):
            if operator == "AND":
                df_raw = fetch_posts(topic, days_back, st.session_state["bsky_handle"], int(max_posts))
            else:
                df_raw = fetch_posts_or(topic.split(), days_back, st.session_state["bsky_handle"], int(max_posts))
    except Exception as e:
        st.error(f"Error al recuperar publicaciones: {e}")
        st.stop()

    # Per-term contribution info (attached by the fetcher, when available).
    # BUGFIX: guard against a None result — previously df_raw.attrs was read
    # before the None check below, crashing with AttributeError.
    aportes = df_raw.attrs.get("aportes") if df_raw is not None else None
    if aportes:
        st.caption("📊 Posts por término: " + ", ".join(f"{k}: {v}" for k, v in aportes.items()))

    if df_raw is None or df_raw.empty:
        st.warning("No se encontraron publicaciones.")
        st.stop()

    # Sentiment analysis + cleanup (cached downstream).
    try:
        with st.spinner("🧠 Analizando sentimiento y limpiando datos..."):
            df = clean_and_analyze(df_raw)
    except Exception as e:
        st.error(f"Error durante el análisis: {e}")
        st.stop()

    if df is None or df.empty:
        st.info("No se encontraron publicaciones válidas tras limpieza.")
        st.stop()

    # Normalize dates so charts/exports don't hit parsing issues.
    df = charts.preprocess_dates(df)

    # Persist the result under a key derived from the search parameters.
    key = _result_key(st.session_state["bsky_handle"], topic, days_back, int(max_posts), operator)
    st.session_state["current_key"] = key
    st.session_state.setdefault("results", {})[key] = df
145
+
146
+
147
# -------------------------------
# Recuperar último resultado
# -------------------------------
key = st.session_state.get("current_key")
df = st.session_state.get("results", {}).get(key) if key else None

if df is None or df.empty:
    st.info("Realiza una búsqueda para construir el panel de gráficos.")
    st.stop()

# -------------------------------
# Métricas + Datos
# -------------------------------
st.metric("Posts totales", len(df))
st.dataframe(df, use_container_width=True)
164
+
165
# -------------------------------
# Panel de gráficos
# -------------------------------
st.sidebar.header("Panel de gráficos")

# CSS tweak: keep sidebar button labels on a single line.
st.sidebar.markdown(
    """
    <style>
    section[data-testid="stSidebar"] button { white-space: nowrap; }
    </style>
    """,
    unsafe_allow_html=True,
)

# One chart list per search result, keyed by the current result key.
panels = st.session_state.setdefault("panels", {}).setdefault(key, [])

choice = st.sidebar.selectbox("Añadir gráfico", list(charts.CHARTS.keys()), key="chart_select")

col_add, col_undo, col_clear = st.sidebar.columns(3, gap="small")
add = col_add.button(" ➕ ", use_container_width=True, key="btn_add_chart")
undo = col_undo.button(" ↩️ ", use_container_width=True, key="btn_undo_chart")
clear = col_clear.button(" 🗑 ", use_container_width=True, key="btn_clear_chart")

if add:
    panels.append(choice)
if undo and panels:
    panels.pop()
if clear:
    panels.clear()
200
+
201
if panels:
    st.subheader("📊 Panel de gráficos")

    chart_pngs: list[tuple[str, BytesIO]] = []

    for i, name in enumerate(panels, start=1):
        st.markdown(f"**{i}. {name}**")
        chart_func = charts.CHARTS[name]
        chart_obj = chart_func(df)

        # 1) Any Altair spec (Chart, LayerChart, Facet, Concat, ...) —
        #    detected by duck-typing on to_dict().
        if hasattr(chart_obj, "to_dict"):
            st.altair_chart(chart_obj, use_container_width=True)
            # PNG export requires vl-convert-python.
            png = charts.export_chart_png(chart_obj, scale=2)
            if png:
                chart_pngs.append((name, png))
            else:
                st.warning(f"No se pudo exportar '{name}' como imagen (Altair).")

        # 2) Pre-rendered image (e.g. the word cloud returns BytesIO).
        elif isinstance(chart_obj, BytesIO):
            st.image(chart_obj, use_container_width=True)
            # BUGFIX: rewind the buffer — st.image may leave the read cursor
            # at EOF, which would hand ImageReader an empty stream below.
            chart_obj.seek(0)
            chart_pngs.append((name, chart_obj))

        # 3) No data for this chart.
        elif chart_obj is None:
            st.info("No hay datos suficientes para este gráfico.")

        # 4) Unexpected builder output.
        else:
            st.warning(f"Tipo de salida no soportado para '{name}'.")

    # PDF report builder (local to this section).
    def _pdf_from_images(df_data: pd.DataFrame, images: list[tuple[str, BytesIO]]) -> bytes:
        """Compose a one-column A4 PDF with a header and all chart PNGs."""
        buf = BytesIO()
        c = canvas.Canvas(buf, pagesize=A4)
        page_w, page_h = A4
        margin = 36
        y = page_h - margin

        c.setFont("Helvetica-Bold", 18)
        c.drawString(margin, y, "Informe de Análisis - Bluesky Explorer")
        y -= 22
        c.setFont("Helvetica", 11)
        c.drawString(margin, y, f"Total de publicaciones analizadas: {len(df_data)}")
        y -= 8
        c.drawString(margin, y, f"Gráficos incluidos: {len(images)}")
        y -= 20

        max_w = page_w - 2 * margin
        # BUGFIX: also bound image height; previously scale was capped only by
        # width, so a chart taller than the page overflowed the bottom margin.
        max_h = page_h - 2 * margin - 24

        for idx, (title, png_bytes) in enumerate(images, start=1):
            if y < 140:
                c.showPage()
                y = page_h - margin

            c.setFont("Helvetica-Bold", 12)
            c.drawString(margin, y, f"{idx}. {title}")
            y -= 12

            img = ImageReader(png_bytes)
            iw, ih = img.getSize()
            scale = min(max_w / iw, max_h / ih, 1.0)
            w = iw * scale
            h = ih * scale

            # Repeat the title on a fresh page when the image doesn't fit.
            if y - h < margin:
                c.showPage()
                y = page_h - margin - 12
                c.setFont("Helvetica-Bold", 12)
                c.drawString(margin, y, f"{idx}. {title}")
                y -= 12

            c.drawImage(img, margin, y - h, width=w, height=h, preserveAspectRatio=True)
            y -= h + 18

        c.showPage()
        c.save()
        buf.seek(0)
        return buf.getvalue()

    if chart_pngs:
        pdf_bytes = _pdf_from_images(df, chart_pngs)
        st.download_button(
            label="📄 Descargar informe en PDF",
            data=pdf_bytes,
            file_name="informe_bluesky.pdf",
            mime="application/pdf",
            key="btn_pdf",
        )
else:
    st.info("Selecciona un tipo de gráfico en la barra lateral y pulsa **Añadir** para construir tu panel.")
app/ui/panel.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ from io import BytesIO
3
+ from typing import List, Tuple
4
+ import streamlit as st
5
+ import pandas as pd
6
+ from app.charts import CHARTS
7
+ from app.reporting import altair_to_png_bytes, build_pdf_with_images
8
+
9
def render_panel(df: pd.DataFrame, state_key: str):
    """Render the chart-panel sidebar controls and the selected charts.

    Panels are stored per ``state_key`` in ``st.session_state`` so each search
    result keeps its own chart list. Exportable charts are collected as PNG
    buffers and offered as a single PDF report.
    """
    st.sidebar.header("Panel de gráficos")

    # CSS tweak: keep sidebar button labels on a single line.
    st.sidebar.markdown(
        """
        <style>
        section[data-testid="stSidebar"] button { white-space: nowrap; }
        </style>
        """,
        unsafe_allow_html=True,
    )

    # Per-result panel state.
    st.session_state.setdefault("panels", {})
    st.session_state["panels"].setdefault(state_key, [])
    panels: list[str] = st.session_state["panels"][state_key]

    # Widget keys are suffixed with state_key so results don't collide.
    choice = st.sidebar.selectbox(
        "Añadir gráfico",
        list(CHARTS.keys()),
        key=f"chart_select_{state_key}",
    )
    col_add, col_undo, col_clear = st.sidebar.columns(3, gap="small")
    add = col_add.button(" ➕ ", use_container_width=True, key=f"btn_add_{state_key}")
    undo = col_undo.button(" ↩️ ", use_container_width=True, key=f"btn_undo_{state_key}")
    clear = col_clear.button(" 🗑 ", use_container_width=True, key=f"btn_clear_{state_key}")

    if add:
        panels.append(choice)
    if undo and panels:
        panels.pop()
    if clear:
        panels.clear()

    if not panels:
        st.info("Selecciona un tipo de gráfico en la barra lateral y pulsa **Añadir** para construir tu panel.")
        return

    st.subheader("📊 Panel de gráficos")

    chart_pngs: List[Tuple[str, BytesIO]] = []

    for i, name in enumerate(panels, start=1):
        st.markdown(f"**{i}. {name}**")
        chart_func = CHARTS[name]
        chart_obj = chart_func(df)

        if chart_obj is None:
            st.info("No hay datos suficientes para este gráfico.")
            continue

        # BUGFIX (consistency with main_app): some builders return a ready
        # PNG as BytesIO (e.g. the word cloud); previously it was passed to
        # st.altair_chart, which cannot render it.
        if isinstance(chart_obj, BytesIO):
            st.image(chart_obj, use_container_width=True)
            chart_obj.seek(0)  # rewind so the PDF builder can read it again
            chart_pngs.append((name, chart_obj))
            continue

        if not hasattr(chart_obj, "to_dict"):
            st.warning(f"Tipo de salida no soportado para '{name}'.")
            continue

        st.altair_chart(chart_obj, use_container_width=True)

        # Exportación a PNG (necesita vl-convert-python instalado)
        try:
            png_buf = altair_to_png_bytes(chart_obj, scale=2)
            chart_pngs.append((name, png_buf))
        except Exception as e:
            st.warning(f"No se pudo exportar '{name}' como imagen: {e}")

    if chart_pngs:
        pdf_bytes = build_pdf_with_images(df, chart_pngs)
        st.download_button(
            label="📄 Descargar informe en PDF",
            data=pdf_bytes,
            file_name="informe_bluesky.pdf",
            mime="application/pdf",
            key=f"btn_pdf_{state_key}",
        )
    else:
        st.info("No se pudieron exportar imágenes para el PDF.")
app/utils.py ADDED
File without changes