Spaces:
Sleeping
Sleeping
Update app/charts.py
Browse files- app/charts.py +45 -50
app/charts.py
CHANGED
|
@@ -1,13 +1,18 @@
|
|
| 1 |
import pandas as pd
|
| 2 |
import altair as alt
|
| 3 |
import streamlit as st
|
| 4 |
-
from io import BytesIO
|
| 5 |
from typing import Optional
|
| 6 |
import re
|
| 7 |
from collections import Counter
|
| 8 |
from io import BytesIO
|
| 9 |
from wordcloud import WordCloud
|
| 10 |
import ast
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
# =========================
|
| 13 |
# Helpers
|
|
@@ -467,78 +472,68 @@ def chart_top_dominios(dfin: pd.DataFrame, topn: int = 20):
|
|
| 467 |
|
| 468 |
@st.cache_data(show_spinner=False)
|
| 469 |
def chart_nube_palabras(dfin: pd.DataFrame, max_words: int = 150) -> BytesIO | None:
|
| 470 |
-
"""
|
| 471 |
-
Genera una nube de palabras (PNG en memoria) a partir de dfin['texto'].
|
| 472 |
-
- Limpia URLs, menciones, hashtags y tokens cortos.
|
| 473 |
-
- Stopwords ES/EN básicas incluidas.
|
| 474 |
-
- Devuelve BytesIO con PNG o None si no hay texto útil.
|
| 475 |
-
"""
|
| 476 |
if "texto" not in dfin.columns or dfin.empty:
|
| 477 |
return None
|
| 478 |
|
| 479 |
-
# --- recolecta y limpia texto ---
|
| 480 |
texts = dfin["texto"].dropna().astype(str).tolist()
|
| 481 |
if not texts:
|
| 482 |
return None
|
| 483 |
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
if not tokens:
|
| 520 |
return None
|
| 521 |
|
| 522 |
-
|
| 523 |
-
freqs = Counter(t.lower() for t in tokens)
|
| 524 |
|
| 525 |
-
# genera la nube
|
| 526 |
wc = WordCloud(
|
| 527 |
width=1400,
|
| 528 |
height=800,
|
| 529 |
background_color="white",
|
| 530 |
-
prefer_horizontal=0.9,
|
| 531 |
-
collocations=False, # no agrupa bi/trigramas, mejor control de tokens
|
| 532 |
max_words=max_words,
|
|
|
|
|
|
|
| 533 |
).generate_from_frequencies(freqs)
|
| 534 |
|
| 535 |
-
# a PNG en memoria
|
| 536 |
png = BytesIO()
|
| 537 |
wc.to_image().save(png, format="PNG")
|
| 538 |
png.seek(0)
|
| 539 |
return png
|
| 540 |
|
| 541 |
|
|
|
|
| 542 |
CHARTS = {
|
| 543 |
"posts_per_day": chart_posts_diario_ma,
|
| 544 |
"sentiment_count": chart_sentimiento_barras,
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
import altair as alt
|
| 3 |
import streamlit as st
|
|
|
|
| 4 |
from typing import Optional
|
| 5 |
import re
|
| 6 |
from collections import Counter
|
| 7 |
from io import BytesIO
|
| 8 |
from wordcloud import WordCloud
|
| 9 |
import ast
|
| 10 |
+
import spacy
|
| 11 |
+
from langdetect import detect
|
| 12 |
+
|
| 13 |
+
# spaCy pipelines used by the word-cloud chart, loaded once when the module is
# imported: Spanish first, English second.
NLP_ES, NLP_EN = (
    spacy.load(model_name)
    for model_name in ("es_core_news_sm", "en_core_web_sm")
)
|
| 15 |
+
|
| 16 |
|
| 17 |
# =========================
|
| 18 |
# Helpers
|
|
|
|
| 472 |
|
| 473 |
@st.cache_data(show_spinner=False)
def chart_nube_palabras(dfin: pd.DataFrame, max_words: int = 150) -> BytesIO | None:
    """Render a word cloud (in-memory PNG) from the lemmas in ``dfin["texto"]``.

    Each non-null text is language-detected; only texts detected as Spanish
    ("es") or English ("en") are analysed, with ``NLP_ES`` / ``NLP_EN``
    respectively. A token contributes its lower-cased lemma when it is a noun,
    proper noun or adjective, is not a stop word, punctuation, whitespace,
    URL, e-mail address or @mention, and its lemma has at least 3 characters.

    Args:
        dfin: Frame expected to carry a ``texto`` column.
        max_words: Maximum number of words drawn in the cloud.

    Returns:
        A ``BytesIO`` positioned at offset 0 holding the PNG, or ``None`` when
        the column is missing, the frame is empty, or no usable lemma is left.
    """
    # Local import: keeps the narrow exception type in scope without touching
    # the module's import block (langdetect is already a module dependency).
    from langdetect import LangDetectException

    if "texto" not in dfin.columns or dfin.empty:
        return None

    texts = dfin["texto"].dropna().astype(str).tolist()
    if not texts:
        return None

    # Bucket the texts per supported language so each pipeline can process its
    # bucket in one batched pass instead of one call per text.
    pipelines = {"es": NLP_ES, "en": NLP_EN}
    texts_by_lang = {lang: [] for lang in pipelines}
    for text in texts:
        try:
            lang = detect(text)
        except LangDetectException:
            # Raised when the text has no usable features (e.g. only digits,
            # symbols or whitespace); such rows simply do not contribute.
            continue
        if lang in texts_by_lang:
            texts_by_lang[lang].append(text)

    content_pos = {"NOUN", "PROPN", "ADJ"}
    freqs = Counter()
    for lang, nlp in pipelines.items():
        for doc in nlp.pipe(texts_by_lang[lang]):
            for token in doc:
                if (
                    token.is_stop
                    or token.is_punct
                    or token.is_space
                    or token.like_url
                    or token.like_email
                ):
                    continue

                # NOTE(review): presumably the tokenizer keeps "@user" as one
                # token; drop it so handles do not show up in the cloud.
                if token.text.startswith("@"):
                    continue

                if token.pos_ not in content_pos:
                    continue

                lemma = token.lemma_.lower().strip()
                if len(lemma) < 3:
                    continue

                freqs[lemma] += 1

    if not freqs:
        return None

    wc = WordCloud(
        width=1400,
        height=800,
        background_color="white",
        max_words=max_words,
        collocations=False,
        prefer_horizontal=0.9,
    ).generate_from_frequencies(freqs)

    # Serialise to PNG in memory and rewind so callers can read from the start.
    png = BytesIO()
    wc.to_image().save(png, format="PNG")
    png.seek(0)
    return png
|
| 534 |
|
| 535 |
|
| 536 |
+
|
| 537 |
CHARTS = {
|
| 538 |
"posts_per_day": chart_posts_diario_ma,
|
| 539 |
"sentiment_count": chart_sentimiento_barras,
|