jccolon committed on
Commit
4848dbc
·
verified ·
1 Parent(s): eb629b4

Update app/charts.py

Browse files
Files changed (1) hide show
  1. app/charts.py +45 -50
app/charts.py CHANGED
@@ -1,13 +1,18 @@
1
  import pandas as pd
2
  import altair as alt
3
  import streamlit as st
4
- from io import BytesIO
5
  from typing import Optional
6
  import re
7
  from collections import Counter
8
  from io import BytesIO
9
  from wordcloud import WordCloud
10
  import ast
 
 
 
 
 
 
11
 
12
  # =========================
13
  # Helpers
@@ -467,78 +472,68 @@ def chart_top_dominios(dfin: pd.DataFrame, topn: int = 20):
467
 
468
  @st.cache_data(show_spinner=False)
469
  def chart_nube_palabras(dfin: pd.DataFrame, max_words: int = 150) -> BytesIO | None:
470
- """
471
- Genera una nube de palabras (PNG en memoria) a partir de dfin['texto'].
472
- - Limpia URLs, menciones, hashtags y tokens cortos.
473
- - Stopwords ES/EN básicas incluidas.
474
- - Devuelve BytesIO con PNG o None si no hay texto útil.
475
- """
476
  if "texto" not in dfin.columns or dfin.empty:
477
  return None
478
 
479
- # --- recolecta y limpia texto ---
480
  texts = dfin["texto"].dropna().astype(str).tolist()
481
  if not texts:
482
  return None
483
 
484
- text = " ".join(texts)
485
-
486
- # quita URLs, menciones y hashtags (sólo el #/@, mantenemos la palabra)
487
- text = re.sub(r"https?://\S+", " ", text)
488
- text = re.sub(r"[@#]", " ", text)
489
-
490
- # tokens básicos
491
- tokens = re.findall(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ0-9]+", text)
492
-
493
- # stopwords sencillas ES/EN (puedes ampliarlas si quieres)
494
- stop_es = {
495
- "de","la","que","el","en","y","a","los","del","se","las","por","un","para","con","no",
496
- "una","su","al","lo","como","más","pero","sus","le","ya","o","fue","este","ha","sí",
497
- "porque","esta","son","entre","cuando","muy","sin","sobre","también","me","hasta",
498
- "hay","donde","quien","desde","todo","nos","durante","todos","uno","les","ni","contra",
499
- "otros","ese","eso","ante","ellos","e","esto","mí","antes","algunos","qué","unos",
500
- "yo","otro","otras","otra","él","tanto","esa","estos","mucho","quienes","nada","muchos",
501
- "cual","poco","ella","estar","estas","algunas","algo","nosotros","mi","mis","tú","te",
502
- "ti","tu","tus","ellas","nosotras","vosotros","vosotras","os","mío","mía","míos","mías",
503
- "tuyo","tuya","tuyos","tuyas","suyo","suya","suyos","suyas","nuestro","nuestra","nuestros",
504
- "nuestras","vuestro","vuestra","vuestros","vuestras","esos","esas","estoy","estás","está",
505
- "estamos","estáis","están","esté","estés","estemos","estéis","estén","estaré","estarás",
506
- "estará","estaremos","estaréis","estarán"
507
- }
508
- stop_en = {
509
- "the","a","an","and","or","but","to","of","for","in","on","at","by","with","from","as",
510
- "is","are","was","were","be","been","being","it","its","this","that","these","those",
511
- "i","you","he","she","we","they","me","him","her","us","them","my","your","his","their",
512
- "our","mine","yours","hers","theirs","ours","not","no","so","if","than","then","too",
513
- "very","can","could","should","would","will","just","also","into","over","under",
514
- }
515
- stops = {t.lower() for t in (stop_es | stop_en)}
516
-
517
- # filtra tokens (longitud mínima y no stopword)
518
- tokens = [t for t in tokens if len(t) >= 3 and t.lower() not in stops]
519
- if not tokens:
520
  return None
521
 
522
- # frecuencias
523
- freqs = Counter(t.lower() for t in tokens)
524
 
525
- # genera la nube
526
  wc = WordCloud(
527
  width=1400,
528
  height=800,
529
  background_color="white",
530
- prefer_horizontal=0.9,
531
- collocations=False, # no agrupa bi/trigramas, mejor control de tokens
532
  max_words=max_words,
 
 
533
  ).generate_from_frequencies(freqs)
534
 
535
- # a PNG en memoria
536
  png = BytesIO()
537
  wc.to_image().save(png, format="PNG")
538
  png.seek(0)
539
  return png
540
 
541
 
 
542
  CHARTS = {
543
  "posts_per_day": chart_posts_diario_ma,
544
  "sentiment_count": chart_sentimiento_barras,
 
1
  import pandas as pd
2
  import altair as alt
3
  import streamlit as st
 
4
  from typing import Optional
5
  import re
6
  from collections import Counter
7
  from io import BytesIO
8
  from wordcloud import WordCloud
9
  import ast
10
+ import spacy
11
+ from langdetect import detect
12
+
13
+ NLP_ES = spacy.load("es_core_news_sm")
14
+ NLP_EN = spacy.load("en_core_web_sm")
15
+
16
 
17
  # =========================
18
  # Helpers
 
472
 
473
  @st.cache_data(show_spinner=False)
474
  def chart_nube_palabras(dfin: pd.DataFrame, max_words: int = 150) -> BytesIO | None:
 
 
 
 
 
 
475
  if "texto" not in dfin.columns or dfin.empty:
476
  return None
477
 
 
478
  texts = dfin["texto"].dropna().astype(str).tolist()
479
  if not texts:
480
  return None
481
 
482
+ lemmas = []
483
+
484
+ for text in texts:
485
+ try:
486
+ lang = detect(text)
487
+ except Exception:
488
+ continue
489
+
490
+ if lang == "es":
491
+ doc = NLP_ES(text)
492
+ elif lang == "en":
493
+ doc = NLP_EN(text)
494
+ else:
495
+ continue
496
+
497
+ for token in doc:
498
+ if (
499
+ token.is_stop
500
+ or token.is_punct
501
+ or token.is_space
502
+ or token.like_url
503
+ or token.like_email
504
+ ):
505
+ continue
506
+
507
+ if token.pos_ not in {"NOUN", "PROPN", "ADJ"}:
508
+ continue
509
+
510
+ lemma = token.lemma_.lower().strip()
511
+ if len(lemma) < 3:
512
+ continue
513
+
514
+ lemmas.append(lemma)
515
+
516
+ if not lemmas:
 
517
  return None
518
 
519
+ freqs = Counter(lemmas)
 
520
 
 
521
  wc = WordCloud(
522
  width=1400,
523
  height=800,
524
  background_color="white",
 
 
525
  max_words=max_words,
526
+ collocations=False,
527
+ prefer_horizontal=0.9,
528
  ).generate_from_frequencies(freqs)
529
 
 
530
  png = BytesIO()
531
  wc.to_image().save(png, format="PNG")
532
  png.seek(0)
533
  return png
534
 
535
 
536
+
537
  CHARTS = {
538
  "posts_per_day": chart_posts_diario_ma,
539
  "sentiment_count": chart_sentimiento_barras,