jccolon committed on
Commit
4848dbc
·
verified ·
1 Parent(s): eb629b4

Update app/charts.py

Browse files
Files changed (1) hide show
  1. app/charts.py +45 -50
app/charts.py CHANGED
@@ -1,13 +1,18 @@
1
  import pandas as pd
2
  import altair as alt
3
  import streamlit as st
4
- from io import BytesIO
5
  from typing import Optional
6
  import re
7
  from collections import Counter
8
  from io import BytesIO
9
  from wordcloud import WordCloud
10
  import ast
 
 
 
 
 
 
11
 
12
  # =========================
13
  # Helpers
@@ -467,78 +472,68 @@ def chart_top_dominios(dfin: pd.DataFrame, topn: int = 20):
467
 
468
  @st.cache_data(show_spinner=False)
469
  def chart_nube_palabras(dfin: pd.DataFrame, max_words: int = 150) -> BytesIO | None:
470
- """
471
- Genera una nube de palabras (PNG en memoria) a partir de dfin['texto'].
472
- - Limpia URLs, menciones, hashtags y tokens cortos.
473
- - Stopwords ES/EN básicas incluidas.
474
- - Devuelve BytesIO con PNG o None si no hay texto útil.
475
- """
476
  if "texto" not in dfin.columns or dfin.empty:
477
  return None
478
 
479
- # --- recolecta y limpia texto ---
480
  texts = dfin["texto"].dropna().astype(str).tolist()
481
  if not texts:
482
  return None
483
 
484
- text = " ".join(texts)
485
-
486
- # quita URLs, menciones y hashtags (sólo el #/@, mantenemos la palabra)
487
- text = re.sub(r"https?://\S+", " ", text)
488
- text = re.sub(r"[@#]", " ", text)
489
-
490
- # tokens básicos
491
- tokens = re.findall(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ0-9]+", text)
492
-
493
- # stopwords sencillas ES/EN (puedes ampliarlas si quieres)
494
- stop_es = {
495
- "de","la","que","el","en","y","a","los","del","se","las","por","un","para","con","no",
496
- "una","su","al","lo","como","más","pero","sus","le","ya","o","fue","este","ha","sí",
497
- "porque","esta","son","entre","cuando","muy","sin","sobre","también","me","hasta",
498
- "hay","donde","quien","desde","todo","nos","durante","todos","uno","les","ni","contra",
499
- "otros","ese","eso","ante","ellos","e","esto","mí","antes","algunos","qué","unos",
500
- "yo","otro","otras","otra","él","tanto","esa","estos","mucho","quienes","nada","muchos",
501
- "cual","poco","ella","estar","estas","algunas","algo","nosotros","mi","mis","tú","te",
502
- "ti","tu","tus","ellas","nosotras","vosotros","vosotras","os","mío","mía","míos","mías",
503
- "tuyo","tuya","tuyos","tuyas","suyo","suya","suyos","suyas","nuestro","nuestra","nuestros",
504
- "nuestras","vuestro","vuestra","vuestros","vuestras","esos","esas","estoy","estás","está",
505
- "estamos","estáis","están","esté","estés","estemos","estéis","estén","estaré","estarás",
506
- "estará","estaremos","estaréis","estarán"
507
- }
508
- stop_en = {
509
- "the","a","an","and","or","but","to","of","for","in","on","at","by","with","from","as",
510
- "is","are","was","were","be","been","being","it","its","this","that","these","those",
511
- "i","you","he","she","we","they","me","him","her","us","them","my","your","his","their",
512
- "our","mine","yours","hers","theirs","ours","not","no","so","if","than","then","too",
513
- "very","can","could","should","would","will","just","also","into","over","under",
514
- }
515
- stops = {t.lower() for t in (stop_es | stop_en)}
516
-
517
- # filtra tokens (longitud mínima y no stopword)
518
- tokens = [t for t in tokens if len(t) >= 3 and t.lower() not in stops]
519
- if not tokens:
520
  return None
521
 
522
- # frecuencias
523
- freqs = Counter(t.lower() for t in tokens)
524
 
525
- # genera la nube
526
  wc = WordCloud(
527
  width=1400,
528
  height=800,
529
  background_color="white",
530
- prefer_horizontal=0.9,
531
- collocations=False, # no agrupa bi/trigramas, mejor control de tokens
532
  max_words=max_words,
 
 
533
  ).generate_from_frequencies(freqs)
534
 
535
- # a PNG en memoria
536
  png = BytesIO()
537
  wc.to_image().save(png, format="PNG")
538
  png.seek(0)
539
  return png
540
 
541
 
 
542
  CHARTS = {
543
  "posts_per_day": chart_posts_diario_ma,
544
  "sentiment_count": chart_sentimiento_barras,
 
1
  import pandas as pd
2
  import altair as alt
3
  import streamlit as st
 
4
  from typing import Optional
5
  import re
6
  from collections import Counter
7
  from io import BytesIO
8
  from wordcloud import WordCloud
9
  import ast
10
+ import spacy
11
+ from langdetect import detect
12
+
13
+ NLP_ES = spacy.load("es_core_news_sm")
14
+ NLP_EN = spacy.load("en_core_web_sm")
15
+
16
 
17
  # =========================
18
  # Helpers
 
472
 
473
  @st.cache_data(show_spinner=False)
474
  def chart_nube_palabras(dfin: pd.DataFrame, max_words: int = 150) -> BytesIO | None:
 
 
 
 
 
 
475
  if "texto" not in dfin.columns or dfin.empty:
476
  return None
477
 
 
478
  texts = dfin["texto"].dropna().astype(str).tolist()
479
  if not texts:
480
  return None
481
 
482
+ lemmas = []
483
+
484
+ for text in texts:
485
+ try:
486
+ lang = detect(text)
487
+ except Exception:
488
+ continue
489
+
490
+ if lang == "es":
491
+ doc = NLP_ES(text)
492
+ elif lang == "en":
493
+ doc = NLP_EN(text)
494
+ else:
495
+ continue
496
+
497
+ for token in doc:
498
+ if (
499
+ token.is_stop
500
+ or token.is_punct
501
+ or token.is_space
502
+ or token.like_url
503
+ or token.like_email
504
+ ):
505
+ continue
506
+
507
+ if token.pos_ not in {"NOUN", "PROPN", "ADJ"}:
508
+ continue
509
+
510
+ lemma = token.lemma_.lower().strip()
511
+ if len(lemma) < 3:
512
+ continue
513
+
514
+ lemmas.append(lemma)
515
+
516
+ if not lemmas:
 
517
  return None
518
 
519
+ freqs = Counter(lemmas)
 
520
 
 
521
  wc = WordCloud(
522
  width=1400,
523
  height=800,
524
  background_color="white",
 
 
525
  max_words=max_words,
526
+ collocations=False,
527
+ prefer_horizontal=0.9,
528
  ).generate_from_frequencies(freqs)
529
 
 
530
  png = BytesIO()
531
  wc.to_image().save(png, format="PNG")
532
  png.seek(0)
533
  return png
534
 
535
 
536
+
537
  CHARTS = {
538
  "posts_per_day": chart_posts_diario_ma,
539
  "sentiment_count": chart_sentimiento_barras,