lazy import of heavy dependencies
- frontend/app.py +0 -2
- frontend/text_analysis.py +104 -24
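The change applies a standard pattern: move heavy imports out of module scope into a memoised loader, so importing the module is always cheap and the import cost is paid once, on first use. A minimal sketch of the pattern in isolation (heavy_lib and load_model are placeholder names, not code from this repo):

import importlib
from functools import lru_cache

@lru_cache(maxsize=1)
def _get_model():
    # The import happens here, not at module top level, so importing
    # *this* module never pays the cost (or fails) when heavy_lib is
    # missing or slow to load.
    heavy_lib = importlib.import_module("heavy_lib")  # placeholder module
    return heavy_lib.load_model()  # placeholder API

Because the wrapped function takes no arguments, lru_cache(maxsize=1) memoises the single result: every call after the first returns the already-loaded object.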
frontend/app.py
CHANGED
@@ -1,9 +1,7 @@
-import os
 import streamlit as st
 import pandas as pd
 import numpy as np
 import altair as alt
-from datetime import date, timedelta, datetime
 
 # Call page config BEFORE importing modules that use Streamlit commands
 st.set_page_config(page_title="Reddit Sentiment Trends", layout="wide")
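The retained comment is load-bearing: Streamlit raises an error if any other st.* command executes before st.set_page_config, and an imported module that runs Streamlit commands at import time counts. A sketch of the ordering this enforces (the trailing import is illustrative; any module that may touch st.* on import belongs there):

import streamlit as st

# Must be the first Streamlit command the app executes.
st.set_page_config(page_title="Reddit Sentiment Trends", layout="wide")

# Modules that may run st.* commands on import are imported only now.
import text_analysis  # noqa: E402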
frontend/text_analysis.py
CHANGED
@@ -3,23 +3,73 @@ Text analysis utilities for Reddit content insights.
 Provides keyword extraction and similarity matching functions.
 """
 import pandas as pd
-[12 deleted lines: module-level imports and setup of the heavy NLP stack; text not recoverable from this view]
+
+# NOTE:
+# Heavy NLP/ML libraries (spaCy, sentence-transformers, KeyBERT, torch, etc.) can take a
+# long time to import or may not be available in constrained environments (e.g. the
+# default HuggingFace Spaces CPU image). Importing them at module import time can cause
+# the module to fail to initialise which, in turn, leads to cryptic errors such as
+# "cannot import name 'keywords_for_df'". To avoid this we lazily import the heavy
+# dependencies the first time they are actually needed. The helper is cached so that
+# subsequent calls are fast.
+
+from functools import lru_cache
+
+
+# -----------------------------------------------------------------------------
+# Internal helpers
+# -----------------------------------------------------------------------------
+
+
+@lru_cache(maxsize=1)
+def _load_models():
+    """Lazily load and cache NLP models.
+
+    Returns
+    -------
+    tuple
+        (nlp, kw_model) where ``nlp`` is a spaCy language model and ``kw_model`` is a
+        KeyBERT instance. If the required libraries are not available the function
+        raises ImportError *inside* the helper so the caller can decide how to handle
+        the failure gracefully.
+    """
+    import importlib
+
+    # Import spaCy and ensure the small English model is available
+    spacy = importlib.import_module("spacy")
+    try:
         nlp = spacy.load("en_core_web_sm")
+    except OSError:
+        # Streamlit is only available after `st.set_page_config` is called, which the
+        # main app does before importing this module. We therefore import it lazily
+        # here to avoid a hard dependency when the module is imported outside a
+        # Streamlit context (e.g. unit tests).
+        try:
+            import streamlit as st  # noqa: WPS433 (allow late import)
+
+            with st.spinner("Downloading spaCy model (first run only)..."):
+                from spacy.cli import download  # noqa: WPS433 (late import)
+
+                download("en_core_web_sm")
+            nlp = spacy.load("en_core_web_sm")
+        except ModuleNotFoundError:
+            # If Streamlit isn't available, fall back to downloading silently.
+            from spacy.cli import download  # noqa: WPS433 (late import)
+
+            download("en_core_web_sm")
+            nlp = spacy.load("en_core_web_sm")
+
+    # Sentence-Transformers and KeyBERT (which depends on it)
+    sent_trans = importlib.import_module("sentence_transformers")
+    SentenceTransformer = sent_trans.SentenceTransformer
+
+    KeyBERT = importlib.import_module("keybert").KeyBERT
 
-[3 deleted lines: module-level construction of the embedder and KeyBERT model; text not recoverable from this view]
+    embedder = SentenceTransformer("all-MiniLM-L6-v2")
+    kw_model = KeyBERT(embedder)
+
+    return nlp, kw_model
 
 def keywords_for_df(df: pd.DataFrame, top_n=5):
     """
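Two properties of @lru_cache(maxsize=1) make it a good fit for this loader: exceptions are not cached, so a failed load is simply retried on the next call, and the wrapper exposes the standard cache hooks should a reload ever be needed:

nlp, kw_model = _load_models()    # slow: first call loads everything
nlp2, kw_model2 = _load_models()  # fast: cached tuple, same objects
assert nlp is nlp2 and kw_model is kw_model2
_load_models.cache_clear()        # forces a full reload on the next call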
@@ -34,23 +84,53 @@ def keywords_for_df(df: pd.DataFrame, top_n=5):
     """
     if df.empty:
         return []
-
+
+    # Attempt to load heavy models. If this fails we degrade gracefully by returning
+    # an empty list rather than crashing the whole application.
+    try:
+        nlp, kw_model = _load_models()
+    except Exception as exc:  # noqa: BLE001 (broad, but we degrade gracefully)
+        # Log the failure inside Streamlit if available; otherwise swallow silently.
+        try:
+            import streamlit as st  # noqa: WPS433
+
+            st.warning(
+                f"Keyword extraction disabled due to model loading error: {exc}",
+                icon="⚠️",
+            )
+        except ModuleNotFoundError:
+            pass
+
+        return []
+
     # Join all text from the dataframe
     raw = " ".join(df["text"].astype(str))
-
+
     # Process with spaCy to extract noun chunks and named entities
     doc = nlp(raw.lower())
-
+
     # Combine noun chunks and relevant named entities
     cand = " ".join(
-        [c.text for c in doc.noun_chunks]
-        [e.text for e in doc.ents if e.label_ in {"PRODUCT", "EVENT", "ORG", "GPE"}]
+        [c.text for c in doc.noun_chunks]
+        + [e.text for e in doc.ents if e.label_ in {"PRODUCT", "EVENT", "ORG", "GPE"}]
     )
-
+
     # Quick stopword list to filter common terms
-    for ex in [
+    for ex in [
+        "blog",
+        "topic",
+        "locked",
+        "author",
+        "moderator",
+        "error",
+        "bot",
+        "comments",
+        "archive",
+        "support",
+        "discord",
+    ]:
         cand = cand.replace(ex, " ")
-
+
     # Use KeyBERT to extract keywords with diversity
     return kw_model.extract_keywords(
         cand,
@@ -58,5 +138,5 @@ def keywords_for_df(df: pd.DataFrame, top_n=5):
         stop_words="english",
         use_mmr=True,
         diversity=0.8,
-        top_n=top_n
+        top_n=top_n,
    )
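A quick way to exercise the new code path end to end (hypothetical caller; the "text" column is the one the function reads, and the rows here are made up):

import pandas as pd
from text_analysis import keywords_for_df

df = pd.DataFrame({"text": [
    "The new GPU drivers crash on launch for many players.",
    "The patch notes mention driver fixes and performance improvements.",
]})

# Returns [] when df is empty or the models fail to load; otherwise a
# list of (keyword, score) tuples from KeyBERT.
print(keywords_for_df(df, top_n=3))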