hblim committed on
Commit 612d63d · 1 Parent(s): 3930884

lazy import of heavy dependencies

Files changed (2)
  1. frontend/app.py +0 -2
  2. frontend/text_analysis.py +104 -24
frontend/app.py CHANGED
@@ -1,9 +1,7 @@
-import os
 import streamlit as st
 import pandas as pd
 import numpy as np
 import altair as alt
-from datetime import date, timedelta, datetime

 # Call page config BEFORE importing modules that use Streamlit commands
 st.set_page_config(page_title="Reddit Sentiment Trends", layout="wide")
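
The comment retained in app.py is load-bearing: Streamlit expects st.set_page_config() to be the first Streamlit command executed in a script run, and before this commit text_analysis.py could run st.spinner at import time. A minimal sketch of the ordering constraint, assuming the two files sit side by side in frontend/ (the import path here is illustrative):

import streamlit as st

# Must be the first Streamlit command executed in the script run.
st.set_page_config(page_title="Reddit Sentiment Trends", layout="wide")

# Only now is it safe to pull in modules that may call Streamlit commands;
# after this commit, text_analysis defers those calls until first use anyway.
import text_analysis  # noqa: E402  (deliberately imported after set_page_config)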
frontend/text_analysis.py CHANGED
@@ -3,23 +3,73 @@ Text analysis utilities for Reddit content insights.
 Provides keyword extraction and similarity matching functions.
 """
 import pandas as pd
-import spacy
-from sentence_transformers import SentenceTransformer
-from keybert import KeyBERT
-
-# Initialize spaCy and sentence transformer models
-try:
-    nlp = spacy.load("en_core_web_sm")
-except OSError:
-    import streamlit as st
-    with st.spinner("Downloading NLP model (first run only)..."):
-        from spacy.cli import download
-        download("en_core_web_sm")
+
+# NOTE:
+# Heavy NLP/ML libraries (spaCy, sentence-transformers, KeyBERT, torch, etc.) can take a
+# long time to import or may not be available in constrained environments (e.g. the
+# default HuggingFace Spaces CPU image). Importing them at module import time can cause
+# the module to fail to initialise which, in turn, leads to cryptic errors such as
+# "cannot import name 'keywords_for_df'". To avoid this we lazily import the heavy
+# dependencies the first time they are actually needed. The helper is cached so that
+# subsequent calls are fast.
+
+from functools import lru_cache
+
+
+# -----------------------------------------------------------------------------
+# Internal helpers
+# -----------------------------------------------------------------------------
+
+
+@lru_cache(maxsize=1)
+def _load_models():
+    """Lazily load and cache NLP models.
+
+    Returns
+    -------
+    tuple
+        (nlp, kw_model) where ``nlp`` is a spaCy language model and ``kw_model`` is a
+        KeyBERT instance. If the required libraries are not available the function
+        raises ImportError *inside* the helper so the caller can decide how to handle
+        the failure gracefully.
+    """
+
+    import importlib
+
+    # Import spaCy and ensure the small English model is available
+    spacy = importlib.import_module("spacy")
+    try:
         nlp = spacy.load("en_core_web_sm")
+    except OSError:
+        # Streamlit is only available after `st.set_page_config` is called, which the
+        # main app does before importing this module. We therefore import it lazily
+        # here to avoid a hard dependency when the module is imported outside a
+        # Streamlit context (e.g. unit tests).
+        try:
+            import streamlit as st  # noqa: WPS433 (allow late import)
+
+            with st.spinner("Downloading spaCy model (first run only)..."):
+                from spacy.cli import download  # noqa: WPS433 (late import)
+
+                download("en_core_web_sm")
+            nlp = spacy.load("en_core_web_sm")
+        except ModuleNotFoundError:
+            # If Streamlit isn't available, fall back to downloading silently.
+            from spacy.cli import download  # noqa: WPS433 (late import)
+
+            download("en_core_web_sm")
+            nlp = spacy.load("en_core_web_sm")

+    # Sentence-Transformers and KeyBERT (which depends on it)
+    sent_trans = importlib.import_module("sentence_transformers")
+    SentenceTransformer = sent_trans.SentenceTransformer
+
+    KeyBERT = importlib.import_module("keybert").KeyBERT
-# Cache models at module scope for reuse
-embedder = SentenceTransformer("all-MiniLM-L6-v2")
-kw_model = KeyBERT(embedder)
+    embedder = SentenceTransformer("all-MiniLM-L6-v2")
+    kw_model = KeyBERT(embedder)
+
+    return nlp, kw_model

 def keywords_for_df(df: pd.DataFrame, top_n=5):
     """
@@ -34,23 +84,53 @@ def keywords_for_df(df: pd.DataFrame, top_n=5):
     """
     if df.empty:
         return []
-
+
+    # Attempt to load heavy models. If this fails we degrade gracefully by returning
+    # an empty list rather than crashing the whole application.
+    try:
+        nlp, kw_model = _load_models()
+    except Exception as exc:  # noqa: BLE001 (broad, but we degrade gracefully)
+        # Log the failure inside Streamlit if available; otherwise swallow silently.
+        try:
+            import streamlit as st  # noqa: WPS433
+
+            st.warning(
+                f"Keyword extraction disabled due to model loading error: {exc}",
+                icon="⚠️",
+            )
+        except ModuleNotFoundError:
+            pass
+
+        return []
+
     # Join all text from the dataframe
     raw = " ".join(df["text"].astype(str))
-
+
     # Process with spaCy to extract noun chunks and named entities
     doc = nlp(raw.lower())
-
+
     # Combine noun chunks and relevant named entities
     cand = " ".join(
-        [c.text for c in doc.noun_chunks] +
-        [e.text for e in doc.ents if e.label_ in {"PRODUCT", "EVENT", "ORG", "GPE"}]
+        [c.text for c in doc.noun_chunks]
+        + [e.text for e in doc.ents if e.label_ in {"PRODUCT", "EVENT", "ORG", "GPE"}]
     )
-
+
     # Quick stopword list to filter common terms
-    for ex in ['blog','topic','locked','author','moderator','error','bot','comments','archive','support','discord']:
+    for ex in [
+        "blog",
+        "topic",
+        "locked",
+        "author",
+        "moderator",
+        "error",
+        "bot",
+        "comments",
+        "archive",
+        "support",
+        "discord",
+    ]:
         cand = cand.replace(ex, " ")
-
+
     # Use KeyBERT to extract keywords with diversity
     return kw_model.extract_keywords(
         cand,
@@ -58,5 +138,5 @@ def keywords_for_df(df: pd.DataFrame, top_n=5):
         stop_words="english",
         use_mmr=True,
         diversity=0.8,
-        top_n=top_n
+        top_n=top_n,
     )
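
For reference, a sketch of how the reworked module behaves from the caller's side, assuming frontend/ is on sys.path and using made-up sample rows with the expected "text" column: the first call pays the one-time cost inside _load_models(), later calls reuse the cached models, and an environment missing the heavy libraries now degrades to an empty result instead of failing at import time.

import pandas as pd

from text_analysis import keywords_for_df  # import path assumed

# Hypothetical sample data; the function only requires a "text" column.
df = pd.DataFrame(
    {
        "text": [
            "The new driver update finally fixed the stuttering on my GPU.",
            "Launch sentiment is mixed but the performance reviews look positive.",
        ]
    }
)

print(keywords_for_df(df.iloc[0:0]))  # empty frame short-circuits: []
print(keywords_for_df(df, top_n=3))   # first call loads spaCy + KeyBERT
print(keywords_for_df(df, top_n=3))   # reuses models via @lru_cache(maxsize=1)

Since _load_models() takes no arguments, functools.cache (Python 3.9+) would be an equivalent spelling of the lru_cache(maxsize=1) idiom used here.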