"""
Text analysis utilities for Reddit content insights.
Provides keyword extraction and similarity matching functions.
"""
import contextlib
from functools import lru_cache

import pandas as pd

# NOTE:
# Heavy NLP/ML libraries (spaCy, sentence-transformers, KeyBERT, torch, etc.) can take a
# long time to import or may not be available in constrained environments (e.g. the
# default HuggingFace Spaces CPU image). Importing them at module import time can cause
# the module to fail to initialise which, in turn, leads to cryptic errors such as
# "cannot import name 'keywords_for_df'". To avoid this we lazily import the heavy
# dependencies the first time they are actually needed. The helper is cached so that
# subsequent calls are fast.

# -----------------------------------------------------------------------------
# Internal helpers
# -----------------------------------------------------------------------------
| def _load_models(): | |
| """Lazily load and cache NLP models. | |
| Returns | |
| ------- | |
| tuple | |
| (nlp, kw_model) where ``nlp`` is a spaCy language model and ``kw_model`` is a | |
| KeyBERT instance. If the required libraries are not available the function | |
| raises ImportError *inside* the helper so the caller can decide how to handle | |
| the failure gracefully. | |
| """ | |
| import importlib | |
| # ------------------------------------------------------------------ | |
| # Inform the user via Streamlit (if available) that heavy models are | |
| # loading. We use a spinner that is shown only on the first call; the | |
| # function is cached so subsequent calls skip the spinner entirely. | |
| # ------------------------------------------------------------------ | |
| try: | |
| import streamlit as st # noqa: WPS433 (late import) | |
| spinner_cm = st.spinner( | |
| "Initializing keyword-extraction models (first run may take ~1 min)…", | |
| ) | |
| except ModuleNotFoundError: | |
| # If Streamlit isn't present (e.g. unit tests) simply do nothing. | |
| spinner_cm = contextlib.nullcontext() | |
| with spinner_cm: | |
| # Import spaCy and ensure the small English model is available | |
| spacy = importlib.import_module("spacy") | |
| try: | |
| nlp = spacy.load("en_core_web_sm") | |
| except OSError as exc: | |
| # The model is missing. Do NOT attempt to install it at run-time | |
| # because the app may run under a non-privileged user (e.g. Streamlit | |
| # Cloud) and lack write permissions to the virtual-env. Instead we | |
| # instruct the developer to add the model wheel to build-time | |
| # dependencies so it gets installed by pip when the image is built. | |
| raise RuntimeError( | |
| "spaCy model 'en_core_web_sm' is not installed. " | |
| "Add 'en-core-web-sm==3.8.0' (hyphen, not underscore) to " | |
| "your requirements.txt so it is installed during deployment." | |
| ) from exc | |
| # Sentence-Transformers and KeyBERT (which depends on it) | |
| sent_trans = importlib.import_module("sentence_transformers") | |
| SentenceTransformer = sent_trans.SentenceTransformer | |
| KeyBERT = importlib.import_module("keybert").KeyBERT | |
| embedder = SentenceTransformer("all-MiniLM-L6-v2") | |
| kw_model = KeyBERT(embedder) | |
| # Notify user that models are ready (only on first load) | |
| try: | |
| st.success("Keyword-extraction models ready!", icon="✅") # type: ignore[name-defined] | |
| except Exception: # noqa: BLE001 (streamlit not available or other minor issue) | |
| pass | |
| return nlp, kw_model | |
def keywords_for_df(df: pd.DataFrame, top_n: int = 5):
    """Extract keywords from a DataFrame containing Reddit posts.

    Args:
        df: DataFrame with a 'text' column containing post content.
        top_n: Number of top keywords to return.

    Returns:
        List of (keyword, score) tuples. An empty list is returned when the
        frame is empty, when it has no 'text' column, or when the heavy NLP
        models cannot be loaded (graceful degradation).
    """
    # Lazy import, consistent with the rest of this module.
    import re

    # Guard both the empty frame and a missing 'text' column; previously a
    # frame without 'text' raised KeyError despite the degrade-gracefully
    # contract documented above.
    if df.empty or "text" not in df.columns:
        return []

    # Attempt to load heavy models. If this fails we degrade gracefully by
    # returning an empty list rather than crashing the whole application.
    try:
        nlp, kw_model = _load_models()
    except Exception as exc:  # noqa: BLE001 (broad, but we degrade gracefully)
        # Surface the failure in Streamlit if available; otherwise stay quiet.
        try:
            import streamlit as st  # noqa: WPS433
            st.warning(
                f"Keyword extraction disabled due to model loading error: {exc}",
                icon="⚠️",
            )
        except ModuleNotFoundError:
            pass
        return []

    # Join all text from the dataframe into one document.
    raw = " ".join(df["text"].astype(str))

    # Process with spaCy to extract noun chunks and named entities.
    doc = nlp(raw.lower())

    # Combine noun chunks and relevant named entities as keyword candidates.
    cand = " ".join(
        [c.text for c in doc.noun_chunks]
        + [e.text for e in doc.ents if e.label_ in {"PRODUCT", "EVENT", "ORG", "GPE"}]
    )

    # Strip Reddit-boilerplate terms. Whole-word matching is required here:
    # the previous substring replace mangled unrelated words (e.g. "bot"
    # turned "robot" into "ro", "support" destroyed "supporters").
    boilerplate = [
        "blog",
        "topic",
        "locked",
        "author",
        "moderator",
        "error",
        "bot",
        "comments",
        "archive",
        "support",
        "discord",
    ]
    cand = re.sub(r"\b(?:" + "|".join(boilerplate) + r")\b", " ", cand)

    # Use KeyBERT to extract keywords with diversity (MMR).
    return kw_model.extract_keywords(
        cand,
        keyphrase_ngram_range=(1, 3),
        stop_words="english",
        use_mmr=True,
        diversity=0.8,
        top_n=top_n,
    )