Spaces:

Nomio4640
/

NLP-intelligence

Running

File size: 3,347 Bytes

"""
services.py — shared singleton NLP services.

Changes from previous version:
  1. Seeds ALL hardcoded MONGOLIAN_STOPWORDS into the DB on startup so
     the admin can see and edit the full list. Uses INSERT OR IGNORE so
     server restarts never create duplicates.
  2. Preprocessor now reads stopwords FROM the DB (so admin additions/
     deletions are always reflected after reload_preprocessor()).
  3. MIN_TOPICS_DOCS lowered to match topic_modeler.py's new threshold of 3.
"""

import os
from typing import Optional

from nlp_core.knowledge_base import KnowledgeBase
from nlp_core.preprocessing import Preprocessor, MONGOLIAN_STOPWORDS
from nlp_core.ner_engine import NEREngine
from nlp_core.sentiment import SentimentAnalyzer
from nlp_core.topic_modeler import TopicModeler
from nlp_core.network_analyzer import NetworkAnalyzer

# ---------------------------------------------------------------------------
# DB path — resolves to webapp/knowledge.db regardless of cwd
# ---------------------------------------------------------------------------
_HERE = os.path.dirname(os.path.abspath(__file__))
# Check if an external DB_PATH is provided via environment variables (e.g., Colab Google Drive).
# Otherwise, default to the local knowledge.db inside the project folder.
DB_PATH = os.environ.get(
    "DB_PATH", 
    os.path.normpath(os.path.join(_HERE, "..", "..", "knowledge.db"))
)

# ---------------------------------------------------------------------------
# Singleton instances
# ---------------------------------------------------------------------------
kb = KnowledgeBase(db_path=DB_PATH)

# Seed all hardcoded stopwords into the DB on first run.
# INSERT OR IGNORE makes this safe to call on every restart.
_seeded = kb.seed_stopwords(list(MONGOLIAN_STOPWORDS))
if _seeded > 0:
    print(f"[services] Seeded {_seeded} default stopwords into DB.")

# Preprocessor reads stopwords from DB — now includes all defaults
# plus anything the admin has added via the UI.
preprocessor = Preprocessor(extra_stopwords=kb.get_stopwords())

# Heavy ML models — lazy-loaded inside the classes on first actual use.
# Keeping them as module-level objects means HuggingFace pipelines are
# only constructed ONCE per server lifetime, not once per request.
ner       = NEREngine()
sentiment = SentimentAnalyzer()
topic     = TopicModeler()
network   = NetworkAnalyzer()

# ---------------------------------------------------------------------------
# In-memory cache of the last analysis (for /network and /insights)
# ---------------------------------------------------------------------------
_last_analysis = None


def get_last_analysis():
    return _last_analysis


def set_last_analysis(result) -> None:
    global _last_analysis
    _last_analysis = result


# ---------------------------------------------------------------------------
# Reload — called by POST /admin/reload after admin changes
# ---------------------------------------------------------------------------

def reload_preprocessor() -> None:
    """
    Rebuild the Preprocessor with the latest stopword list from the DB.
    Call this after the admin adds or removes stopwords so the change
    takes effect on the next analysis request without restarting.
    """
    global preprocessor
    preprocessor = Preprocessor(extra_stopwords=kb.get_stopwords())