Spaces:
Running
Running
Delete app2.0.py
Browse files
app2.0.py
DELETED
|
@@ -1,259 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
File: app.py
|
| 3 |
-
Description: Unified MOSAIC App (Lite + Pro).
|
| 4 |
-
Switches between CPU/Lite and GPU/LLM modes automatically based on environment variables.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
# =====================================================================
|
| 8 |
-
# Imports
|
| 9 |
-
# =====================================================================
|
| 10 |
-
|
| 11 |
-
import os
|
| 12 |
-
import sys
|
| 13 |
-
import json
|
| 14 |
-
import re
|
| 15 |
-
import logging
|
| 16 |
-
from pathlib import Path
|
| 17 |
-
|
| 18 |
-
import streamlit as st
|
| 19 |
-
import pandas as pd
|
| 20 |
-
import numpy as np
|
| 21 |
-
import nltk
|
| 22 |
-
|
| 23 |
-
# Standard ML Imports
|
| 24 |
-
from bertopic import BERTopic
|
| 25 |
-
from sentence_transformers import SentenceTransformer
|
| 26 |
-
from sklearn.feature_extraction.text import CountVectorizer
|
| 27 |
-
from umap import UMAP
|
| 28 |
-
from hdbscan import HDBSCAN
|
| 29 |
-
import datamapplot
|
| 30 |
-
import matplotlib.pyplot as plt
|
| 31 |
-
from huggingface_hub import hf_hub_download
|
| 32 |
-
|
| 33 |
-
# Configure Logging
|
| 34 |
-
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# --- 1. DETECT MODE ---
# Pro (LLM) mode is opted into through the ENABLE_LLM environment variable
# (e.g. a Hugging Face secret); any of "true", "1", "yes" (case-insensitive)
# switches it on.
ENABLE_LLM = os.getenv("ENABLE_LLM", "False").lower() in ("true", "1", "yes")

# The LLM libraries are imported only when Pro mode was requested; if they
# are missing the app drops back to Lite mode instead of failing at start-up.
LLM_MODULES_AVAILABLE = False
if ENABLE_LLM:
    try:
        from llama_cpp import Llama
        from bertopic.representation import LlamaCPP
    except ImportError as e:
        logger.warning(f"🔴 ENABLE_LLM is True, but libraries are missing: {e}. Falling back to Lite mode.")
        ENABLE_LLM = False
    else:
        LLM_MODULES_AVAILABLE = True
        logger.info("🟢 LLM Modules imported successfully.")
|
| 52 |
-
|
| 53 |
-
# =====================================================================
|
| 54 |
-
# NLTK setup
|
| 55 |
-
# =====================================================================
|
| 56 |
-
|
| 57 |
-
NLTK_DATA_DIR = "/usr/local/share/nltk_data"
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# Make sure the sentence tokenizer data is present. Downloading is
# best-effort (the app still starts without it), but a failure is logged so
# a missing tokenizer can be diagnosed later instead of vanishing silently.
for resource in ("punkt_tab", "punkt"):
    try:
        nltk.data.find(f"tokenizers/{resource}")
    except LookupError:
        try:
            downloaded = nltk.download(resource, download_dir=NLTK_DATA_DIR)
        except Exception as exc:  # best-effort: never block start-up
            logger.warning("NLTK download of %r failed: %s", resource, exc)
        else:
            # nltk.download reports failure through its boolean return value.
            if not downloaded:
                logger.warning(
                    "NLTK resource %r could not be downloaded to %s",
                    resource,
                    NLTK_DATA_DIR,
                )
|
| 69 |
-
|
| 70 |
-
# =====================================================================
|
| 71 |
-
# Path / Cache Utils
|
| 72 |
-
# =====================================================================
|
| 73 |
-
|
| 74 |
-
# Fallback path logic (works without 'mosaic' package)
|
| 75 |
-
def _env(key: str, default: str) -> Path:
|
| 76 |
-
val = os.getenv(key, default)
|
| 77 |
-
return Path(val).expanduser().resolve()
|
| 78 |
-
|
| 79 |
-
# Data and evaluation roots default to folders next to this file and can be
# overridden through the MOSAIC_DATA / MOSAIC_EVAL environment variables.
_DATA_ROOT = _env("MOSAIC_DATA", str(Path(__file__).parent / "data"))
PROC_DIR = _DATA_ROOT / "preprocessed"
CACHE_DIR = PROC_DIR / "cache"
EVAL_DIR = _env("MOSAIC_EVAL", str(Path(__file__).parent / "eval"))

# Create the working directories up front so later code can assume they exist.
for p in (PROC_DIR, CACHE_DIR, EVAL_DIR):
    p.mkdir(parents=True, exist_ok=True)
|
| 86 |
-
|
| 87 |
-
def _slugify(s: str) -> str:
|
| 88 |
-
s = s.strip()
|
| 89 |
-
return re.sub(r"[^A-Za-z0-9._-]+", "_", s) or "DATASET"
|
| 90 |
-
|
| 91 |
-
def _cleanup_old_cache(current_slug: str):
    """Remove cached ``precomputed_*.npy`` files that belong to other datasets.

    A cache file is kept when ``current_slug`` occurs anywhere in its file
    name; every other match of ``precomputed_*.npy`` in ``CACHE_DIR`` is
    deleted.

    Args:
        current_slug: Slug of the dataset whose cache files must survive.

    Deletion is best-effort: a file that cannot be removed is logged and
    skipped rather than aborting the clean-up.
    """
    if not CACHE_DIR.exists():
        return
    for cached in CACHE_DIR.glob("precomputed_*.npy"):
        if current_slug in cached.name:
            continue
        try:
            cached.unlink()
        except OSError as exc:
            logger.warning("Could not remove stale cache file %s: %s", cached, exc)
|
| 98 |
-
|
| 99 |
-
# =====================================================================
|
| 100 |
-
# Streamlit App
|
| 101 |
-
# =====================================================================
|
| 102 |
-
|
| 103 |
-
st.set_page_config(page_title="MOSAIC Dashboard", layout="wide")
st.title("MOSAIC: Topic Modelling Dashboard")

# --- Status Indicator ---
# Show which of the two modes (Lite / Pro) this session is running in.
if not ENABLE_LLM:
    st.warning("🟡 **Lite Mode Active:** Running on CPU (Keyword labels only).")
else:
    st.info("🟢 **Pro Mode Active:** LLM Labeling (Llama-3-8B) is ENABLED.")
|
| 111 |
-
|
| 112 |
-
# =====================================================================
|
| 113 |
-
# Helper Functions
|
| 114 |
-
# =====================================================================
|
| 115 |
-
|
| 116 |
-
@st.cache_resource
def load_embedding_model(model_name):
    """Instantiate the SentenceTransformer identified by ``model_name``.

    The function is wrapped in ``st.cache_resource``, so the model object is
    built once per distinct ``model_name`` and reused afterwards.
    """
    model = SentenceTransformer(model_name)
    return model
|
| 119 |
-
|
| 120 |
-
@st.cache_resource
def _load_llm_cached():
    """Download the quantized Llama-3-8B GGUF file and build the Llama object.

    Errors propagate to the caller on purpose: only a successfully built
    model should end up in the resource cache, so that a transient failure
    (e.g. a network error during download) can be retried on the next call.
    """
    model_repo = "NousResearch/Meta-Llama-3-8B-Instruct-GGUF"
    model_file = "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"
    model_path = hf_hub_download(repo_id=model_repo, filename=model_file)

    # n_gpu_layers=-1 asks llama.cpp to offload every layer it can to the GPU.
    return Llama(model_path=model_path, n_gpu_layers=-1, n_ctx=8192, verbose=False)


def load_llm_model():
    """Return the shared LlamaCPP model, or ``None`` when it is unavailable.

    Returns:
        The cached ``Llama`` instance, or ``None`` when LLM mode is disabled,
        the LLM libraries could not be imported, or loading failed.

    Loading progress and the outcome are shown in a Streamlit placeholder.
    A failed load is logged with its traceback and is not cached, so a later
    call tries again.
    """
    if not ENABLE_LLM or not LLM_MODULES_AVAILABLE:
        return None

    status_container = st.empty()
    status_container.info("⏳ Loading Llama-3-8B (Quantized)... This may take 1-2 minutes.")

    try:
        llm = _load_llm_cached()
    except Exception as e:
        logger.exception("Failed to load LLM")
        status_container.error(f"Failed to load LLM: {e}")
        return None

    status_container.success("✅ LLM Loaded!")
    return llm


# Keep ``load_llm_model.clear()`` working for callers that relied on the
# function itself being the st.cache_resource-decorated object.
load_llm_model.clear = _load_llm_cached.clear
|
| 141 |
-
|
| 142 |
-
@st.cache_data
def load_precomputed_data(docs_file, emb_file):
    """Load a saved document list and its embedding array from ``.npy`` files.

    Args:
        docs_file: Path (or file object) of the saved documents array.
        emb_file: Path (or file object) of the saved embeddings array.

    Returns:
        A ``(docs, embeddings)`` tuple: the documents converted to a Python
        list and the embeddings as loaded by NumPy.
    """
    # NOTE(review): allow_pickle=True makes np.load unpickle arbitrary
    # objects, which is only safe for files this app produced itself --
    # confirm these paths can never point at untrusted uploads.
    docs = np.load(docs_file, allow_pickle=True).tolist()
    embeddings = np.load(emb_file, allow_pickle=True)
    return docs, embeddings
|
| 145 |
-
|
| 146 |
-
def get_config_hash(cfg):
    """Serialise ``cfg`` to a JSON string with its keys sorted.

    Sorting the keys makes the string independent of dict insertion order,
    so equal configurations always produce the same text.
    """
    serialised = json.dumps(cfg, sort_keys=True)
    return serialised
|
| 148 |
-
|
| 149 |
-
# =====================================================================
|
| 150 |
-
# Topic Modeling Core
|
| 151 |
-
# =====================================================================
|
| 152 |
-
|
| 153 |
-
@st.cache_data
def perform_topic_modeling(_docs, _embeddings, config_hash):
    """Fit BERTopic on pre-computed embeddings and prepare plotting data.

    Args:
        _docs: Iterable of documents; materialised into a list.
        _embeddings: Array-like of document embeddings, converted to a
            contiguous float32 array before use.
        config_hash: JSON string holding the run configuration. Keys read
            here: ``umap_params``, ``hdbscan_params``, ``vectorizer_params``,
            ``use_vectorizer``, ``bt_params`` (``top_n_words``,
            ``nr_topics``) and the optional ``use_llm``.

    Returns:
        ``(topic_model, reduced, all_labels, n_topics, outlier_pct)``: the
        fitted model, a 2-D UMAP projection of the embeddings, one label per
        document, the number of topics excluding the outlier topic ``-1``,
        and the percentage of documents assigned to the outlier topic.

    NOTE(review): Streamlit leaves parameters whose names start with an
    underscore out of the cache key, so results are keyed on ``config_hash``
    alone. Confirm the caller encodes the dataset identity in it, otherwise
    a different dataset with the same settings gets stale results.
    NOTE(review): ``st.cache_data`` pickles the return value -- confirm the
    fitted model (especially with the LLM representation) is picklable.
    """
    _docs = list(_docs)
    _embeddings = np.ascontiguousarray(_embeddings, dtype=np.float32)
    config = json.loads(config_hash)

    # JSON has no tuples; CountVectorizer expects ngram_range as a tuple.
    vectorizer_params = config["vectorizer_params"]
    if "ngram_range" in vectorizer_params:
        vectorizer_params["ngram_range"] = tuple(vectorizer_params["ngram_range"])

    # --- Representation Model Logic (The Switch) ---
    rep_model = None
    if ENABLE_LLM and config.get("use_llm", False):
        llm = load_llm_model()
        if llm:
            prompt = "Q:\nI have a topic described by keywords: '[KEYWORDS]'.\nThe documents are: [DOCUMENTS]\nProvide a short label (5 words max).\nA:"
            rep_model = {"LLM": LlamaCPP(llm, prompt=prompt, nr_docs=10, doc_length=200, tokenizer="whitespace")}

    # --- BERTopic Setup ---
    topic_model = BERTopic(
        umap_model=UMAP(random_state=42, metric="cosine", **config["umap_params"]),
        hdbscan_model=HDBSCAN(metric="euclidean", prediction_data=True, **config["hdbscan_params"]),
        vectorizer_model=CountVectorizer(**vectorizer_params) if config["use_vectorizer"] else None,
        representation_model=rep_model,
        top_n_words=config["bt_params"]["top_n_words"],
        nr_topics=None if config["bt_params"]["nr_topics"] == "auto" else int(config["bt_params"]["nr_topics"]),
        verbose=False,
    )

    topics, _ = topic_model.fit_transform(_docs, _embeddings)
    info = topic_model.get_topic_info()

    # --- Label Extraction ---
    llm_topics = topic_model.get_topics(full=True).get("LLM") if rep_model else None
    if llm_topics:
        # Map labels by topic id instead of by position, so the result does
        # not depend on dict ordering or on BERTopic's private `_outliers`.
        label_by_topic = {}
        for topic_id, words in llm_topics.items():
            raw_label = words[0][0] if words else ""
            if raw_label:
                # Keep only the text after the last colon, without quotes.
                label_by_topic[topic_id] = raw_label.split(":")[-1].strip().strip('"')
            else:
                label_by_topic[topic_id] = "Unlabelled"
        all_labels = [
            "Unlabelled" if t == -1 else label_by_topic.get(t, "Unlabelled")
            for t in topics
        ]
    else:
        name_map = info.set_index("Topic")["Name"].to_dict()
        all_labels = [name_map[t] for t in topics]

    # --- Visualization Data ---
    reduced = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric="cosine", random_state=42).fit_transform(_embeddings)

    has_outliers = -1 in info.Topic.values
    outlier_pct = 0
    if has_outliers:
        outlier_pct = (info.Count[info.Topic == -1].iloc[0] / info.Count.sum()) * 100

    # Count real topics only; subtracting 1 unconditionally under-counted
    # whenever no outlier topic was produced.
    n_topics = len(info) - 1 if has_outliers else len(info)

    return topic_model, reduced, all_labels, n_topics, outlier_pct
|
| 201 |
-
|
| 202 |
-
# =====================================================================
|
| 203 |
-
# Main UI Logic
|
| 204 |
-
# =====================================================================
|
| 205 |
-
|
| 206 |
-
st.sidebar.header("Data & Model")
source = st.sidebar.radio("Data Source", ["Server CSV", "Upload CSV"])
CSV_PATH = None

if source == "Server CSV":
    # Offer every CSV already present in the preprocessed-data directory.
    csvs = [str(p) for p in sorted(PROC_DIR.glob("*.csv"))]
    if csvs:
        CSV_PATH = st.sidebar.selectbox("Select File", csvs)
    else:
        st.sidebar.info("No CSV files found on the server. Upload one instead.")
else:
    up = st.sidebar.file_uploader("Upload CSV", type=["csv"])
    if up:
        safe_name = _slugify(os.path.splitext(up.name)[0])
        try:
            uploaded_df = pd.read_csv(up)
        except (pd.errors.ParserError, pd.errors.EmptyDataError, UnicodeDecodeError) as exc:
            # A malformed upload must not crash the app or purge the
            # existing cache; report it and leave CSV_PATH unset.
            st.error(f"Could not read uploaded CSV: {exc}")
        else:
            # Only after a successful parse: drop caches of other datasets
            # and persist the upload under its slugified name.
            _cleanup_old_cache(safe_name)
            CSV_PATH = str(PROC_DIR / f"{safe_name}.csv")
            uploaded_df.to_csv(CSV_PATH, index=False)
            st.success(f"Saved: {safe_name}")
|
| 221 |
-
|
| 222 |
-
if CSV_PATH:
|
| 223 |
-
# --- Data Loading ---
|
| 224 |
-
df = pd.read_csv(CSV_PATH)
|
| 225 |
-
|
| 226 |
-
# Try to find text column
|
| 227 |
-
text_cols = [c for c in df.columns if df[c].dtype == object]
|
| 228 |
-
if not text_cols:
|
| 229 |
-
st.error("No text columns found.")
|
| 230 |
-
st.stop()
|
| 231 |
-
|
| 232 |
-
# Auto-pick "reflection_answer_english" if present
|
| 233 |
-
default_idx = 0
|
| 234 |
-
for i, col in enumerate(text_cols):
|
| 235 |
-
if "reflection" in col or "text" in col:
|
| 236 |
-
default_idx = i
|
| 237 |
-
break
|
| 238 |
-
|
| 239 |
-
selected_text_col = st.sidebar.selectbox("Text Column", text_cols, index=default_idx)
|
| 240 |
-
|
| 241 |
-
# --- Config ---
|
| 242 |
-
st.sidebar.markdown("---")
|
| 243 |
-
st.sidebar.subheader("Params")
|
| 244 |
-
nr_topics = st.sidebar.text_input("Topics (auto or int)", "auto")
|
| 245 |
-
|
| 246 |
-
# Run Button
|
| 247 |
-
if st.sidebar.button("Run Analysis", type="primary"):
|
| 248 |
-
with st.spinner("Processing..."):
|
| 249 |
-
docs = df[selected_text_col].dropna().astype(str).tolist()
|
| 250 |
-
|
| 251 |
-
# Simple embedding (In real app, cache this!)
|
| 252 |
-
emb_model = load_embedding_model("BAAI/bge-small-en-v1.5")
|
| 253 |
-
embeddings = emb_model.encode(docs, show_progress_bar=True)
|
| 254 |
-
|
| 255 |
-
# Config
|
| 256 |
-
config = {
|
| 257 |
-
"umap_params": {"n_neighbors": 15, "n_components": 5, "min_dist": 0.0},
|
| 258 |
-
"hdbscan_params": {"min_cluster_size": 10, "min_samples": 5},
|
| 259 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|