romybeaute committed · Commit 2a3b238 · verified · 1 Parent(s): 0f73406

Delete app2.0.py

Files changed (1): app2.0.py +0 -259
app2.0.py DELETED
@@ -1,259 +0,0 @@
- """
- File: app2.0.py
- Description: Unified MOSAIC App (Lite + Pro).
- Switches between CPU/Lite and GPU/LLM modes automatically based on environment variables.
- """
-
- # =====================================================================
- # Imports
- # =====================================================================
-
- import os
- import sys
- import json
- import re
- import logging
- from pathlib import Path
-
- import streamlit as st
- import pandas as pd
- import numpy as np
- import nltk
-
- # Standard ML Imports
- from bertopic import BERTopic
- from sentence_transformers import SentenceTransformer
- from sklearn.feature_extraction.text import CountVectorizer
- from umap import UMAP
- from hdbscan import HDBSCAN
- import datamapplot
- import matplotlib.pyplot as plt
- from huggingface_hub import hf_hub_download
-
- # Configure Logging
- logging.basicConfig(level=logging.INFO)
- logger = logging.getLogger(__name__)
-
- # --- 1. DETECT MODE ---
- # We check if the user has enabled the LLM in Hugging Face Secrets/Env Vars
- ENABLE_LLM = os.getenv("ENABLE_LLM", "False").lower() in ("true", "1", "yes")
-
- # Try to import LLM libraries only if enabled
- LLM_MODULES_AVAILABLE = False
- if ENABLE_LLM:
-     try:
-         from llama_cpp import Llama
-         from bertopic.representation import LlamaCPP
-         LLM_MODULES_AVAILABLE = True
-         logger.info("🟢 LLM Modules imported successfully.")
-     except ImportError as e:
-         logger.warning(f"🔴 ENABLE_LLM is True, but libraries are missing: {e}. Falling back to Lite mode.")
-         ENABLE_LLM = False
-
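
ENABLE_LLM is read once, at import time, so on a Hugging Face Space the Pro mode would be enabled by adding ENABLE_LLM=true as a Space variable or secret. For a quick local test, a minimal (hypothetical) toggle is:

    # Hypothetical local toggle: must run before this module is imported,
    # because ENABLE_LLM is evaluated at import time.
    import os
    os.environ["ENABLE_LLM"] = "true"  # then launch: streamlit run app2.0.py
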
- # =====================================================================
- # NLTK setup
- # =====================================================================
-
- NLTK_DATA_DIR = "/usr/local/share/nltk_data"
- if NLTK_DATA_DIR not in nltk.data.path:
-     nltk.data.path.append(NLTK_DATA_DIR)
-
- for resource in ("punkt_tab", "punkt"):
-     try:
-         nltk.data.find(f"tokenizers/{resource}")
-     except LookupError:
-         try:
-             nltk.download(resource, download_dir=NLTK_DATA_DIR)
-         except Exception:
-             pass
-
- # =====================================================================
- # Path / Cache Utils
- # =====================================================================
-
- # Fallback path logic (works without 'mosaic' package)
- def _env(key: str, default: str) -> Path:
-     val = os.getenv(key, default)
-     return Path(val).expanduser().resolve()
-
- _DATA_ROOT = _env("MOSAIC_DATA", str(Path(__file__).parent / "data"))
- PROC_DIR = _DATA_ROOT / "preprocessed"
- CACHE_DIR = PROC_DIR / "cache"
- EVAL_DIR = _env("MOSAIC_EVAL", str(Path(__file__).parent / "eval"))
-
- for p in [PROC_DIR, CACHE_DIR, EVAL_DIR]:
-     p.mkdir(parents=True, exist_ok=True)
-
- def _slugify(s: str) -> str:
-     s = s.strip()
-     return re.sub(r"[^A-Za-z0-9._-]+", "_", s) or "DATASET"
-
- def _cleanup_old_cache(current_slug: str):
-     if not CACHE_DIR.exists():
-         return
-     for p in CACHE_DIR.glob("precomputed_*.npy"):
-         if current_slug not in p.name:
-             try:
-                 p.unlink()
-             except Exception:
-                 pass
-
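
For reference, a worked example of _slugify (inputs invented): every run of characters outside [A-Za-z0-9._-] collapses to a single underscore, and "DATASET" is only the fallback for an empty result:

    _slugify("My Data (v2)")  # -> "My_Data_v2_"
    _slugify("")              # -> "DATASET"
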
- # =====================================================================
- # Streamlit App
- # =====================================================================
-
- st.set_page_config(page_title="MOSAIC Dashboard", layout="wide")
- st.title("MOSAIC: Topic Modelling Dashboard")
-
- # --- Status Indicator ---
- if ENABLE_LLM:
-     st.info("🟢 **Pro Mode Active:** LLM Labeling (Llama-3-8B) is ENABLED.")
- else:
-     st.warning("🟡 **Lite Mode Active:** Running on CPU (Keyword labels only).")
-
- # =====================================================================
- # Helper Functions
- # =====================================================================
-
- @st.cache_resource
- def load_embedding_model(model_name):
-     return SentenceTransformer(model_name)
-
- @st.cache_resource
- def load_llm_model():
-     """Loads LlamaCPP model only if enabled."""
-     if not ENABLE_LLM or not LLM_MODULES_AVAILABLE:
-         return None
-
-     status_container = st.empty()
-     status_container.info("⏳ Loading Llama-3-8B (Quantized)... This may take 1-2 minutes.")
-
-     try:
-         model_repo = "NousResearch/Meta-Llama-3-8B-Instruct-GGUF"
-         model_file = "Meta-Llama-3-8B-Instruct-Q4_K_M.gguf"
-         model_path = hf_hub_download(repo_id=model_repo, filename=model_file)
-
-         # Offload layers to GPU if available, otherwise CPU
-         llm = Llama(model_path=model_path, n_gpu_layers=-1, n_ctx=8192, verbose=False)
-         status_container.success("✅ LLM Loaded!")
-         return llm
-     except Exception as e:
-         status_container.error(f"Failed to load LLM: {e}")
-         return None
-
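
In llama-cpp-python, n_gpu_layers=-1 requests that all layers be offloaded to the GPU; without one, the model still loads but inference is CPU-bound and slow. A sketch for constrained hardware (values are illustrative):

    # Force CPU-only inference with a smaller context window
    llm = Llama(model_path=model_path, n_gpu_layers=0, n_ctx=4096, verbose=False)
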
- @st.cache_data
- def load_precomputed_data(docs_file, emb_file):
-     return np.load(docs_file, allow_pickle=True).tolist(), np.load(emb_file, allow_pickle=True)
-
- def get_config_hash(cfg):
-     return json.dumps(cfg, sort_keys=True)
-
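
Despite its name, get_config_hash returns the sorted-keys JSON string itself, not a digest. Since json.dumps(cfg, sort_keys=True) is deterministic for equal dicts, the string serves as a stable Streamlit cache key while doubling as the payload that perform_topic_modeling parses back with json.loads. For example:

    # Equal configs serialize identically regardless of key insertion order
    get_config_hash({"a": 1, "b": 2}) == get_config_hash({"b": 2, "a": 1})  # True
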
- # =====================================================================
- # Topic Modeling Core
- # =====================================================================
-
- @st.cache_data
- def perform_topic_modeling(_docs, _embeddings, config_hash):
-     _docs = list(_docs)
-     _embeddings = np.ascontiguousarray(_embeddings, dtype=np.float32)
-     config = json.loads(config_hash)
-
-     # JSON round-trips tuples as lists, so restore the ngram_range tuple
-     if "ngram_range" in config["vectorizer_params"]:
-         config["vectorizer_params"]["ngram_range"] = tuple(config["vectorizer_params"]["ngram_range"])
-
-     # --- Representation Model Logic (The Switch) ---
-     rep_model = None
-     if ENABLE_LLM and config.get("use_llm", False):
-         llm = load_llm_model()
-         if llm:
-             prompt = "Q:\nI have a topic described by keywords: '[KEYWORDS]'.\nThe documents are: [DOCUMENTS]\nProvide a short label (5 words max).\nA:"
-             rep_model = {"LLM": LlamaCPP(llm, prompt=prompt, nr_docs=10, doc_length=200, tokenizer="whitespace")}
-
-     # --- BERTopic Setup ---
-     topic_model = BERTopic(
-         umap_model=UMAP(random_state=42, metric="cosine", **config["umap_params"]),
-         hdbscan_model=HDBSCAN(metric="euclidean", prediction_data=True, **config["hdbscan_params"]),
-         vectorizer_model=CountVectorizer(**config["vectorizer_params"]) if config["use_vectorizer"] else None,
-         representation_model=rep_model,
-         top_n_words=config["bt_params"]["top_n_words"],
-         nr_topics=None if config["bt_params"]["nr_topics"] == "auto" else int(config["bt_params"]["nr_topics"]),
-         verbose=False
-     )
-
-     topics, _ = topic_model.fit_transform(_docs, _embeddings)
-     info = topic_model.get_topic_info()
-
-     # --- Label Extraction ---
-     if rep_model and "LLM" in topic_model.get_topics(full=True):
-         raw_labels = [label[0][0] for label in topic_model.get_topics(full=True)["LLM"].values()]
-         final_labels = [l.split(":")[-1].strip().strip('"') if l else "Unlabelled" for l in raw_labels]
-         all_labels = [final_labels[t + topic_model._outliers] if t != -1 else "Unlabelled" for t in topics]
-     else:
-         name_map = info.set_index("Topic")["Name"].to_dict()
-         all_labels = [name_map[t] for t in topics]
-
-     # --- Visualization Data ---
-     reduced = UMAP(n_neighbors=15, n_components=2, min_dist=0.0, metric="cosine", random_state=42).fit_transform(_embeddings)
-
-     outlier_pct = 0
-     if -1 in info.Topic.values:
-         outlier_pct = (info.Count[info.Topic == -1].iloc[0] / info.Count.sum()) * 100
-
-     return topic_model, reduced, all_labels, len(info) - 1, outlier_pct
-
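
A minimal sketch of how the returned values could be consumed downstream (and why datamapplot/matplotlib are imported above); the create_plot call and variable names here are assumptions, not code recovered from this file:

    topic_model, reduced, all_labels, n_topics, outlier_pct = perform_topic_modeling(
        docs, embeddings, get_config_hash(config)
    )
    st.metric("Topics", n_topics)
    st.metric("Outliers (%)", f"{outlier_pct:.1f}")
    fig, _ = datamapplot.create_plot(reduced, np.array(all_labels))  # 2-D coords + per-doc labels
    st.pyplot(fig)
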
- # =====================================================================
- # Main UI Logic
- # =====================================================================
-
- st.sidebar.header("Data & Model")
- source = st.sidebar.radio("Data Source", ["Server CSV", "Upload CSV"])
- CSV_PATH = None
-
- if source == "Server CSV":
-     csvs = [str(p) for p in sorted(PROC_DIR.glob("*.csv"))]
-     if csvs:
-         CSV_PATH = st.sidebar.selectbox("Select File", csvs)
- else:
-     up = st.sidebar.file_uploader("Upload CSV", type=["csv"])
-     if up:
-         safe_name = _slugify(os.path.splitext(up.name)[0])
-         _cleanup_old_cache(safe_name)
-         CSV_PATH = str(PROC_DIR / f"{safe_name}.csv")
-         pd.read_csv(up).to_csv(CSV_PATH, index=False)
-         st.success(f"Saved: {safe_name}")
-
- if CSV_PATH:
-     # --- Data Loading ---
-     df = pd.read_csv(CSV_PATH)
-
-     # Try to find text columns (object dtype)
-     text_cols = [c for c in df.columns if df[c].dtype == object]
-     if not text_cols:
-         st.error("No text columns found.")
-         st.stop()
-
-     # Prefer a column whose name mentions "reflection" or "text"
-     # (e.g. "reflection_answer_english"), else default to the first text column
-     default_idx = 0
-     for i, col in enumerate(text_cols):
-         if "reflection" in col or "text" in col:
-             default_idx = i
-             break
-
-     selected_text_col = st.sidebar.selectbox("Text Column", text_cols, index=default_idx)
-
-     # --- Config ---
-     st.sidebar.markdown("---")
-     st.sidebar.subheader("Params")
-     nr_topics = st.sidebar.text_input("Topics (auto or int)", "auto")
-
-     # Run Button
-     if st.sidebar.button("Run Analysis", type="primary"):
-         with st.spinner("Processing..."):
-             docs = df[selected_text_col].dropna().astype(str).tolist()
-
-             # Simple embedding (in a real app, cache this!)
-             emb_model = load_embedding_model("BAAI/bge-small-en-v1.5")
-             embeddings = emb_model.encode(docs, show_progress_bar=True)
-
-             # Config
-             config = {
-                 "umap_params": {"n_neighbors": 15, "n_components": 5, "min_dist": 0.0},
-                 "hdbscan_params": {"min_cluster_size": 10, "min_samples": 5},
-                 "