Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -130,28 +130,9 @@ def count_clean_reports(csv_path: str) -> int:
|
|
| 130 |
return len(df)
|
| 131 |
|
| 132 |
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
Newer NLTK (3.9+) uses 'punkt_tab' for sent_tokenize(),
|
| 138 |
-
older versions use 'punkt'
|
| 139 |
-
"""
|
| 140 |
-
for resource in ("punkt_tab", "punkt"):
|
| 141 |
-
try:
|
| 142 |
-
nltk.data.find(f"tokenizers/{resource}")
|
| 143 |
-
return
|
| 144 |
-
except LookupError:
|
| 145 |
-
# Try to download it
|
| 146 |
-
try:
|
| 147 |
-
nltk.download(resource)
|
| 148 |
-
return
|
| 149 |
-
except Exception as e:
|
| 150 |
-
print(f"Failed to download NLTK resource '{resource}': {e}")
|
| 151 |
-
|
| 152 |
-
# If we reach here, we didn't manage to get any tokenizer
|
| 153 |
-
raise LookupError("Could not load NLTK punkt or punkt_tab tokenizer data.")
|
| 154 |
-
|
| 155 |
|
| 156 |
|
| 157 |
# =====================================================================
|
|
@@ -171,17 +152,10 @@ st.markdown(
|
|
| 171 |
"""
|
| 172 |
)
|
| 173 |
|
| 174 |
-
# ROOT = project_root()
|
| 175 |
-
# sys.path.append(str(ROOT / "MULTILINGUAL"))
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
# =====================================================================
|
| 180 |
# 2. Dataset paths (using MOSAIC structure)
|
| 181 |
# =====================================================================
|
| 182 |
|
| 183 |
-
# DATASET = "INNERSPEECH"
|
| 184 |
-
|
| 185 |
# --- Choose dataset/project name (drives folder names) ---
|
| 186 |
ds_input = st.sidebar.text_input("Project/Dataset name", value="MOSAIC", key="dataset_name_input")
|
| 187 |
DATASET_DIR = _slugify(ds_input).upper()
|
|
@@ -209,10 +183,6 @@ with st.sidebar.expander("About the dataset name", expanded=False):
|
|
| 209 |
""".strip()
|
| 210 |
)
|
| 211 |
|
| 212 |
-
# DATASETS = {
|
| 213 |
-
# "API Translation (Batched)": str(PROC_DIR / "innerspeech_translated_batched_API.csv"),
|
| 214 |
-
# "Local Translation (Llama)": str(PROC_DIR / "innerspeech_dataset_translated_llama.csv"),
|
| 215 |
-
# }
|
| 216 |
|
| 217 |
def _list_server_csvs(proc_dir: Path) -> list[str]:
|
| 218 |
return [str(p) for p in sorted(proc_dir.glob("*.csv"))]
|
|
@@ -287,21 +257,6 @@ def perform_topic_modeling(_docs, _embeddings, config_hash):
|
|
| 287 |
# Prepare vectorizer parameters
|
| 288 |
if "ngram_range" in config["vectorizer_params"]:
|
| 289 |
config["vectorizer_params"]["ngram_range"] = tuple(config["vectorizer_params"]["ngram_range"])
|
| 290 |
-
|
| 291 |
-
# Load LLM for labeling
|
| 292 |
-
# llm = load_llm_model() # <-- REMOVED
|
| 293 |
-
|
| 294 |
-
# prompt = """Q:
|
| 295 |
-
# You are an expert in micro-phenomenology. The following documents are reflections from participants about their experience.
|
| 296 |
-
# I have a topic that contains the following documents:
|
| 297 |
-
# [DOCUMENTS]
|
| 298 |
-
# The topic is described by the following keywords: '[KEYWORDS]'.
|
| 299 |
-
# Based on the above information, give a short, informative label (5–10 words).
|
| 300 |
-
# A:"""
|
| 301 |
-
|
| 302 |
-
# rep_model = {
|
| 303 |
-
# "LLM": LlamaCPP(llm, prompt=prompt, nr_docs=25, doc_length=300, tokenizer="whitespace")
|
| 304 |
-
# }
|
| 305 |
|
| 306 |
# <-- MODIFIED: Use BERTopic's default representation instead of LLM
|
| 307 |
rep_model = None
|
|
@@ -384,15 +339,15 @@ def generate_and_save_embeddings(csv_path, docs_file, emb_file,
|
|
| 384 |
# ---------------------
|
| 385 |
# Sentence / report granularity
|
| 386 |
# ---------------------
|
|
|
|
|
|
|
| 387 |
if split_sentences:
|
| 388 |
try:
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
|
|
|
| 392 |
st.stop()
|
| 393 |
-
|
| 394 |
-
sentences = [s for r in reports for s in nltk.sent_tokenize(r)]
|
| 395 |
-
docs = [s for s in sentences if len(s.split()) > 2]
|
| 396 |
else:
|
| 397 |
docs = reports
|
| 398 |
|
|
@@ -445,32 +400,6 @@ source = st.sidebar.radio(
|
|
| 445 |
uploaded_csv_path = None
|
| 446 |
CSV_PATH = None # will be set in the chosen branch
|
| 447 |
|
| 448 |
-
# if source == "Use preprocessed CSV on server":
|
| 449 |
-
# # Show dataset selector ONLY in this branch
|
| 450 |
-
# selected_dataset_name = st.sidebar.selectbox(
|
| 451 |
-
# "Choose a dataset",
|
| 452 |
-
# list(DATASETS.keys()),
|
| 453 |
-
# key="dataset_name",
|
| 454 |
-
# )
|
| 455 |
-
# CSV_PATH = DATASETS[selected_dataset_name]
|
| 456 |
-
|
| 457 |
-
# else: # Upload my own CSV
|
| 458 |
-
# up = st.sidebar.file_uploader("Upload a CSV", type=["csv"], key="upload_csv")
|
| 459 |
-
# if up is not None:
|
| 460 |
-
# tmp_df = pd.read_csv(up)
|
| 461 |
-
# col = _pick_text_column(tmp_df)
|
| 462 |
-
# if col is None:
|
| 463 |
-
# st.error("CSV must contain a text column such as: " + ", ".join(ACCEPTABLE_TEXT_COLUMNS))
|
| 464 |
-
# st.stop()
|
| 465 |
-
# if col != "reflection_answer_english":
|
| 466 |
-
# tmp_df = tmp_df.rename(columns={col: "reflection_answer_english"})
|
| 467 |
-
# uploaded_csv_path = str((PROC_DIR / "uploaded.csv").resolve())
|
| 468 |
-
# tmp_df.to_csv(uploaded_csv_path, index=False)
|
| 469 |
-
# st.success(f"Uploaded CSV saved to {uploaded_csv_path}")
|
| 470 |
-
# CSV_PATH = uploaded_csv_path
|
| 471 |
-
# else:
|
| 472 |
-
# st.info("Upload a CSV to continue.")
|
| 473 |
-
# st.stop()
|
| 474 |
if source == "Use preprocessed CSV on server":
|
| 475 |
# List preprocessed CSVs inside this dataset’s folder
|
| 476 |
available = _list_server_csvs(PROC_DIR)
|
|
@@ -517,15 +446,7 @@ st.sidebar.markdown("---")
|
|
| 517 |
# --- Embedding model selection ---
|
| 518 |
st.sidebar.header("Model Selection")
|
| 519 |
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
# selected_embedding_model = st.sidebar.selectbox("Choose an embedding model", (
|
| 523 |
-
# "intfloat/multilingual-e5-large-instruct",
|
| 524 |
-
# "Qwen/Qwen3-Embedding-0.6B",
|
| 525 |
-
# "BAAI/bge-small-en-v1.5",
|
| 526 |
-
# "sentence-transformers/all-mpnet-base-v2",
|
| 527 |
-
# ))
|
| 528 |
-
|
| 529 |
selected_embedding_model = st.sidebar.selectbox("Choose an embedding model", (
|
| 530 |
"BAAI/bge-small-en-v1.5",
|
| 531 |
"intfloat/multilingual-e5-large-instruct",
|
|
@@ -533,10 +454,6 @@ selected_embedding_model = st.sidebar.selectbox("Choose an embedding model", (
|
|
| 533 |
"sentence-transformers/all-mpnet-base-v2",
|
| 534 |
))
|
| 535 |
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
# --- Device selection ---
|
| 541 |
# st.sidebar.header("Data Preparation")
|
| 542 |
selected_device = st.sidebar.radio(
|
|
@@ -545,11 +462,6 @@ selected_device = st.sidebar.radio(
|
|
| 545 |
index=0,
|
| 546 |
)
|
| 547 |
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
# =====================================================================
|
| 554 |
# 7. Precompute filenames and pipeline triggers
|
| 555 |
# =====================================================================
|
|
|
|
| 130 |
return len(df)
|
| 131 |
|
| 132 |
|
| 133 |
+
# --- THIS CONFLICTING FUNCTION IS NOW REMOVED ---
|
| 134 |
+
# def ensure_sentence_tokenizer():
|
| 135 |
+
# ...
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
|
| 138 |
# =====================================================================
|
|
|
|
| 152 |
"""
|
| 153 |
)
|
| 154 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
# =====================================================================
|
| 156 |
# 2. Dataset paths (using MOSAIC structure)
|
| 157 |
# =====================================================================
|
| 158 |
|
|
|
|
|
|
|
| 159 |
# --- Choose dataset/project name (drives folder names) ---
|
| 160 |
ds_input = st.sidebar.text_input("Project/Dataset name", value="MOSAIC", key="dataset_name_input")
|
| 161 |
DATASET_DIR = _slugify(ds_input).upper()
|
|
|
|
| 183 |
""".strip()
|
| 184 |
)
|
| 185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
|
| 187 |
def _list_server_csvs(proc_dir: Path) -> list[str]:
|
| 188 |
return [str(p) for p in sorted(proc_dir.glob("*.csv"))]
|
|
|
|
| 257 |
# Prepare vectorizer parameters
|
| 258 |
if "ngram_range" in config["vectorizer_params"]:
|
| 259 |
config["vectorizer_params"]["ngram_range"] = tuple(config["vectorizer_params"]["ngram_range"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
|
| 261 |
# <-- MODIFIED: Use BERTopic's default representation instead of LLM
|
| 262 |
rep_model = None
|
|
|
|
| 339 |
# ---------------------
|
| 340 |
# Sentence / report granularity
|
| 341 |
# ---------------------
|
| 342 |
+
|
| 343 |
+
# --- THIS BLOCK IS NOW MODIFIED ---
|
| 344 |
if split_sentences:
|
| 345 |
try:
|
| 346 |
+
sentences = [s for r in reports for s in nltk.sent_tokenize(r)]
|
| 347 |
+
docs = [s for s in sentences if len(s.split()) > 2]
|
| 348 |
+
except LookupError:
|
| 349 |
+
st.error("NLTK 'punkt' data not found! This is a build error.")
|
| 350 |
st.stop()
|
|
|
|
|
|
|
|
|
|
| 351 |
else:
|
| 352 |
docs = reports
|
| 353 |
|
|
|
|
| 400 |
uploaded_csv_path = None
|
| 401 |
CSV_PATH = None # will be set in the chosen branch
|
| 402 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 403 |
if source == "Use preprocessed CSV on server":
|
| 404 |
# List preprocessed CSVs inside this dataset’s folder
|
| 405 |
available = _list_server_csvs(PROC_DIR)
|
|
|
|
| 446 |
# --- Embedding model selection ---
|
| 447 |
st.sidebar.header("Model Selection")
|
| 448 |
|
| 449 |
+
# Default is now the small model to prevent OOM crash on start
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 450 |
selected_embedding_model = st.sidebar.selectbox("Choose an embedding model", (
|
| 451 |
"BAAI/bge-small-en-v1.5",
|
| 452 |
"intfloat/multilingual-e5-large-instruct",
|
|
|
|
| 454 |
"sentence-transformers/all-mpnet-base-v2",
|
| 455 |
))
|
| 456 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 457 |
# --- Device selection ---
|
| 458 |
# st.sidebar.header("Data Preparation")
|
| 459 |
selected_device = st.sidebar.radio(
|
|
|
|
| 462 |
index=0,
|
| 463 |
)
|
| 464 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
# =====================================================================
|
| 466 |
# 7. Precompute filenames and pipeline triggers
|
| 467 |
# =====================================================================
|