romybeaute committed
Commit 3fd614a · verified · 1 Parent(s): f2ee42e

Update app.py

Files changed (1): app.py (+10 -98)
app.py CHANGED
@@ -130,28 +130,9 @@ def count_clean_reports(csv_path: str) -> int:
     return len(df)
 
 
-def ensure_sentence_tokenizer():
-    """
-    Make sure NLTK sentence tokenizer data is available.
-
-    Newer NLTK (3.9+) uses 'punkt_tab' for sent_tokenize();
-    older versions use 'punkt'.
-    """
-    for resource in ("punkt_tab", "punkt"):
-        try:
-            nltk.data.find(f"tokenizers/{resource}")
-            return
-        except LookupError:
-            # Try to download it
-            try:
-                nltk.download(resource)
-                return
-            except Exception as e:
-                print(f"Failed to download NLTK resource '{resource}': {e}")
-
-    # If we reach here, we didn't manage to get any tokenizer
-    raise LookupError("Could not load NLTK punkt or punkt_tab tokenizer data.")
-
+# --- THIS CONFLICTING FUNCTION IS NOW REMOVED ---
+# def ensure_sentence_tokenizer():
+#     ...
 
 
 # =====================================================================
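The runtime download fallback is now gone entirely; the modified block further down in generate_and_save_embeddings treats missing tokenizer data as a build failure instead. A minimal sketch of the corresponding build-time step, assuming the Space fetches NLTK data once during setup (illustrative, not part of this commit):

    import nltk

    # Hypothetical one-off setup script, run at build time rather than at runtime:
    # fetch both tokenizer packages so nltk.sent_tokenize() never has to download
    # (or raise LookupError) while the app is serving users.
    for resource in ("punkt", "punkt_tab"):
        nltk.download(resource, quiet=True)
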
@@ -171,17 +152,10 @@ st.markdown(
     """
 )
 
-# ROOT = project_root()
-# sys.path.append(str(ROOT / "MULTILINGUAL"))
-
-
-
 # =====================================================================
 # 2. Dataset paths (using MOSAIC structure)
 # =====================================================================
 
-# DATASET = "INNERSPEECH"
-
 # --- Choose dataset/project name (drives folder names) ---
 ds_input = st.sidebar.text_input("Project/Dataset name", value="MOSAIC", key="dataset_name_input")
 DATASET_DIR = _slugify(ds_input).upper()
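DATASET_DIR comes from slugifying and upper-casing the sidebar input. _slugify itself is defined outside this hunk, so the implementation below is an assumed stand-in, purely for illustration:

    import re

    def _slugify(name: str) -> str:
        # Assumed stand-in for the app's own _slugify (defined elsewhere in app.py).
        return re.sub(r"[^a-z0-9]+", "_", name.lower()).strip("_")

    print(_slugify("my pilot study!").upper())  # -> MY_PILOT_STUDY
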
@@ -209,10 +183,6 @@ with st.sidebar.expander("About the dataset name", expanded=False):
     """.strip()
 )
 
-# DATASETS = {
-#     "API Translation (Batched)": str(PROC_DIR / "innerspeech_translated_batched_API.csv"),
-#     "Local Translation (Llama)": str(PROC_DIR / "innerspeech_dataset_translated_llama.csv"),
-# }
 
 def _list_server_csvs(proc_dir: Path) -> list[str]:
     return [str(p) for p in sorted(proc_dir.glob("*.csv"))]
@@ -287,21 +257,6 @@ def perform_topic_modeling(_docs, _embeddings, config_hash):
     # Prepare vectorizer parameters
     if "ngram_range" in config["vectorizer_params"]:
         config["vectorizer_params"]["ngram_range"] = tuple(config["vectorizer_params"]["ngram_range"])
-
-    # Load LLM for labeling
-    # llm = load_llm_model()  # <-- REMOVED
-
-    # prompt = """Q:
-    # You are an expert in micro-phenomenology. The following documents are reflections from participants about their experience.
-    # I have a topic that contains the following documents:
-    # [DOCUMENTS]
-    # The topic is described by the following keywords: '[KEYWORDS]'.
-    # Based on the above information, give a short, informative label (5–10 words).
-    # A:"""
-
-    # rep_model = {
-    #     "LLM": LlamaCPP(llm, prompt=prompt, nr_docs=25, doc_length=300, tokenizer="whitespace")
-    # }
 
     # <-- MODIFIED: Use BERTopic's default representation instead of LLM
     rep_model = None
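With rep_model = None, BERTopic falls back to its default c-TF-IDF keyword representation. Should richer labels be wanted later without reviving the LlamaCPP labeler, one LLM-free option is BERTopic's built-in KeyBERTInspired representation; a sketch, not what this commit ships:

    from bertopic import BERTopic
    from bertopic.representation import KeyBERTInspired

    # KeyBERTInspired re-ranks candidate topic words against document embeddings,
    # typically yielding cleaner labels than raw c-TF-IDF keywords, with no LLM involved.
    topic_model = BERTopic(representation_model=KeyBERTInspired())
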
@@ -384,15 +339,15 @@ def generate_and_save_embeddings(csv_path, docs_file, emb_file,
     # ---------------------
     # Sentence / report granularity
     # ---------------------
+
+    # --- THIS BLOCK IS NOW MODIFIED ---
     if split_sentences:
         try:
-            ensure_sentence_tokenizer()
-        except LookupError as e:
-            st.error(f"Failed to load NLTK sentence tokenizer data: {e}")
+            sentences = [s for r in reports for s in nltk.sent_tokenize(r)]
+            docs = [s for s in sentences if len(s.split()) > 2]
+        except LookupError:
+            st.error("NLTK 'punkt' data not found! This is a build error.")
             st.stop()
-
-        sentences = [s for r in reports for s in nltk.sent_tokenize(r)]
-        docs = [s for s in sentences if len(s.split()) > 2]
     else:
         docs = reports
 
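The sentence path now tokenizes directly inside the try block and keeps only sentences longer than two words. The same splitting and filtering as a standalone sketch, with toy data:

    import nltk

    reports = ["I felt calm. Then a thought arose.", "No words."]
    sentences = [s for r in reports for s in nltk.sent_tokenize(r)]
    docs = [s for s in sentences if len(s.split()) > 2]
    # Keeps 'I felt calm.' and 'Then a thought arose.'; drops the two-word 'No words.'
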
@@ -445,32 +400,6 @@ source = st.sidebar.radio(
 uploaded_csv_path = None
 CSV_PATH = None  # will be set in the chosen branch
 
-# if source == "Use preprocessed CSV on server":
-#     # Show dataset selector ONLY in this branch
-#     selected_dataset_name = st.sidebar.selectbox(
-#         "Choose a dataset",
-#         list(DATASETS.keys()),
-#         key="dataset_name",
-#     )
-#     CSV_PATH = DATASETS[selected_dataset_name]
-
-# else:  # Upload my own CSV
-#     up = st.sidebar.file_uploader("Upload a CSV", type=["csv"], key="upload_csv")
-#     if up is not None:
-#         tmp_df = pd.read_csv(up)
-#         col = _pick_text_column(tmp_df)
-#         if col is None:
-#             st.error("CSV must contain a text column such as: " + ", ".join(ACCEPTABLE_TEXT_COLUMNS))
-#             st.stop()
-#         if col != "reflection_answer_english":
-#             tmp_df = tmp_df.rename(columns={col: "reflection_answer_english"})
-#         uploaded_csv_path = str((PROC_DIR / "uploaded.csv").resolve())
-#         tmp_df.to_csv(uploaded_csv_path, index=False)
-#         st.success(f"Uploaded CSV saved to {uploaded_csv_path}")
-#         CSV_PATH = uploaded_csv_path
-#     else:
-#         st.info("Upload a CSV to continue.")
-#         st.stop()
 if source == "Use preprocessed CSV on server":
     # List preprocessed CSVs inside this dataset’s folder
     available = _list_server_csvs(PROC_DIR)
@@ -517,15 +446,7 @@ st.sidebar.markdown("---")
 # --- Embedding model selection ---
 st.sidebar.header("Model Selection")
 
-
-
-# selected_embedding_model = st.sidebar.selectbox("Choose an embedding model", (
-#     "intfloat/multilingual-e5-large-instruct",
-#     "Qwen/Qwen3-Embedding-0.6B",
-#     "BAAI/bge-small-en-v1.5",
-#     "sentence-transformers/all-mpnet-base-v2",
-# ))
-
+# Default is now the small model to prevent OOM crash on start
 selected_embedding_model = st.sidebar.selectbox("Choose an embedding model", (
     "BAAI/bge-small-en-v1.5",
     "intfloat/multilingual-e5-large-instruct",
@@ -533,10 +454,6 @@ selected_embedding_model = st.sidebar.selectbox("Choose an embedding model", (
     "sentence-transformers/all-mpnet-base-v2",
 ))
 
-
-
-
-
 # --- Device selection ---
 # st.sidebar.header("Data Preparation")
 selected_device = st.sidebar.radio(
@@ -545,11 +462,6 @@ selected_device = st.sidebar.radio(
     index=0,
 )
 
-
-
-
-
-
 # =====================================================================
 # 7. Precompute filenames and pipeline triggers
 # =====================================================================
 