romybeaute committed on
Commit
daf1dbf
·
verified ·
1 Parent(s): 96ab83e

added display text preprocessing / loading

Browse files
Files changed (1) hide show
  1. app.py +55 -14
app.py CHANGED
@@ -611,19 +611,9 @@ def generate_and_save_embeddings(
611
  df = df[df["reflection_answer_english"].str.strip() != ""]
612
  reports = df["reflection_answer_english"].tolist()
613
 
614
- # ---------------------
615
- # Sentence / report granularity
616
- # ---------------------
617
- # if split_sentences:
618
- # try:
619
- # sentences = [s for r in reports for s in nltk.sent_tokenize(r)]
620
- # docs = [s for s in sentences if len(s.split()) > 2]
621
- # except LookupError as e:
622
- # st.error(f"NLTK tokenizer data not found: {e}")
623
- # st.stop()
624
- # else:
625
- # docs = reports
626
-
627
  #change to account for sentence removal when < N words
628
  if split_sentences:
629
  try:
@@ -632,16 +622,40 @@ def generate_and_save_embeddings(
632
  st.error(f"NLTK tokenizer data not found: {e}")
633
  st.stop()
634
 
 
 
635
  if min_words and min_words > 0:
636
  docs = [s for s in sentences if len(s.split()) >= min_words]
637
  else:
638
  docs = sentences
639
  else:
 
640
  if min_words and min_words > 0:
641
  docs = [r for r in reports if len(str(r).split()) >= min_words]
642
  else:
643
  docs = reports
644
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
645
  np.save(docs_file, np.array(docs, dtype=object))
646
  st.success(f"Prepared {len(docs)} documents")
647
 
@@ -967,10 +981,37 @@ else:
967
  unit = "sentences" if selected_granularity else "reports"
968
  n_units = len(docs)
969
 
970
- c1, c2 = st.columns(2)
971
  c1.metric("Reports in CSV (cleaned)", n_reports)
972
  c2.metric(f"Units analysed ({unit})", n_units)
973
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
974
  # --- Parameter controls ---
975
  st.sidebar.header("Model Parameters")
976
 
 
611
  df = df[df["reflection_answer_english"].str.strip() != ""]
612
  reports = df["reflection_answer_english"].tolist()
613
 
614
+ #change to add data sanity check
615
+ granularity_label = "sentences" if split_sentences else "reports"
616
+
 
 
 
 
 
 
 
 
 
 
617
  #change to account for sentence removal when < N words
618
  if split_sentences:
619
  try:
 
622
  st.error(f"NLTK tokenizer data not found: {e}")
623
  st.stop()
624
 
625
+ total_units_before = len(sentences)
626
+
627
  if min_words and min_words > 0:
628
  docs = [s for s in sentences if len(s.split()) >= min_words]
629
  else:
630
  docs = sentences
631
  else:
632
+ total_units_before = len(reports)
633
  if min_words and min_words > 0:
634
  docs = [r for r in reports if len(str(r).split()) >= min_words]
635
  else:
636
  docs = reports
637
 
638
+ total_units_after = len(docs)
639
+ removed_units = total_units_before - total_units_after
640
+
641
+ # Store stats for later display in "Dataset summary"
642
+ st.session_state["last_data_stats"] = {
643
+ "granularity": granularity_label,
644
+ "min_words": int(min_words or 0),
645
+ "total_before": int(total_units_before),
646
+ "total_after": int(total_units_after),
647
+ "removed": int(removed_units),
648
+ }
649
+
650
+ if min_words and min_words > 0:
651
+ st.info(
652
+ f"Preprocessing: started with {total_units_before} {granularity_label}, "
653
+ f"removed {removed_units} shorter than {min_words} words; "
654
+ f"{total_units_after} remaining."
655
+ )
656
+ else:
657
+ st.info(f"Preprocessing: {total_units_after} {granularity_label} prepared.")
658
+
659
  np.save(docs_file, np.array(docs, dtype=object))
660
  st.success(f"Prepared {len(docs)} documents")
661
 
 
981
  unit = "sentences" if selected_granularity else "reports"
982
  n_units = len(docs)
983
 
984
+ c1, c2, c3 = st.columns(3)
985
  c1.metric("Reports in CSV (cleaned)", n_reports)
986
  c2.metric(f"Units analysed ({unit})", n_units)
987
 
988
+
989
+ stats = st.session_state.get("last_data_stats")
990
+ if (
991
+ stats is not None
992
+ and stats.get("granularity") == unit
993
+ and stats.get("min_words", 0) == int(min_words or 0)
994
+ ):
995
+ removed = stats["removed"]
996
+ total_before = stats["total_before"]
997
+ c3.metric("Units removed (< N words)", removed)
998
+ st.caption(
999
+ f"Min-words filter N = {int(min_words or 0)}. "
1000
+ f"Started with {total_before} {unit}, kept {stats['total_after']}."
1001
+ )
1002
+ else:
1003
+ c3.metric("Units removed (< N words)", "–")
1004
+ st.caption(
1005
+ "Change 'Remove units shorter than N words' and click "
1006
+ "'Prepare Data for This Configuration' to update removal stats."
1007
+ )
1008
+
1009
+
1010
+ with st.expander("Preview preprocessed text (first 10 units)"):
1011
+ preview_df = pd.DataFrame({"text": docs[:10]})
1012
+ st.dataframe(preview_df)
1013
+
1014
+
1015
  # --- Parameter controls ---
1016
  st.sidebar.header("Model Parameters")
1017