Spaces:
Sleeping
Sleeping
added display text preprocessing / loading
Browse files
app.py
CHANGED
|
@@ -611,19 +611,9 @@ def generate_and_save_embeddings(
|
|
| 611 |
df = df[df["reflection_answer_english"].str.strip() != ""]
|
| 612 |
reports = df["reflection_answer_english"].tolist()
|
| 613 |
|
| 614 |
-
#
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
# if split_sentences:
|
| 618 |
-
# try:
|
| 619 |
-
# sentences = [s for r in reports for s in nltk.sent_tokenize(r)]
|
| 620 |
-
# docs = [s for s in sentences if len(s.split()) > 2]
|
| 621 |
-
# except LookupError as e:
|
| 622 |
-
# st.error(f"NLTK tokenizer data not found: {e}")
|
| 623 |
-
# st.stop()
|
| 624 |
-
# else:
|
| 625 |
-
# docs = reports
|
| 626 |
-
|
| 627 |
#change to account for sentence removal when < N words
|
| 628 |
if split_sentences:
|
| 629 |
try:
|
|
@@ -632,16 +622,40 @@ def generate_and_save_embeddings(
|
|
| 632 |
st.error(f"NLTK tokenizer data not found: {e}")
|
| 633 |
st.stop()
|
| 634 |
|
|
|
|
|
|
|
| 635 |
if min_words and min_words > 0:
|
| 636 |
docs = [s for s in sentences if len(s.split()) >= min_words]
|
| 637 |
else:
|
| 638 |
docs = sentences
|
| 639 |
else:
|
|
|
|
| 640 |
if min_words and min_words > 0:
|
| 641 |
docs = [r for r in reports if len(str(r).split()) >= min_words]
|
| 642 |
else:
|
| 643 |
docs = reports
|
| 644 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 645 |
np.save(docs_file, np.array(docs, dtype=object))
|
| 646 |
st.success(f"Prepared {len(docs)} documents")
|
| 647 |
|
|
@@ -967,10 +981,37 @@ else:
|
|
| 967 |
unit = "sentences" if selected_granularity else "reports"
|
| 968 |
n_units = len(docs)
|
| 969 |
|
| 970 |
-
c1, c2 = st.columns(2)
|
| 971 |
c1.metric("Reports in CSV (cleaned)", n_reports)
|
| 972 |
c2.metric(f"Units analysed ({unit})", n_units)
|
| 973 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 974 |
# --- Parameter controls ---
|
| 975 |
st.sidebar.header("Model Parameters")
|
| 976 |
|
|
|
|
| 611 |
df = df[df["reflection_answer_english"].str.strip() != ""]
|
| 612 |
reports = df["reflection_answer_english"].tolist()
|
| 613 |
|
| 614 |
+
#change to add data sanity check
|
| 615 |
+
granularity_label = "sentences" if split_sentences else "reports"
|
| 616 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 617 |
#change to account for sentence removal when < N words
|
| 618 |
if split_sentences:
|
| 619 |
try:
|
|
|
|
| 622 |
st.error(f"NLTK tokenizer data not found: {e}")
|
| 623 |
st.stop()
|
| 624 |
|
| 625 |
+
total_units_before = len(sentences)
|
| 626 |
+
|
| 627 |
if min_words and min_words > 0:
|
| 628 |
docs = [s for s in sentences if len(s.split()) >= min_words]
|
| 629 |
else:
|
| 630 |
docs = sentences
|
| 631 |
else:
|
| 632 |
+
total_units_before = len(reports)
|
| 633 |
if min_words and min_words > 0:
|
| 634 |
docs = [r for r in reports if len(str(r).split()) >= min_words]
|
| 635 |
else:
|
| 636 |
docs = reports
|
| 637 |
|
| 638 |
+
total_units_after = len(docs)
|
| 639 |
+
removed_units = total_units_before - total_units_after
|
| 640 |
+
|
| 641 |
+
# Store stats for later display in "Dataset summary"
|
| 642 |
+
st.session_state["last_data_stats"] = {
|
| 643 |
+
"granularity": granularity_label,
|
| 644 |
+
"min_words": int(min_words or 0),
|
| 645 |
+
"total_before": int(total_units_before),
|
| 646 |
+
"total_after": int(total_units_after),
|
| 647 |
+
"removed": int(removed_units),
|
| 648 |
+
}
|
| 649 |
+
|
| 650 |
+
if min_words and min_words > 0:
|
| 651 |
+
st.info(
|
| 652 |
+
f"Preprocessing: started with {total_units_before} {granularity_label}, "
|
| 653 |
+
f"removed {removed_units} shorter than {min_words} words; "
|
| 654 |
+
f"{total_units_after} remaining."
|
| 655 |
+
)
|
| 656 |
+
else:
|
| 657 |
+
st.info(f"Preprocessing: {total_units_after} {granularity_label} prepared.")
|
| 658 |
+
|
| 659 |
np.save(docs_file, np.array(docs, dtype=object))
|
| 660 |
st.success(f"Prepared {len(docs)} documents")
|
| 661 |
|
|
|
|
| 981 |
unit = "sentences" if selected_granularity else "reports"
|
| 982 |
n_units = len(docs)
|
| 983 |
|
| 984 |
+
c1, c2, c3 = st.columns(3)
|
| 985 |
c1.metric("Reports in CSV (cleaned)", n_reports)
|
| 986 |
c2.metric(f"Units analysed ({unit})", n_units)
|
| 987 |
|
| 988 |
+
|
| 989 |
+
stats = st.session_state.get("last_data_stats")
|
| 990 |
+
if (
|
| 991 |
+
stats is not None
|
| 992 |
+
and stats.get("granularity") == unit
|
| 993 |
+
and stats.get("min_words", 0) == int(min_words or 0)
|
| 994 |
+
):
|
| 995 |
+
removed = stats["removed"]
|
| 996 |
+
total_before = stats["total_before"]
|
| 997 |
+
c3.metric("Units removed (< N words)", removed)
|
| 998 |
+
st.caption(
|
| 999 |
+
f"Min-words filter N = {int(min_words or 0)}. "
|
| 1000 |
+
f"Started with {total_before} {unit}, kept {stats['total_after']}."
|
| 1001 |
+
)
|
| 1002 |
+
else:
|
| 1003 |
+
c3.metric("Units removed (< N words)", "–")
|
| 1004 |
+
st.caption(
|
| 1005 |
+
"Change 'Remove units shorter than N words' and click "
|
| 1006 |
+
"'Prepare Data for This Configuration' to update removal stats."
|
| 1007 |
+
)
|
| 1008 |
+
|
| 1009 |
+
|
| 1010 |
+
with st.expander("Preview preprocessed text (first 10 units)"):
|
| 1011 |
+
preview_df = pd.DataFrame({"text": docs[:10]})
|
| 1012 |
+
st.dataframe(preview_df)
|
| 1013 |
+
|
| 1014 |
+
|
| 1015 |
# --- Parameter controls ---
|
| 1016 |
st.sidebar.header("Model Parameters")
|
| 1017 |
|