Spaces:

MilaNLProc
/

wordify

Build error

Pietro Lesci committed on Oct 4, 2021

Commit

c718eb8

1 Parent(s): e8a4a19

update

Files changed (3) hide show

src/configs.py CHANGED Viewed

@@ -33,4 +33,4 @@ class Languages(Enum):
 class SupportedFiles(Enum):
     xlsx = (lambda x: pd.read_excel(x, dtype=str),)
     csv = (lambda x: pd.read_csv(x, dtype=str),)
-    parquet = (lambda x: pd.read_parquet(x, dtype=str),)

 class SupportedFiles(Enum):
     xlsx = (lambda x: pd.read_excel(x, dtype=str),)
     csv = (lambda x: pd.read_csv(x, dtype=str),)
+    parquet = (lambda x: pd.read_parquet(x),)

src/pages/home.py CHANGED Viewed

@@ -108,7 +108,7 @@ def write(session, uploaded_file):
             pre_steps = pre_steps_elem.multiselect(
                 "Select pre-lemmatization preprocessing steps (ordered)",
                 options=steps_options,
-                default=steps_options[1:],
                 format_func=lambda x: x.replace("_", " ").title(),
                 key=session.run_id,
             )
@@ -146,6 +146,8 @@ def write(session, uploaded_file):
             post_steps=post_steps,
         )
         # ==== 3. PROVIDE FEEDBACK ON OPTIONS ==== #
         if show_sample and not (label_column and text_column):
             st.warning("Please select `label` and `text` columns")
@@ -155,6 +157,8 @@ def write(session, uploaded_file):
             sample_data[f"preprocessed_{text_column}"] = preprocessing_pipeline(
                 sample_data[text_column]
             ).values
             st.table(
                 sample_data.loc[
                     :, [label_column, text_column, f"preprocessed_{text_column}"]
@@ -174,6 +178,8 @@ def write(session, uploaded_file):
                     data[text_column]
                 ).values
                 inputs = encode(data[f"preprocessed_{text_column}"], data[label_column])
                 session.posdf, session.negdf = wordifier(**inputs)
             st.success("Wordified!")

             pre_steps = pre_steps_elem.multiselect(
                 "Select pre-lemmatization preprocessing steps (ordered)",
                 options=steps_options,
+                default=steps_options,
                 format_func=lambda x: x.replace("_", " ").title(),
                 key=session.run_id,
             )
             post_steps=post_steps,
         )
+        print(preprocessing_pipeline.pre_steps)
         # ==== 3. PROVIDE FEEDBACK ON OPTIONS ==== #
         if show_sample and not (label_column and text_column):
             st.warning("Please select `label` and `text` columns")
             sample_data[f"preprocessed_{text_column}"] = preprocessing_pipeline(
                 sample_data[text_column]
             ).values
+            print(sample_data)
             st.table(
                 sample_data.loc[
                     :, [label_column, text_column, f"preprocessed_{text_column}"]
                     data[text_column]
                 ).values
+                print(data.head())
                 inputs = encode(data[f"preprocessed_{text_column}"], data[label_column])
                 session.posdf, session.negdf = wordifier(**inputs)
             st.success("Wordified!")

src/preprocessing.py CHANGED Viewed

@@ -115,7 +115,7 @@ class Lemmatizer:
         elif remove_stop and not lemmatization:
             def lemmatizer_fn(doc: spacy.tokens.doc.Doc) -> str:
-                return " ".join([t for t in doc if not t.is_stop])
         elif lemmatization and not remove_stop:

         elif remove_stop and not lemmatization:
             def lemmatizer_fn(doc: spacy.tokens.doc.Doc) -> str:
+                return " ".join([t.text for t in doc if not t.is_stop])
         elif lemmatization and not remove_stop: