Spaces:
Build error
Build error
Pietro Lesci
committed on
Commit
·
fdbadfe
1
Parent(s):
ca663e1
add missing typing
Browse files- src/preprocessing.py +11 -11
- src/utils.py +68 -67
src/preprocessing.py
CHANGED
|
@@ -19,22 +19,22 @@ from .configs import Languages
|
|
| 19 |
# and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
|
| 20 |
# fmt: off
|
| 21 |
_re_normalize_acronyms = re.compile(r"(?:[a-zA-Z]\.){2,}")
|
| 22 |
-
def normalize_acronyms(t):
|
| 23 |
return _re_normalize_acronyms.sub(t.translate(str.maketrans("", "", string.punctuation)).upper(), t)
|
| 24 |
|
| 25 |
|
| 26 |
_re_non_word = re.compile(r"\W")
|
| 27 |
-
def remove_non_word(t):
|
| 28 |
return _re_non_word.sub(" ", t)
|
| 29 |
|
| 30 |
|
| 31 |
_re_space = re.compile(r" {2,}")
|
| 32 |
-
def normalize_useless_spaces(t):
|
| 33 |
return _re_space.sub(" ", t)
|
| 34 |
|
| 35 |
|
| 36 |
_re_rep = re.compile(r"(\S)(\1{2,})")
|
| 37 |
-
def normalize_repeating_chars(t):
|
| 38 |
def _replace_rep(m):
|
| 39 |
c, cc = m.groups()
|
| 40 |
return c
|
|
@@ -43,7 +43,7 @@ def normalize_repeating_chars(t):
|
|
| 43 |
|
| 44 |
|
| 45 |
_re_wrep = re.compile(r"(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)")
|
| 46 |
-
def normalize_repeating_words(t):
|
| 47 |
def _replace_wrep(m):
|
| 48 |
c, cc, e = m.groups()
|
| 49 |
return c
|
|
@@ -92,11 +92,10 @@ class PreprocessingPipeline:
|
|
| 92 |
self.post = self.make_pre_post_component(self.post_steps)
|
| 93 |
self.lemma = self.lemmatization_component()[self.lemmatization_step]
|
| 94 |
|
| 95 |
-
def apply_multiproc(fn, series):
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
return new_series
|
| 100 |
|
| 101 |
def vaex_process(self, df: DataFrame, text_column: str) -> DataFrame:
|
| 102 |
def fn(t):
|
|
@@ -106,8 +105,9 @@ class PreprocessingPipeline:
|
|
| 106 |
vdf["processed_text"] = vdf.apply(
|
| 107 |
fn, arguments=[vdf[text_column]], vectorize=False
|
| 108 |
)
|
|
|
|
| 109 |
|
| 110 |
-
return
|
| 111 |
|
| 112 |
def __call__(self, series: Series) -> Series:
|
| 113 |
if self.pre:
|
|
|
|
| 19 |
# and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
|
| 20 |
# fmt: off
|
| 21 |
_re_normalize_acronyms = re.compile(r"(?:[a-zA-Z]\.){2,}")
|
| 22 |
+
def normalize_acronyms(t: str) -> str:
|
| 23 |
return _re_normalize_acronyms.sub(t.translate(str.maketrans("", "", string.punctuation)).upper(), t)
|
| 24 |
|
| 25 |
|
| 26 |
_re_non_word = re.compile(r"\W")
|
| 27 |
+
def remove_non_word(t: str) -> str:
|
| 28 |
return _re_non_word.sub(" ", t)
|
| 29 |
|
| 30 |
|
| 31 |
_re_space = re.compile(r" {2,}")
|
| 32 |
+
def normalize_useless_spaces(t: str) -> str:
|
| 33 |
return _re_space.sub(" ", t)
|
| 34 |
|
| 35 |
|
| 36 |
_re_rep = re.compile(r"(\S)(\1{2,})")
|
| 37 |
+
def normalize_repeating_chars(t: str) -> str:
|
| 38 |
def _replace_rep(m):
|
| 39 |
c, cc = m.groups()
|
| 40 |
return c
|
|
|
|
| 43 |
|
| 44 |
|
| 45 |
_re_wrep = re.compile(r"(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)")
|
| 46 |
+
def normalize_repeating_words(t: str) -> str:
|
| 47 |
def _replace_wrep(m):
|
| 48 |
c, cc, e = m.groups()
|
| 49 |
return c
|
|
|
|
| 92 |
self.post = self.make_pre_post_component(self.post_steps)
|
| 93 |
self.lemma = self.lemmatization_component()[self.lemmatization_step]
|
| 94 |
|
| 95 |
+
# def apply_multiproc(fn, series):
|
| 96 |
+
# with mp.Pool(mp.cpu_count()) as pool:
|
| 97 |
+
# new_series = pool.map(fn, series)
|
| 98 |
+
# return new_series
|
|
|
|
| 99 |
|
| 100 |
def vaex_process(self, df: DataFrame, text_column: str) -> DataFrame:
|
| 101 |
def fn(t):
|
|
|
|
| 105 |
vdf["processed_text"] = vdf.apply(
|
| 106 |
fn, arguments=[vdf[text_column]], vectorize=False
|
| 107 |
)
|
| 108 |
+
df = vdf.to_pandas_df()
|
| 109 |
|
| 110 |
+
return df
|
| 111 |
|
| 112 |
def __call__(self, series: Series) -> Series:
|
| 113 |
if self.pre:
|
src/utils.py
CHANGED
|
@@ -1,14 +1,15 @@
|
|
| 1 |
import base64
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
import pandas as pd
|
| 5 |
import streamlit as st
|
| 6 |
from PIL import Image
|
| 7 |
|
|
|
|
|
|
|
| 8 |
from .configs import SupportedFiles, ColumnNames
|
| 9 |
|
| 10 |
|
| 11 |
-
def get_col_indices(cols):
|
| 12 |
"""Ugly but works"""
|
| 13 |
cols = [i.lower() for i in cols]
|
| 14 |
try:
|
|
@@ -25,12 +26,12 @@ def get_col_indices(cols):
|
|
| 25 |
|
| 26 |
|
| 27 |
@st.cache
|
| 28 |
-
def get_logo(path):
|
| 29 |
return Image.open(path)
|
| 30 |
|
| 31 |
|
| 32 |
@st.experimental_memo
|
| 33 |
-
def read_file(uploaded_file) ->
|
| 34 |
file_type = uploaded_file.name.split(".")[-1]
|
| 35 |
read_fn = SupportedFiles[file_type].value[0]
|
| 36 |
df = read_fn(uploaded_file)
|
|
@@ -39,12 +40,12 @@ def read_file(uploaded_file) -> pd.DataFrame:
|
|
| 39 |
|
| 40 |
|
| 41 |
@st.cache
|
| 42 |
-
def convert_df(df):
|
| 43 |
# IMPORTANT: Cache the conversion to prevent computation on every rerun
|
| 44 |
return df.to_csv(index=False, sep=";").encode("utf-8")
|
| 45 |
|
| 46 |
|
| 47 |
-
def download_button(dataframe:
|
| 48 |
csv = dataframe.to_csv(index=False)
|
| 49 |
# some strings <-> bytes conversions necessary here
|
| 50 |
b64 = base64.b64encode(csv.encode()).decode()
|
|
@@ -52,79 +53,79 @@ def download_button(dataframe: pd.DataFrame, name: str):
|
|
| 52 |
st.write(href, unsafe_allow_html=True)
|
| 53 |
|
| 54 |
|
| 55 |
-
def plot_labels_prop(data:
|
| 56 |
|
| 57 |
-
|
| 58 |
|
| 59 |
-
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
|
| 69 |
-
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
|
| 93 |
-
|
| 94 |
|
| 95 |
|
| 96 |
-
def plot_nchars(data:
|
| 97 |
-
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
|
| 110 |
-
|
| 111 |
|
| 112 |
|
| 113 |
-
def plot_score(data:
|
| 114 |
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
|
| 130 |
-
|
|
|
|
| 1 |
import base64
|
| 2 |
+
from typing import List, Tuple
|
| 3 |
+
from pandas.core.frame import DataFrame
|
|
|
|
| 4 |
import streamlit as st
|
| 5 |
from PIL import Image
|
| 6 |
|
| 7 |
+
# import altair as alt
|
| 8 |
+
|
| 9 |
from .configs import SupportedFiles, ColumnNames
|
| 10 |
|
| 11 |
|
| 12 |
+
def get_col_indices(cols: List) -> Tuple[int, int]:
|
| 13 |
"""Ugly but works"""
|
| 14 |
cols = [i.lower() for i in cols]
|
| 15 |
try:
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
@st.cache
|
| 29 |
+
def get_logo(path: str) -> Image:
|
| 30 |
return Image.open(path)
|
| 31 |
|
| 32 |
|
| 33 |
@st.experimental_memo
|
| 34 |
+
def read_file(uploaded_file) -> DataFrame:
|
| 35 |
file_type = uploaded_file.name.split(".")[-1]
|
| 36 |
read_fn = SupportedFiles[file_type].value[0]
|
| 37 |
df = read_fn(uploaded_file)
|
|
|
|
| 40 |
|
| 41 |
|
| 42 |
@st.cache
|
| 43 |
+
def convert_df(df: DataFrame) -> bytes:
|
| 44 |
# IMPORTANT: Cache the conversion to prevent computation on every rerun
|
| 45 |
return df.to_csv(index=False, sep=";").encode("utf-8")
|
| 46 |
|
| 47 |
|
| 48 |
+
def download_button(dataframe: DataFrame, name: str) -> None:
|
| 49 |
csv = dataframe.to_csv(index=False)
|
| 50 |
# some strings <-> bytes conversions necessary here
|
| 51 |
b64 = base64.b64encode(csv.encode()).decode()
|
|
|
|
| 53 |
st.write(href, unsafe_allow_html=True)
|
| 54 |
|
| 55 |
|
| 56 |
+
# def plot_labels_prop(data: DataFrame, label_column: str):
|
| 57 |
|
| 58 |
+
# unique_value_limit = 100
|
| 59 |
|
| 60 |
+
# if data[label_column].nunique() > unique_value_limit:
|
| 61 |
|
| 62 |
+
# st.warning(
|
| 63 |
+
# f"""
|
| 64 |
+
# The column you selected has more than {unique_value_limit}.
|
| 65 |
+
# Are you sure it's the right column? If it is, please note that
|
| 66 |
+
# this will impact __Wordify__ performance.
|
| 67 |
+
# """
|
| 68 |
+
# )
|
| 69 |
|
| 70 |
+
# return
|
| 71 |
|
| 72 |
+
# source = (
|
| 73 |
+
# data[label_column]
|
| 74 |
+
# .value_counts()
|
| 75 |
+
# .reset_index()
|
| 76 |
+
# .rename(columns={"index": "Labels", label_column: "Counts"})
|
| 77 |
+
# )
|
| 78 |
+
# source["Props"] = source["Counts"] / source["Counts"].sum()
|
| 79 |
+
# source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
|
| 80 |
|
| 81 |
+
# bars = (
|
| 82 |
+
# alt.Chart(source)
|
| 83 |
+
# .mark_bar()
|
| 84 |
+
# .encode(
|
| 85 |
+
# x=alt.X("Labels:O", sort="-y"),
|
| 86 |
+
# y="Counts:Q",
|
| 87 |
+
# )
|
| 88 |
+
# )
|
| 89 |
|
| 90 |
+
# text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
|
| 91 |
+
# text="Proportions:O"
|
| 92 |
+
# )
|
| 93 |
|
| 94 |
+
# return (bars + text).properties(height=300)
|
| 95 |
|
| 96 |
|
| 97 |
+
# def plot_nchars(data: DataFrame, text_column: str):
|
| 98 |
+
# source = data[text_column].str.len().to_frame()
|
| 99 |
|
| 100 |
+
# plot = (
|
| 101 |
+
# alt.Chart(source)
|
| 102 |
+
# .mark_bar()
|
| 103 |
+
# .encode(
|
| 104 |
+
# alt.X(
|
| 105 |
+
# f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")
|
| 106 |
+
# ),
|
| 107 |
+
# alt.Y("count()", axis=alt.Axis(title="")),
|
| 108 |
+
# )
|
| 109 |
+
# )
|
| 110 |
|
| 111 |
+
# return plot.properties(height=300)
|
| 112 |
|
| 113 |
|
| 114 |
+
# def plot_score(data: DataFrame, label_col: str, label: str):
|
| 115 |
|
| 116 |
+
# source = (
|
| 117 |
+
# data.loc[data[label_col] == label]
|
| 118 |
+
# .sort_values("score", ascending=False)
|
| 119 |
+
# .head(100)
|
| 120 |
+
# )
|
| 121 |
|
| 122 |
+
# plot = (
|
| 123 |
+
# alt.Chart(source)
|
| 124 |
+
# .mark_bar()
|
| 125 |
+
# .encode(
|
| 126 |
+
# y=alt.Y("word:O", sort="-x"),
|
| 127 |
+
# x="score:Q",
|
| 128 |
+
# )
|
| 129 |
+
# )
|
| 130 |
|
| 131 |
+
# return plot.properties(height=max(30 * source.shape[0], 50))
|