Spaces:
Build error
Build error
Pietro Lesci
committed on
Commit
·
fdbadfe
1
Parent(s):
ca663e1
add missing typing
Browse files- src/preprocessing.py +11 -11
- src/utils.py +68 -67
src/preprocessing.py
CHANGED
|
@@ -19,22 +19,22 @@ from .configs import Languages
|
|
| 19 |
# and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
|
| 20 |
# fmt: off
|
| 21 |
_re_normalize_acronyms = re.compile(r"(?:[a-zA-Z]\.){2,}")
|
| 22 |
-
def normalize_acronyms(t):
|
| 23 |
return _re_normalize_acronyms.sub(t.translate(str.maketrans("", "", string.punctuation)).upper(), t)
|
| 24 |
|
| 25 |
|
| 26 |
_re_non_word = re.compile(r"\W")
|
| 27 |
-
def remove_non_word(t):
|
| 28 |
return _re_non_word.sub(" ", t)
|
| 29 |
|
| 30 |
|
| 31 |
_re_space = re.compile(r" {2,}")
|
| 32 |
-
def normalize_useless_spaces(t):
|
| 33 |
return _re_space.sub(" ", t)
|
| 34 |
|
| 35 |
|
| 36 |
_re_rep = re.compile(r"(\S)(\1{2,})")
|
| 37 |
-
def normalize_repeating_chars(t):
|
| 38 |
def _replace_rep(m):
|
| 39 |
c, cc = m.groups()
|
| 40 |
return c
|
|
@@ -43,7 +43,7 @@ def normalize_repeating_chars(t):
|
|
| 43 |
|
| 44 |
|
| 45 |
_re_wrep = re.compile(r"(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)")
|
| 46 |
-
def normalize_repeating_words(t):
|
| 47 |
def _replace_wrep(m):
|
| 48 |
c, cc, e = m.groups()
|
| 49 |
return c
|
|
@@ -92,11 +92,10 @@ class PreprocessingPipeline:
|
|
| 92 |
self.post = self.make_pre_post_component(self.post_steps)
|
| 93 |
self.lemma = self.lemmatization_component()[self.lemmatization_step]
|
| 94 |
|
| 95 |
-
def apply_multiproc(fn, series):
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
return new_series
|
| 100 |
|
| 101 |
def vaex_process(self, df: DataFrame, text_column: str) -> DataFrame:
|
| 102 |
def fn(t):
|
|
@@ -106,8 +105,9 @@ class PreprocessingPipeline:
|
|
| 106 |
vdf["processed_text"] = vdf.apply(
|
| 107 |
fn, arguments=[vdf[text_column]], vectorize=False
|
| 108 |
)
|
|
|
|
| 109 |
|
| 110 |
-
return
|
| 111 |
|
| 112 |
def __call__(self, series: Series) -> Series:
|
| 113 |
if self.pre:
|
|
|
|
| 19 |
# and [here](https://textacy.readthedocs.io/en/latest/api_reference/preprocessing.html)
|
| 20 |
# fmt: off
|
| 21 |
_re_normalize_acronyms = re.compile(r"(?:[a-zA-Z]\.){2,}")
|
| 22 |
+
def normalize_acronyms(t: str) -> str:
|
| 23 |
return _re_normalize_acronyms.sub(t.translate(str.maketrans("", "", string.punctuation)).upper(), t)
|
| 24 |
|
| 25 |
|
| 26 |
_re_non_word = re.compile(r"\W")
|
| 27 |
+
def remove_non_word(t: str) -> str:
|
| 28 |
return _re_non_word.sub(" ", t)
|
| 29 |
|
| 30 |
|
| 31 |
_re_space = re.compile(r" {2,}")
|
| 32 |
+
def normalize_useless_spaces(t: str) -> str:
|
| 33 |
return _re_space.sub(" ", t)
|
| 34 |
|
| 35 |
|
| 36 |
_re_rep = re.compile(r"(\S)(\1{2,})")
|
| 37 |
+
def normalize_repeating_chars(t: str) -> str:
|
| 38 |
def _replace_rep(m):
|
| 39 |
c, cc = m.groups()
|
| 40 |
return c
|
|
|
|
| 43 |
|
| 44 |
|
| 45 |
_re_wrep = re.compile(r"(?:\s|^)(\w+)\s+((?:\1\s+)+)\1(\s|\W|$)")
|
| 46 |
+
def normalize_repeating_words(t: str) -> str:
|
| 47 |
def _replace_wrep(m):
|
| 48 |
c, cc, e = m.groups()
|
| 49 |
return c
|
|
|
|
| 92 |
self.post = self.make_pre_post_component(self.post_steps)
|
| 93 |
self.lemma = self.lemmatization_component()[self.lemmatization_step]
|
| 94 |
|
| 95 |
+
# def apply_multiproc(fn, series):
|
| 96 |
+
# with mp.Pool(mp.cpu_count()) as pool:
|
| 97 |
+
# new_series = pool.map(fn, series)
|
| 98 |
+
# return new_series
|
|
|
|
| 99 |
|
| 100 |
def vaex_process(self, df: DataFrame, text_column: str) -> DataFrame:
|
| 101 |
def fn(t):
|
|
|
|
| 105 |
vdf["processed_text"] = vdf.apply(
|
| 106 |
fn, arguments=[vdf[text_column]], vectorize=False
|
| 107 |
)
|
| 108 |
+
df = vdf.to_pandas_df()
|
| 109 |
|
| 110 |
+
return df
|
| 111 |
|
| 112 |
def __call__(self, series: Series) -> Series:
|
| 113 |
if self.pre:
|
src/utils.py
CHANGED
|
@@ -1,14 +1,15 @@
|
|
| 1 |
import base64
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
import pandas as pd
|
| 5 |
import streamlit as st
|
| 6 |
from PIL import Image
|
| 7 |
|
|
|
|
|
|
|
| 8 |
from .configs import SupportedFiles, ColumnNames
|
| 9 |
|
| 10 |
|
| 11 |
-
def get_col_indices(cols):
|
| 12 |
"""Ugly but works"""
|
| 13 |
cols = [i.lower() for i in cols]
|
| 14 |
try:
|
|
@@ -25,12 +26,12 @@ def get_col_indices(cols):
|
|
| 25 |
|
| 26 |
|
| 27 |
@st.cache
|
| 28 |
-
def get_logo(path):
|
| 29 |
return Image.open(path)
|
| 30 |
|
| 31 |
|
| 32 |
@st.experimental_memo
|
| 33 |
-
def read_file(uploaded_file) ->
|
| 34 |
file_type = uploaded_file.name.split(".")[-1]
|
| 35 |
read_fn = SupportedFiles[file_type].value[0]
|
| 36 |
df = read_fn(uploaded_file)
|
|
@@ -39,12 +40,12 @@ def read_file(uploaded_file) -> pd.DataFrame:
|
|
| 39 |
|
| 40 |
|
| 41 |
@st.cache
|
| 42 |
-
def convert_df(df):
|
| 43 |
# IMPORTANT: Cache the conversion to prevent computation on every rerun
|
| 44 |
return df.to_csv(index=False, sep=";").encode("utf-8")
|
| 45 |
|
| 46 |
|
| 47 |
-
def download_button(dataframe:
|
| 48 |
csv = dataframe.to_csv(index=False)
|
| 49 |
# some strings <-> bytes conversions necessary here
|
| 50 |
b64 = base64.b64encode(csv.encode()).decode()
|
|
@@ -52,79 +53,79 @@ def download_button(dataframe: pd.DataFrame, name: str):
|
|
| 52 |
st.write(href, unsafe_allow_html=True)
|
| 53 |
|
| 54 |
|
| 55 |
-
def plot_labels_prop(data:
|
| 56 |
|
| 57 |
-
|
| 58 |
|
| 59 |
-
|
| 60 |
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
|
| 69 |
-
|
| 70 |
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
|
| 93 |
-
|
| 94 |
|
| 95 |
|
| 96 |
-
def plot_nchars(data:
|
| 97 |
-
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
|
| 110 |
-
|
| 111 |
|
| 112 |
|
| 113 |
-
def plot_score(data:
|
| 114 |
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
|
| 130 |
-
|
|
|
|
| 1 |
import base64
|
| 2 |
+
from typing import List, Tuple
|
| 3 |
+
from pandas.core.frame import DataFrame
|
|
|
|
| 4 |
import streamlit as st
|
| 5 |
from PIL import Image
|
| 6 |
|
| 7 |
+
# import altair as alt
|
| 8 |
+
|
| 9 |
from .configs import SupportedFiles, ColumnNames
|
| 10 |
|
| 11 |
|
| 12 |
+
def get_col_indices(cols: List) -> Tuple[int, int]:
|
| 13 |
"""Ugly but works"""
|
| 14 |
cols = [i.lower() for i in cols]
|
| 15 |
try:
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
@st.cache
|
| 29 |
+
def get_logo(path: str) -> Image:
|
| 30 |
return Image.open(path)
|
| 31 |
|
| 32 |
|
| 33 |
@st.experimental_memo
|
| 34 |
+
def read_file(uploaded_file) -> DataFrame:
|
| 35 |
file_type = uploaded_file.name.split(".")[-1]
|
| 36 |
read_fn = SupportedFiles[file_type].value[0]
|
| 37 |
df = read_fn(uploaded_file)
|
|
|
|
| 40 |
|
| 41 |
|
| 42 |
@st.cache
|
| 43 |
+
def convert_df(df: DataFrame) -> bytes:
|
| 44 |
# IMPORTANT: Cache the conversion to prevent computation on every rerun
|
| 45 |
return df.to_csv(index=False, sep=";").encode("utf-8")
|
| 46 |
|
| 47 |
|
| 48 |
+
def download_button(dataframe: DataFrame, name: str) -> None:
|
| 49 |
csv = dataframe.to_csv(index=False)
|
| 50 |
# some strings <-> bytes conversions necessary here
|
| 51 |
b64 = base64.b64encode(csv.encode()).decode()
|
|
|
|
| 53 |
st.write(href, unsafe_allow_html=True)
|
| 54 |
|
| 55 |
|
| 56 |
+
# def plot_labels_prop(data: DataFrame, label_column: str):
|
| 57 |
|
| 58 |
+
# unique_value_limit = 100
|
| 59 |
|
| 60 |
+
# if data[label_column].nunique() > unique_value_limit:
|
| 61 |
|
| 62 |
+
# st.warning(
|
| 63 |
+
# f"""
|
| 64 |
+
# The column you selected has more than {unique_value_limit}.
|
| 65 |
+
# Are you sure it's the right column? If it is, please note that
|
| 66 |
+
# this will impact __Wordify__ performance.
|
| 67 |
+
# """
|
| 68 |
+
# )
|
| 69 |
|
| 70 |
+
# return
|
| 71 |
|
| 72 |
+
# source = (
|
| 73 |
+
# data[label_column]
|
| 74 |
+
# .value_counts()
|
| 75 |
+
# .reset_index()
|
| 76 |
+
# .rename(columns={"index": "Labels", label_column: "Counts"})
|
| 77 |
+
# )
|
| 78 |
+
# source["Props"] = source["Counts"] / source["Counts"].sum()
|
| 79 |
+
# source["Proportions"] = (source["Props"].round(3) * 100).map("{:,.2f}".format) + "%"
|
| 80 |
|
| 81 |
+
# bars = (
|
| 82 |
+
# alt.Chart(source)
|
| 83 |
+
# .mark_bar()
|
| 84 |
+
# .encode(
|
| 85 |
+
# x=alt.X("Labels:O", sort="-y"),
|
| 86 |
+
# y="Counts:Q",
|
| 87 |
+
# )
|
| 88 |
+
# )
|
| 89 |
|
| 90 |
+
# text = bars.mark_text(align="center", baseline="middle", dy=15).encode(
|
| 91 |
+
# text="Proportions:O"
|
| 92 |
+
# )
|
| 93 |
|
| 94 |
+
# return (bars + text).properties(height=300)
|
| 95 |
|
| 96 |
|
| 97 |
+
# def plot_nchars(data: DataFrame, text_column: str):
|
| 98 |
+
# source = data[text_column].str.len().to_frame()
|
| 99 |
|
| 100 |
+
# plot = (
|
| 101 |
+
# alt.Chart(source)
|
| 102 |
+
# .mark_bar()
|
| 103 |
+
# .encode(
|
| 104 |
+
# alt.X(
|
| 105 |
+
# f"{text_column}:Q", bin=True, axis=alt.Axis(title="# chars per text")
|
| 106 |
+
# ),
|
| 107 |
+
# alt.Y("count()", axis=alt.Axis(title="")),
|
| 108 |
+
# )
|
| 109 |
+
# )
|
| 110 |
|
| 111 |
+
# return plot.properties(height=300)
|
| 112 |
|
| 113 |
|
| 114 |
+
# def plot_score(data: DataFrame, label_col: str, label: str):
|
| 115 |
|
| 116 |
+
# source = (
|
| 117 |
+
# data.loc[data[label_col] == label]
|
| 118 |
+
# .sort_values("score", ascending=False)
|
| 119 |
+
# .head(100)
|
| 120 |
+
# )
|
| 121 |
|
| 122 |
+
# plot = (
|
| 123 |
+
# alt.Chart(source)
|
| 124 |
+
# .mark_bar()
|
| 125 |
+
# .encode(
|
| 126 |
+
# y=alt.Y("word:O", sort="-x"),
|
| 127 |
+
# x="score:Q",
|
| 128 |
+
# )
|
| 129 |
+
# )
|
| 130 |
|
| 131 |
+
# return plot.properties(height=max(30 * source.shape[0], 50))
|