Spaces:
Build error
Build error
Pietro Lesci
committed on
Commit
·
4fe22cb
1
Parent(s):
e330a04
fix bug with no-ops
Browse files - src/preprocessing.py +30 -20
src/preprocessing.py
CHANGED
|
@@ -75,6 +75,10 @@ def lemmatize_keep_stopwords(doc: spacy.tokens.doc.Doc) -> str:
|
|
| 75 |
return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])
|
| 76 |
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
# fmt: on
|
| 79 |
class PreprocessingPipeline:
|
| 80 |
def __init__(
|
|
@@ -90,8 +94,14 @@ class PreprocessingPipeline:
|
|
| 90 |
self.post_steps = post_steps
|
| 91 |
|
| 92 |
self.nlp = spacy.load(Languages[language].value, disable=["parser", "ner"])
|
| 93 |
-
self.pre =
|
| 94 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
self.lemma = self.lemmatization_component()[self.lemmatization_step]
|
| 96 |
|
| 97 |
# def apply_multiproc(fn, series):
|
|
@@ -111,28 +121,28 @@ class PreprocessingPipeline:
|
|
| 111 |
|
| 112 |
return df
|
| 113 |
|
| 114 |
-
def __call__(self, series: Series) -> Series:
|
| 115 |
-
|
| 116 |
-
|
| 117 |
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
|
| 127 |
-
|
| 128 |
-
|
| 129 |
|
| 130 |
-
|
| 131 |
|
| 132 |
-
|
| 133 |
-
|
| 134 |
|
| 135 |
-
|
| 136 |
|
| 137 |
def make_pre_post_component(self, steps: Optional[List[str]]) -> Optional[Callable]:
|
| 138 |
if not steps:
|
|
@@ -179,7 +189,7 @@ class PreprocessingPipeline:
|
|
| 179 |
[
|
| 180 |
("Spacy lemmatizer (keep stopwords)", lemmatize_keep_stopwords),
|
| 181 |
("Spacy lemmatizer (no stopwords)", lemmatize_remove_stopwords),
|
| 182 |
-
("Disable lemmatizer",
|
| 183 |
("Remove stopwords", remove_stopwords),
|
| 184 |
]
|
| 185 |
)
|
|
|
|
| 75 |
return " ".join([t.lemma_ for t in doc if t.lemma_ != "-PRON-"])
|
| 76 |
|
| 77 |
|
| 78 |
+
def identity(t):
|
| 79 |
+
return t
|
| 80 |
+
|
| 81 |
+
|
| 82 |
# fmt: on
|
| 83 |
class PreprocessingPipeline:
|
| 84 |
def __init__(
|
|
|
|
| 94 |
self.post_steps = post_steps
|
| 95 |
|
| 96 |
self.nlp = spacy.load(Languages[language].value, disable=["parser", "ner"])
|
| 97 |
+
self.pre = (
|
| 98 |
+
self.make_pre_post_component(self.pre_steps) if self.pre_steps else identity
|
| 99 |
+
)
|
| 100 |
+
self.post = (
|
| 101 |
+
self.make_pre_post_component(self.post_steps)
|
| 102 |
+
if self.post_steps
|
| 103 |
+
else identity
|
| 104 |
+
)
|
| 105 |
self.lemma = self.lemmatization_component()[self.lemmatization_step]
|
| 106 |
|
| 107 |
# def apply_multiproc(fn, series):
|
|
|
|
| 121 |
|
| 122 |
return df
|
| 123 |
|
| 124 |
+
# def __call__(self, series: Series) -> Series:
|
| 125 |
+
# if self.pre:
|
| 126 |
+
# series = series.map(self.pre)
|
| 127 |
|
| 128 |
+
# if self.lemma:
|
| 129 |
+
# total_steps = len(series) // 100
|
| 130 |
+
# res = []
|
| 131 |
+
# pbar = st.progress(0)
|
| 132 |
+
# for i, doc in enumerate(
|
| 133 |
+
# self.nlp.pipe(series, batch_size=500, n_process=os.cpu_count())
|
| 134 |
+
# ):
|
| 135 |
+
# res.append(self.lemma(doc))
|
| 136 |
|
| 137 |
+
# if i % total_steps == 0:
|
| 138 |
+
# pbar.progress(1)
|
| 139 |
|
| 140 |
+
# series = pd.Series(res)
|
| 141 |
|
| 142 |
+
# if self.post:
|
| 143 |
+
# series = series.map(self.post)
|
| 144 |
|
| 145 |
+
# return series
|
| 146 |
|
| 147 |
def make_pre_post_component(self, steps: Optional[List[str]]) -> Optional[Callable]:
|
| 148 |
if not steps:
|
|
|
|
| 189 |
[
|
| 190 |
("Spacy lemmatizer (keep stopwords)", lemmatize_keep_stopwords),
|
| 191 |
("Spacy lemmatizer (no stopwords)", lemmatize_remove_stopwords),
|
| 192 |
+
("Disable lemmatizer", identity),
|
| 193 |
("Remove stopwords", remove_stopwords),
|
| 194 |
]
|
| 195 |
)
|