Spaces:
Build error
Build error
Pietro Lesci
committed on
Commit
·
57b9989
1
Parent(s):
dbb343d
add notebook to test app
Browse files- tests/notebook.ipynb +58 -20
tests/notebook.ipynb
CHANGED
|
@@ -36,10 +36,10 @@
|
|
| 36 |
" \"normalize_bullet_points\",\n",
|
| 37 |
" \"normalize_hyphenated_words\",\n",
|
| 38 |
" \"normalize_quotation_marks\",\n",
|
|
|
|
| 39 |
" \"normalize_repeating_words\",\n",
|
| 40 |
" \"normalize_repeating_chars\",\n",
|
| 41 |
" \"normalize_whitespaces\",\n",
|
| 42 |
-
" \"normalize_useless_spaces\",\n",
|
| 43 |
" # \"replace_currency_symbols\",\n",
|
| 44 |
" # \"replace_emails\",\n",
|
| 45 |
" # \"replace_emojis\",\n",
|
|
@@ -66,20 +66,20 @@
|
|
| 66 |
"source": [
|
| 67 |
"post_steps = [\n",
|
| 68 |
" \"lowercase\",\n",
|
| 69 |
-
" \"replace_currency_symbols\",\n",
|
| 70 |
-
" \"replace_urls\",\n",
|
| 71 |
-
" \"replace_emails\",\n",
|
| 72 |
-
" \"replace_user_handles\",\n",
|
| 73 |
-
" \"replace_hashtags\",\n",
|
| 74 |
-
" \"replace_emojis\",\n",
|
| 75 |
" # \"replace_phone_numbers\",\n",
|
| 76 |
" # \"replace_numbers\",\n",
|
| 77 |
-
" \"remove_accents\",\n",
|
| 78 |
-
" \"remove_brackets\",\n",
|
| 79 |
" \"remove_html_tags\",\n",
|
|
|
|
|
|
|
| 80 |
" \"remove_non_words\",\n",
|
| 81 |
-
" \"remove_numbers\",\n",
|
| 82 |
-
" \"remove_punctuation\",\n",
|
| 83 |
" \"normalize_repeating_words\",\n",
|
| 84 |
" \"normalize_repeating_chars\",\n",
|
| 85 |
" \"normalize_useless_spaces\",\n",
|
|
@@ -172,7 +172,7 @@
|
|
| 172 |
},
|
| 173 |
{
|
| 174 |
"cell_type": "code",
|
| 175 |
-
"execution_count":
|
| 176 |
"metadata": {},
|
| 177 |
"outputs": [
|
| 178 |
{
|
|
@@ -303,7 +303,7 @@
|
|
| 303 |
"[5000 rows x 3 columns]"
|
| 304 |
]
|
| 305 |
},
|
| 306 |
-
"execution_count":
|
| 307 |
"metadata": {},
|
| 308 |
"output_type": "execute_result"
|
| 309 |
}
|
|
@@ -314,7 +314,7 @@
|
|
| 314 |
},
|
| 315 |
{
|
| 316 |
"cell_type": "code",
|
| 317 |
-
"execution_count":
|
| 318 |
"metadata": {},
|
| 319 |
"outputs": [],
|
| 320 |
"source": [
|
|
@@ -325,16 +325,16 @@
|
|
| 325 |
},
|
| 326 |
{
|
| 327 |
"cell_type": "code",
|
| 328 |
-
"execution_count":
|
| 329 |
"metadata": {},
|
| 330 |
"outputs": [
|
| 331 |
{
|
| 332 |
"data": {
|
| 333 |
"text/plain": [
|
| 334 |
-
"[1, 14, 2, 3, 4, 23, 22, 5,
|
| 335 |
]
|
| 336 |
},
|
| 337 |
-
"execution_count":
|
| 338 |
"metadata": {},
|
| 339 |
"output_type": "execute_result"
|
| 340 |
}
|
|
@@ -345,16 +345,16 @@
|
|
| 345 |
},
|
| 346 |
{
|
| 347 |
"cell_type": "code",
|
| 348 |
-
"execution_count":
|
| 349 |
"metadata": {},
|
| 350 |
"outputs": [
|
| 351 |
{
|
| 352 |
"data": {
|
| 353 |
"text/plain": [
|
| 354 |
-
"[0,
|
| 355 |
]
|
| 356 |
},
|
| 357 |
-
"execution_count":
|
| 358 |
"metadata": {},
|
| 359 |
"output_type": "execute_result"
|
| 360 |
}
|
|
@@ -381,6 +381,44 @@
|
|
| 381 |
"list(PreprocessingPipeline.lemmatization_component().keys())"
|
| 382 |
]
|
| 383 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
{
|
| 385 |
"cell_type": "code",
|
| 386 |
"execution_count": null,
|
|
|
|
| 36 |
" \"normalize_bullet_points\",\n",
|
| 37 |
" \"normalize_hyphenated_words\",\n",
|
| 38 |
" \"normalize_quotation_marks\",\n",
|
| 39 |
+
" \"normalize_useless_spaces\",\n",
|
| 40 |
" \"normalize_repeating_words\",\n",
|
| 41 |
" \"normalize_repeating_chars\",\n",
|
| 42 |
" \"normalize_whitespaces\",\n",
|
|
|
|
| 43 |
" # \"replace_currency_symbols\",\n",
|
| 44 |
" # \"replace_emails\",\n",
|
| 45 |
" # \"replace_emojis\",\n",
|
|
|
|
| 66 |
"source": [
|
| 67 |
"post_steps = [\n",
|
| 68 |
" \"lowercase\",\n",
|
| 69 |
+
" # \"replace_currency_symbols\",\n",
|
| 70 |
+
" # \"replace_urls\",\n",
|
| 71 |
+
" # \"replace_emails\",\n",
|
| 72 |
+
" # \"replace_user_handles\",\n",
|
| 73 |
+
" # \"replace_hashtags\",\n",
|
| 74 |
+
" # \"replace_emojis\",\n",
|
| 75 |
" # \"replace_phone_numbers\",\n",
|
| 76 |
" # \"replace_numbers\",\n",
|
|
|
|
|
|
|
| 77 |
" \"remove_html_tags\",\n",
|
| 78 |
+
" \"remove_accents\",\n",
|
| 79 |
+
" # \"remove_brackets\",\n",
|
| 80 |
" \"remove_non_words\",\n",
|
| 81 |
+
" # \"remove_numbers\",\n",
|
| 82 |
+
" # \"remove_punctuation\",\n",
|
| 83 |
" \"normalize_repeating_words\",\n",
|
| 84 |
" \"normalize_repeating_chars\",\n",
|
| 85 |
" \"normalize_useless_spaces\",\n",
|
|
|
|
| 172 |
},
|
| 173 |
{
|
| 174 |
"cell_type": "code",
|
| 175 |
+
"execution_count": 14,
|
| 176 |
"metadata": {},
|
| 177 |
"outputs": [
|
| 178 |
{
|
|
|
|
| 303 |
"[5000 rows x 3 columns]"
|
| 304 |
]
|
| 305 |
},
|
| 306 |
+
"execution_count": 14,
|
| 307 |
"metadata": {},
|
| 308 |
"output_type": "execute_result"
|
| 309 |
}
|
|
|
|
| 314 |
},
|
| 315 |
{
|
| 316 |
"cell_type": "code",
|
| 317 |
+
"execution_count": 15,
|
| 318 |
"metadata": {},
|
| 319 |
"outputs": [],
|
| 320 |
"source": [
|
|
|
|
| 325 |
},
|
| 326 |
{
|
| 327 |
"cell_type": "code",
|
| 328 |
+
"execution_count": 16,
|
| 329 |
"metadata": {},
|
| 330 |
"outputs": [
|
| 331 |
{
|
| 332 |
"data": {
|
| 333 |
"text/plain": [
|
| 334 |
+
"[1, 14, 2, 3, 4, 21, 23, 22, 5, 24]"
|
| 335 |
]
|
| 336 |
},
|
| 337 |
+
"execution_count": 16,
|
| 338 |
"metadata": {},
|
| 339 |
"output_type": "execute_result"
|
| 340 |
}
|
|
|
|
| 345 |
},
|
| 346 |
{
|
| 347 |
"cell_type": "code",
|
| 348 |
+
"execution_count": 17,
|
| 349 |
"metadata": {},
|
| 350 |
"outputs": [
|
| 351 |
{
|
| 352 |
"data": {
|
| 353 |
"text/plain": [
|
| 354 |
+
"[0, 17, 15, 19, 23, 22, 21, 24]"
|
| 355 |
]
|
| 356 |
},
|
| 357 |
+
"execution_count": 17,
|
| 358 |
"metadata": {},
|
| 359 |
"output_type": "execute_result"
|
| 360 |
}
|
|
|
|
| 381 |
"list(PreprocessingPipeline.lemmatization_component().keys())"
|
| 382 |
]
|
| 383 |
},
|
| 384 |
+
{
|
| 385 |
+
"cell_type": "code",
|
| 386 |
+
"execution_count": 14,
|
| 387 |
+
"metadata": {},
|
| 388 |
+
"outputs": [],
|
| 389 |
+
"source": [
|
| 390 |
+
"import re"
|
| 391 |
+
]
|
| 392 |
+
},
|
| 393 |
+
{
|
| 394 |
+
"cell_type": "code",
|
| 395 |
+
"execution_count": 27,
|
| 396 |
+
"metadata": {},
|
| 397 |
+
"outputs": [],
|
| 398 |
+
"source": [
|
| 399 |
+
"_re_non_words = re.compile(\"[^A-Za-z]+\")"
|
| 400 |
+
]
|
| 401 |
+
},
|
| 402 |
+
{
|
| 403 |
+
"cell_type": "code",
|
| 404 |
+
"execution_count": 28,
|
| 405 |
+
"metadata": {},
|
| 406 |
+
"outputs": [
|
| 407 |
+
{
|
| 408 |
+
"data": {
|
| 409 |
+
"text/plain": [
|
| 410 |
+
"'Mimmo '"
|
| 411 |
+
]
|
| 412 |
+
},
|
| 413 |
+
"execution_count": 28,
|
| 414 |
+
"metadata": {},
|
| 415 |
+
"output_type": "execute_result"
|
| 416 |
+
}
|
| 417 |
+
],
|
| 418 |
+
"source": [
|
| 419 |
+
"_re_non_words.sub(\" \", \"Mimmo23\")"
|
| 420 |
+
]
|
| 421 |
+
},
|
| 422 |
{
|
| 423 |
"cell_type": "code",
|
| 424 |
"execution_count": null,
|