| | import re |
| | from spacy.tokens import Doc |
| | from spacy.language import Language |
| |
|
| |
|
# Whitespace run that ends a line. The whole run is matched (not just one
# character) so that several trailing blanks cannot sit between a hyphen and
# its line break and hide it from _HYPHEN_LINE_BREAK.
_TRAILING_WHITESPACE = re.compile(r"\s+$", re.MULTILINE)

# A letter in the a-z / A-Z / à-ÿ / À-Ÿ ranges followed by a hyphen and a
# line break, i.e. a word that was cut at the end of a line. Group 1 keeps
# the letter and the hyphen; only the line break is dropped.
_HYPHEN_LINE_BREAK = re.compile(r"(([a-zà-ÿ]|[A-ZÀ-Ÿ])-)\n")


@Language.component("preprocess_text")
def preprocess_text(doc: Doc) -> Doc:
    """Rebuild *doc* from its text after repairing hard line breaks.

    The text is cleaned in two regex passes and then split on whitespace:

    1. whitespace at the end of each line is removed;
    2. a letter + hyphen directly followed by a line break is glued to the
       next line, keeping the hyphen ("exem-" / "ple" becomes "exem-ple").

    Args:
        doc: The incoming document; only ``doc.text`` and ``doc.vocab`` are
            read.

    Returns:
        A new ``Doc`` sharing ``doc.vocab`` whose tokens are the
        whitespace-separated words of the cleaned text. It is built from
        words only, so nothing else from the incoming ``doc`` is copied.
    """
    cleaned = _TRAILING_WHITESPACE.sub("", doc.text)
    cleaned = _HYPHEN_LINE_BREAK.sub(r"\1", cleaned)

    # str.split() without arguments splits on any run of whitespace, so the
    # remaining newlines and repeated blanks need no separate normalisation.
    return Doc(doc.vocab, words=cleaned.split())
| |
|
| |
|
# "preprocess_text" is registered by the @Language.component decorator on the
# function definition; registering the same function again here was redundant.