|
|
|
|
|
|
|
|
|
|
|
import os, sys, warnings, json, joblib, random, re, unicodedata |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Bracketed persona tags such as "[Mori Social]" or "(Mori Técnico)", together
# with the whitespace around them.  Consecutive tags are consumed as one match.
_PS_TAG_RE = re.compile(
    r'\s*(?:[\[\(]\s*Mori\s+(?:Social|T[eé]nico|T[eé]cnico)\s*[\]\)]\s*)+',
    re.IGNORECASE,
)

# Characters that should stay attached to the preceding word when a tag that
# sat between them is removed.
_PS_CLOSING_PUNCT = ",.;:!?…)]"

# Ordered (pattern, replacement-template) pairs.  Order matters: later rules
# see the output of earlier ones.  Most entries restore accents / "ñ" that
# were dropped from the text (e.g. "maana" -> "mañana").
_PS_RAW_FIXES = (
    (r'(^|\W)T\s+puedes(?P<p>[^\w]|$)', r'\1Tú puedes\g<p>'),
    (r'(^|\W)T\s+(ya|eres|estas|estás|tienes|puedes)\b', r'\1Tú \2'),
    (r'\bclaro que s(?:i|í)?\b(?P<p>[,.\!?…])?', r'Claro que sí\g<p>'),
    (r'(^|\s)si,', r'\1Sí,'),
    (r'(\beso\s+)s(\s+est[áa]\b)', r'\1sí\2'),
    (r'(^|[\s,;:])s(\s+es\b)', r'\1sí\2'),
    (r'\btiles\b', 'útiles'),
    (r'\butiles\b', 'útiles'),
    (r'\butil\b', 'útil'),
    (r'\btil\b', 'útil'),
    (r'\bcategoras\b', 'categorías'),
    (r'\bcategora\b', 'categoría'),
    (r'\batpico\b', 'atípico'),
    (r'\batpicos\b', 'atípicos'),
    (r'\bdesempeo\b', 'desempeño'),
    (r'\baqui\b', 'aquí'),
    (r'\balgn\b', 'algún'),
    (r'\banomala\b', 'anomalía'),
    (r'\banomalas\b', 'anomalías'),
    (r'\balgun\b', 'algún'),
    (r'\bAnimo\b', 'Ánimo'),
    (r'\bcario\b', 'cariño'),
    (r'\baprendisaje\b', 'aprendizaje'),
    (r'\bmanana\b', 'mañana'),
    (r'\bmaana\b', 'mañana'),
    (r'\benergia\b', 'energía'),
    (r'\benerga\b', 'energía'),
    (r'\bextrano\b', 'extraño'),
    (r'\bextrana\b', 'extraña'),
    (r'\bextranar\b', 'extrañar'),
    (r'\bextranarte\b', 'extrañarte'),
    (r'\bextranas\b', 'extrañas'),
    (r'\bextranos\b', 'extraños'),
    (r'\baqu\b', 'aquí'),
    (r'\bestare\b', 'estaré'),
    (r'\bclarn\b', 'clarín'),
    (r'\bclarin\b', 'clarín'),
    (r'\bclar[íi]n\s+cornetas\b', 'clarín cornetas'),
    (r'(^|\s)s([,.;:!?])', r'\1Sí\2'),
    (r'\bfutbol\b', 'fútbol'),
    (r'(^|\s)as(\s+se\b)', r'\1Así\2'),
    (r'(^|\s)s(\s+orientarte\b)', r'\1sí\2'),
    (r'\bbuen dia\b', 'buen día'),
    (r'\bgran dia\b', 'gran día'),
    (r'\bdias\b', 'días'),
    (r'\bdia\b', 'día'),
    (r'\bgran da\b', 'gran día'),
    (r'\bacompa?a(r|rte|do|da|dos|das)?\b', r'acompaña\1'),
    (r'(^|\s)as([,.;:!?]|\s|$)', r'\1así\2'),
    (r'(^|\s)S lo se\b', r'\1Sí lo sé'),
    (r'(^|\s)S lo sé\b', r'\1Sí lo sé'),
    (r'\bcuidate\b', 'cuídate'),
    (r'\bcuidese\b', 'cuídese'),
    (r'\bcuidense\b', 'cuídense'),
    (r'\bpequeo\b', 'pequeño'),
    (r'\bpequea\b', 'pequeña'),
    (r'\bpequeos\b', 'pequeños'),
    (r'\bpequeas\b', 'pequeñas'),
    (r'\bunico\b', 'único'),
    (r'\bunica\b', 'única'),
    (r'\bunicos\b', 'únicos'),
    (r'\bunicas\b', 'únicas'),
    (r'\bnico\b', 'único'),
    (r'\bnica\b', 'única'),
    (r'\bnicos\b', 'únicos'),
    (r'\bnicas\b', 'únicas'),
    (r'\bestadstico\b', 'estadístico'),
    (r'\bestadstica\b', 'estadística'),
    (r'\bestadsticos\b', 'estadísticos'),
    (r'\bestadsticas\b', 'estadísticas'),
    (r'\bgracias por confiar en m\b', 'gracias por confiar en mí'),
    (r'\bcada dia\b', 'cada día'),
    (r'\bcada da\b', 'cada día'),
    (r'\bsegun\b', 'según'),
    (r'\bcaracteristica(s)?\b', r'característica\1'),
    (r'\bcaracterstica(s)?\b', r'característica\1'),
    (r'\b([a-záéíóúñ]+)cion\b', r'\1ción'),
    (r'\bdeterminacio\b', 'determinación'),
)


def _ps_match_case(original, replacement):
    """Carry the capitalisation of *original* over to *replacement*.

    All-caps input yields an all-caps replacement, a leading capital is
    preserved, and anything else returns *replacement* untouched (it is never
    lower-cased).
    """
    if len(original) > 1 and original.isupper():
        return replacement.upper()
    if original[:1].isupper():
        return replacement[:1].upper() + replacement[1:]
    return replacement


def _ps_compile_fix(pattern, template):
    """Compile one fix rule into a ``(regex, replacement)`` pair for ``sub``."""
    regex = re.compile(pattern, re.IGNORECASE)
    # Templates that start with a back-reference re-emit the matched prefix
    # verbatim, so their casing is already preserved.
    if template.startswith('\\'):
        return regex, template

    def _replace(m):
        # The patterns are case-insensitive; without this a sentence-initial
        # "Aqui" would be rewritten as lower-case "aquí".
        return _ps_match_case(m.group(0), m.expand(template))

    return regex, _replace


_PS_FIXES = tuple(_ps_compile_fix(pat, rep) for pat, rep in _PS_RAW_FIXES)

# Leading "eso es todo!" that lacks the opening "¡".
_PS_ESO_ES_TODO_RE = re.compile(r'^eso es todo!(?=\s|$)', re.IGNORECASE)

# A question is bounded to ONE sentence: it starts at the beginning of the
# text or after sentence-ending punctuation plus whitespace, and its body may
# not cross another terminator.  A dot directly followed by a non-space
# character (e.g. "3.5") is not treated as a terminator.
_PS_QUESTION_RE = re.compile(
    r'(?P<pre>^|(?<=[.!?…])\s+)(?P<qbody>(?:[^.!?…]|\.(?=\S))*\?)'
)

# Leading interjections that should be wrapped as "¡...!".
_PS_GREETING_RE = re.compile(
    r'^\s*(?P<w>hola|gracias|genial|perfecto|claro|por supuesto|con gusto'
    r'|listo|vaya|wow|tu puedes|tú puedes|clarín|clarin|clarín cornetas)'
    r'!(?=\s|$)',
    re.IGNORECASE,
)


def _ps_drop_tag(m):
    """Return what should replace a removed persona tag.

    The match swallows the whitespace on both sides of the tag, so a single
    space is put back when the tag sat between two pieces of text; otherwise
    the neighbouring words would be glued together.
    """
    has_left = m.start() > 0
    right = m.string[m.end():m.end() + 1]
    if has_left and right and right not in _PS_CLOSING_PUNCT:
        return ' '
    return ''


def _ps_open_question(m):
    """Prefix a question sentence with "¿" unless it already contains one."""
    body = m.group('qbody')
    if '¿' in body:
        return m.group(0)
    return f"{m.group('pre')}¿{body}"


def polish_spanish(s: str) -> str:
    """Correct and normalize a Spanish sentence for Mori outputs.

    Steps, in order:

    1. NFC-normalize and strip the text.
    2. Remove "[Mori Social]" / "(Mori Técnico)"-style tags.
    3. Apply the ordered table of spelling / accent fixes.
    4. Add the opening "¡" to a leading "eso es todo!".
    5. Add the opening "¿" to each question sentence that lacks it.
    6. Add the opening "¡" to a leading interjection such as "hola!".
    7. Collapse whitespace runs and make sure the text ends with one of
       ``. ! ? …`` (a period is appended otherwise).

    Args:
        s: Raw text to polish.

    Returns:
        The polished text; an empty (or whitespace-only) input yields ``""``.
    """
    s = unicodedata.normalize("NFC", s).strip()

    # Strip again: removing a tag may expose a leading/trailing space that
    # would defeat the "^"-anchored rules below.
    s = _PS_TAG_RE.sub(_ps_drop_tag, s).strip()

    for regex, replacement in _PS_FIXES:
        s = regex.sub(replacement, s)

    s = _PS_ESO_ES_TODO_RE.sub('¡Eso es todo!', s)
    s = _PS_QUESTION_RE.sub(_ps_open_question, s)
    s = _PS_GREETING_RE.sub(r'¡\g<w>!', s)

    s = re.sub(r'\s+', ' ', s).strip()
    if s and s[-1] not in ".!?…":
        s += "."

    return s
|
|
|