Spaces:

caarleexx
/

para-Ai-data

Sleeping

caarleexx committed on Jan 6

Commit

60fb770

verified ·

1 Parent(s): 98cec38

Update worker.py

Files changed (1) hide show

worker.py CHANGED Viewed

@@ -187,8 +187,8 @@ class ExtratorUltraSimples:
         valor = re.sub(r'Ocultar Acórdão Atenção: O texto abaixo representa a transcrição de Acórdão. Eventuais imagens serão suprimidas.Recomenda-se acessar o PDF assinado.', '', valor)
         valor = re.sub(r' ---\s*([A-Za-z0-9IVXLCDM]{1,3})\s*+', r' \n\n#### \1 - ', valor)
         valor = re.sub(r'  ', ' *** ', valor)
-        valor = limpar_citacao(valor)
-        valor = remover_tags_repetidas(valor)
         return chave, valor
@@ -207,7 +207,7 @@ class ExtratorUltraSimples:
         #return re.sub(r'  +', ' ', re.sub(r'_\([^)]+\):_|_([^:]+):_', '', texto))
         return re.sub(r'  +', ' ', re.sub(r'_\([^)]+\):_|_([^:]+):\s+', '', texto))
-    def remover_tags_repetidas(t: str, janela=10) -> str:
         w = t.split()
         i = 0
         tag_re = re.compile(r'^_[A-Z0-9_]+:$')
@@ -226,7 +226,7 @@ class ExtratorUltraSimples:
         return ' '.join(w)
-    def limpar_citacao(t: str) -> str:
         return re.sub(
             r'\[([^\]]+)\]',
             lambda m: "[" + re.sub(r'_[A-Z0-9_]+:\s*', '', m.group(1)) + "]",

         valor = re.sub(r'Ocultar Acórdão Atenção: O texto abaixo representa a transcrição de Acórdão. Eventuais imagens serão suprimidas.Recomenda-se acessar o PDF assinado.', '', valor)
         valor = re.sub(r' ---\s*([A-Za-z0-9IVXLCDM]{1,3})\s*+', r' \n\n#### \1 - ', valor)
         valor = re.sub(r'  ', ' *** ', valor)
+        valor = self.limpar_citacao(valor)
+        valor = self.remover_tags_repetidas(valor)
         return chave, valor
         #return re.sub(r'  +', ' ', re.sub(r'_\([^)]+\):_|_([^:]+):_', '', texto))
         return re.sub(r'  +', ' ', re.sub(r'_\([^)]+\):_|_([^:]+):\s+', '', texto))
+    def remover_tags_repetidas(self, t: str, janela=10) -> str:
         w = t.split()
         i = 0
         tag_re = re.compile(r'^_[A-Z0-9_]+:$')
         return ' '.join(w)
+    def limpar_citacao(self, t: str) -> str:
         return re.sub(
             r'\[([^\]]+)\]',
             lambda m: "[" + re.sub(r'_[A-Z0-9_]+:\s*', '', m.group(1)) + "]",