Spaces:

habulaj
/

newapi-clone

Paused

App Files Files Community

habulaj committed on Aug 26, 2025

Commit

2a2b881

verified ·

1 Parent(s): b77c8dd

Update routers/inference_createposter.py

Browse files

Files changed (1) hide show

routers/inference_createposter.py +22 -43

routers/inference_createposter.py CHANGED Viewed

@@ -52,74 +52,53 @@ def fix_citation_quotes(citation_text: str) -> str:
     # Adicionar as aspas corretas
     return f"“{text.strip()}”"
def clean_text_content(text: str) -> str:
    """Strip every HTML tag from *text* except paired <strong> and <em>.

    Directly nested emphasis is collapsed first, keeping the inner tag:
    ``<strong><em>x</em></strong>`` becomes ``<em>x</em>`` and
    ``<em><strong>x</strong></em>`` becomes ``<strong>x</strong>``.
    Paired <strong>/<em> spans are then swapped for placeholders, every
    remaining tag is deleted, and the protected spans are put back.
    The content of a protected span is kept verbatim.

    Falsy input (``""`` or ``None``) is returned unchanged; otherwise the
    result has leading and trailing whitespace removed.
    """
    if not text:
        return text

    import re

    # Collapse directly nested emphasis; the inner tag wins.
    text = re.sub(r'<strong>\s*<em>(.*?)</em>\s*</strong>', r'<em>\1</em>', text)
    text = re.sub(r'<em>\s*<strong>(.*?)</strong>\s*</em>', r'<strong>\1</strong>', text)

    protected_tags = []

    def protect_tag(match):
        # Remember the whole matched span and leave an indexed placeholder.
        index = len(protected_tags)
        protected_tags.append(match.group(0))
        return f"__PROTECTED_TAG_{index}__"

    # Hide the allowed spans so the catch-all removal below cannot touch them.
    text = re.sub(r'<strong>(.*?)</strong>', protect_tag, text)
    text = re.sub(r'<em>(.*?)</em>', protect_tag, text)

    # Drop every tag that is still visible.
    text = re.sub(r'<[^>]+>', '', text)

    # Restore from the highest index down: a span protected later may contain
    # the placeholder of a span protected earlier (e.g. an <em> wrapping a
    # <strong>), so the outer span must be expanded before the inner
    # placeholder can be found and replaced.
    for index in range(len(protected_tags) - 1, -1, -1):
        text = text.replace(f"__PROTECTED_TAG_{index}__", protected_tags[index])

    return text.strip()
 def fix_url_citation(url: str) -> str:
     """
-    Analisa uma URL e corrige o parâmetro citation se presente,
-    além de limpar outros parâmetros de texto
     """
     try:
         # Parse da URL
         parsed_url = urlparse(url)
         query_params = parse_qs(parsed_url.query)
-        # Parâmetros que precisam de limpeza geral (exceto citation)
-        text_params = ['text', 'title', 'headline']
-        # Processar parâmetros de texto
         for param in text_params:
             if param in query_params and query_params[param]:
                 original_text = query_params[param][0]
-                cleaned_text = clean_text_content(original_text)
                 query_params[param] = [cleaned_text]
-        # Processar citation especialmente
-        if 'citation' in query_params and query_params['citation']:
-            original_citation = query_params['citation'][0]
-            fixed_citation = fix_citation_quotes(original_citation)
-            query_params['citation'] = [fixed_citation]
         # Reconstruir a query string
         new_query = urlencode(
             {k: v[0] if isinstance(v, list) and len(v) == 1 else v for k, v in query_params.items()},

     # Adicionar as aspas corretas
     return f"“{text.strip()}”"
def clean_text_content_improved(text: str) -> str:
    """Return *text* with every HTML tag removed and HTML entities decoded.

    Entities are decoded before the tags are stripped, so markup that
    arrives entity-encoded (``&lt;em&gt;x&lt;/em&gt;``) is removed exactly
    like literal markup instead of being turned back into tags in the
    result. Decoding is a single pass, so ``&amp;quot;`` yields the text
    ``&quot;`` rather than a double-decoded quote character.

    Falsy input (``""`` or ``None``) is returned unchanged; otherwise the
    result has leading and trailing whitespace removed.
    """
    if not text:
        return text

    import html
    import re

    # Decode first: stripping before decoding would let encoded tags through.
    text = html.unescape(text)

    # Remove anything of the form <...>, whatever the tag name is.
    text = re.sub(r'<[^>]*>', '', text)

    return text.strip()
 def fix_url_citation(url: str) -> str:
     """
+    Analisa uma URL e remove TODAS as tags HTML de TODOS os parâmetros de texto,
+    além de aplicar correções específicas para citation
     """
     try:
         # Parse da URL
         parsed_url = urlparse(url)
         query_params = parse_qs(parsed_url.query)
+        # TODOS os parâmetros que podem conter texto e tags HTML
+        text_params = ['text', 'title', 'headline', 'citation']
+        # Processar TODOS os parâmetros de texto
         for param in text_params:
             if param in query_params and query_params[param]:
                 original_text = query_params[param][0]
+                # Primeiro, limpar todas as tags HTML
+                cleaned_text = clean_text_content_improved(original_text)
+                # Se for citation, aplicar correção específica das aspas
+                if param == 'citation':
+                    cleaned_text = fix_citation_quotes(cleaned_text)
                 query_params[param] = [cleaned_text]
         # Reconstruir a query string
         new_query = urlencode(
             {k: v[0] if isinstance(v, list) and len(v) == 1 else v for k, v in query_params.items()},