Spaces:

habulaj
/

newapi-clone

Paused

App Files Files Community

habulaj committed on Aug 26, 2025

Commit

78be5af

verified ·

1 Parent(s): 2a2b881

Update routers/inference_createposter.py

Browse files

Files changed (1) hide show

routers/inference_createposter.py +51 -13

routers/inference_createposter.py CHANGED Viewed

@@ -52,9 +52,33 @@ def fix_citation_quotes(citation_text: str) -> str:
     # Adicionar as aspas corretas
     return f"“{text.strip()}”"
-def clean_text_content_improved(text: str) -> str:
     """
-    Remove TODAS as tags HTML do texto, incluindo <em>, <strong>, <wiki>, etc.
     Mantém apenas o conteúdo textual limpo.
     """
     if not text:
@@ -63,7 +87,6 @@ def clean_text_content_improved(text: str) -> str:
     import re
     # Remove TODAS as tags HTML usando regex mais ampla
-    # Esta regex captura qualquer tag: <qualquer_coisa>
     text = re.sub(r'<[^>]*>', '', text)
     # Remove possíveis entidades HTML comuns
@@ -72,26 +95,34 @@ def clean_text_content_improved(text: str) -> str:
     return text.strip()
-def fix_url_citation(url: str) -> str:
     """
-    Analisa uma URL e remove TODAS as tags HTML de TODOS os parâmetros de texto,
-    além de aplicar correções específicas para citation
     """
     try:
         # Parse da URL
         parsed_url = urlparse(url)
         query_params = parse_qs(parsed_url.query)
-        # TODOS os parâmetros que podem conter texto e tags HTML
-        text_params = ['text', 'title', 'headline', 'citation']
-        # Processar TODOS os parâmetros de texto
-        for param in text_params:
             if param in query_params and query_params[param]:
                 original_text = query_params[param][0]
-                # Primeiro, limpar todas as tags HTML
-                cleaned_text = clean_text_content_improved(original_text)
                 # Se for citation, aplicar correção específica das aspas
                 if param == 'citation':
@@ -99,6 +130,13 @@ def fix_url_citation(url: str) -> str:
                 query_params[param] = [cleaned_text]
         # Reconstruir a query string
         new_query = urlencode(
             {k: v[0] if isinstance(v, list) and len(v) == 1 else v for k, v in query_params.items()},

     # Adicionar as aspas corretas
     return f"“{text.strip()}”"
+def clean_text_content_for_text_param(text: str) -> str:
     """
+    Clean the content of the 'text' parameter.
+
+    - Strips only <wiki> tags (opening or closing, with any attributes).
+    - Keeps <strong> and <em>.
+    - When the two are directly nested (e.g. <strong><em>), keeps only the
+      second (inner) tag.
+
+    Falsy input (e.g. None or "") is returned unchanged; otherwise the
+    result has leading/trailing whitespace stripped.
+    """
+    if not text:
+        return text
+    import re
+    # First, resolve nested-tag conflicts by keeping the second (inner) tag.
+    # NOTE: no regex flags are passed, so matching is case-sensitive and
+    # `.` does not cross newlines; the non-greedy group stops at the first
+    # closing inner tag.
+    # <strong><em>content</em></strong> -> <em>content</em>
+    text = re.sub(r'<strong>\s*<em>(.*?)</em>\s*</strong>', r'<em>\1</em>', text)
+    # <em><strong>content</strong></em> -> <strong>content</strong>
+    text = re.sub(r'<em>\s*<strong>(.*?)</strong>\s*</em>', r'<strong>\1</strong>', text)
+    # Remove only <wiki> tags.
+    # NOTE(review): the pattern matches any tag whose name starts with
+    # "wiki" (e.g. <wikipedia>), not just <wiki> - confirm this is intended.
+    text = re.sub(r'</?wiki[^>]*>', '', text)
+    return text.strip()
+def clean_text_content_remove_all_tags(text: str) -> str:
+    """
+    Remove TODAS as tags HTML do texto (para headline, title, citation).
     Mantém apenas o conteúdo textual limpo.
     """
     if not text:
     import re
     # Remove TODAS as tags HTML usando regex mais ampla
     text = re.sub(r'<[^>]*>', '', text)
     # Remove possíveis entidades HTML comuns
     return text.strip()
+# Updated function, kept for compatibility with existing code
+def clean_text_content(text: str) -> str:
+    """
+    Backward-compatibility wrapper.
+
+    Delegates to clean_text_content_for_text_param, i.e. applies the
+    cleaning rules used for the 'text' parameter, and returns its result.
     """
+    return clean_text_content_for_text_param(text)
+    """
+    Analisa uma URL e trata os parâmetros de texto de forma específica:
+    - Para 'text': mantém <strong> e <em>, remove <wiki>, resolve conflitos de tags aninhadas
+    - Para 'headline', 'title', 'citation': remove TODAS as tags HTML
     """
     try:
         # Parse da URL
         parsed_url = urlparse(url)
         query_params = parse_qs(parsed_url.query)
+        # Parâmetros que devem ter TODAS as tags removidas
+        clean_all_params = ['headline', 'title', 'citation']
+        # Parâmetros que têm tratamento especial (apenas text)
+        special_text_params = ['text']
+        # Processar parâmetros que devem ser completamente limpos
+        for param in clean_all_params:
             if param in query_params and query_params[param]:
                 original_text = query_params[param][0]
+                cleaned_text = clean_text_content_remove_all_tags(original_text)
                 # Se for citation, aplicar correção específica das aspas
                 if param == 'citation':
                 query_params[param] = [cleaned_text]
+        # Processar parâmetro 'text' com tratamento especial
+        for param in special_text_params:
+            if param in query_params and query_params[param]:
+                original_text = query_params[param][0]
+                cleaned_text = clean_text_content_for_text_param(original_text)
+                query_params[param] = [cleaned_text]
         # Reconstruir a query string
         new_query = urlencode(
             {k: v[0] if isinstance(v, list) and len(v) == 1 else v for k, v in query_params.items()},