Spaces:

13ze
/

complex-html-to-markdown-llm

Sleeping

App Files Files Community

complex-html-to-markdown-llm / app.py

13ze

Update app.py

2442fcd verified 9 months ago

raw

history blame contribute delete

13.7 kB

	# -*- coding: utf-8 -*-
	import gradio as gr
	import html2text
	from bs4 import BeautifulSoup, Comment
	import logging
	import re

	# Configure the root logger at INFO level so the progress messages emitted
	# by the extraction and conversion steps below are actually output.
	logging.basicConfig(level=logging.INFO)

	# CSS selectors tried, in order of preference, to locate the post body.
	_SELETORES_CONTEUDO_PRINCIPAL = (
	    '.entry-content',          # most likely container for the post body
	    '.wp-block-post-content',  # alternative used by block themes
	    'article',                 # fallback
	    'main',                    # broader fallback
	)

	# Blocks that live next to the post body and must not be treated as content.
	_SELETORES_IRMAOS_INDESEJADOS = (
	    '.wp-block-post-terms',                         # tag list
	    '.wp-container-core-group-is-layout-9b36172e',  # prev/next navigation wrapper
	    '.wp-block-comments',                           # whole comments block
	    '.wp-block-query',                              # "more posts" query loop
	)

	# Tags removed together with everything they contain.
	# NOTE(review): removing <figure> also removes any <img>/<figcaption> nested
	# inside it, so the 'figcaption' entry in the allowed-tag set below only ever
	# applies to captions that are not inside a <figure>.
	_TAGS_PARA_REMOVER = [
	    'script', 'style', 'form', 'input', 'button', 'select', 'textarea', 'label',
	    'footer', 'header', 'nav', 'aside', 'iframe', 'noscript', 'meta', 'link',
	    'canvas', 'svg', 'audio', 'video', 'figure',
	]

	# Tags kept as-is; every other tag is unwrapped (its children are kept).
	_TAGS_PERMITIDAS = frozenset({
	    'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'br', 'a', 'strong', 'b',
	    'em', 'i', 'u', 's', 'strike', 'del', 'ul', 'ol', 'li', 'img',
	    'table', 'thead', 'tbody', 'tr', 'th', 'td', 'blockquote', 'pre', 'code',
	    'figcaption',
	})

	# Attributes kept per tag; tags not listed here lose all their attributes.
	_ATRIBUTOS_PERMITIDOS = {
	    'a': ['href', 'title'],
	    'img': ['src', 'alt', 'title', 'width', 'height'],
	    'th': ['colspan', 'rowspan', 'scope'],
	    'td': ['colspan', 'rowspan'],
	    'blockquote': ['cite'],
	    'ol': ['start'],
	    'pre': [],
	    'code': ['class'],  # keeps e.g. class="language-python" for highlighting
	}


	def _localizar_conteudo_principal(soup):
	    """Return ``(target_element, main_container)`` for a parsed document.

	    ``target_element`` is the first match of the preferred content selectors.
	    ``main_container`` is the ancestor whose children are scanned for unwanted
	    sibling blocks. Both are ``None`` when the document has nothing to process.
	    """
	    for selector in _SELETORES_CONTEUDO_PRINCIPAL:
	        target_element = soup.select_one(selector)
	        if target_element is None:
	            continue

	        logging.info(f"Conteúdo principal identificado usando o seletor: '{selector}'")

	        # Walk up at most three <div>/<section> ancestors looking for one that
	        # also holds the unwanted blocks, so they can be found as siblings.
	        main_container = None
	        candidate = target_element.parent
	        levels_up = 0
	        while candidate is not None and candidate.name in ('div', 'section') and levels_up < 3:
	            if candidate.select_one('.wp-block-post-terms, .wp-block-comments, .wp-block-query'):
	                main_container = candidate
	                logging.info(f"Container principal para busca de irmãos definido como: <{main_container.name}>")
	                break
	            candidate = candidate.parent
	            levels_up += 1

	        # No such ancestor: fall back to the direct parent.
	        if main_container is None:
	            main_container = target_element.parent
	            if main_container is not None:
	                logging.info(f"Container principal para busca de irmãos definido como pai direto: <{main_container.name}>")

	        return target_element, main_container

	    logging.warning("Nenhum seletor de conteúdo principal específico (.entry-content, article, main) encontrado.")
	    if soup.body is not None:
	        logging.info("Usando <body> como target_element e main_container.")
	        return soup.body, soup.body

	    # html.parser does not synthesise <body>, so a pasted fragment such as
	    # "<p>text</p>" ends up here; process the whole parsed document instead of
	    # discarding it.
	    if soup.contents:
	        logging.info("Usando o documento inteiro como target_element e main_container (fragmento sem <body>).")
	        return soup, soup

	    logging.error("Falha crítica: Nenhum elemento de conteúdo ou body encontrado.")
	    return None, None


	def _seletor_correspondente(elemento, seletores):
	    """Return the first selector matched by ``elemento`` itself or by one of
	    its descendants, or ``None`` when no selector matches."""
	    pai = elemento.parent
	    for seletor in seletores:
	        # select()/select_one() called on a tag only look at its descendants,
	        # so the element itself is tested by selecting from its parent and
	        # comparing by identity.
	        casa_proprio = pai is not None and any(m is elemento for m in pai.select(seletor))
	        if casa_proprio or elemento.select_one(seletor) is not None:
	            return seletor
	    return None


	def _remover_irmaos_indesejados(target_element, main_container):
	    """Decompose unwanted blocks found beside ``target_element``.

	    Scans the direct children of ``main_container`` and the following siblings
	    of ``target_element``. The target and its ancestors are never removed.
	    Returns the number of elements removed.
	    """
	    ancestrais = list(target_element.parents)
	    removidos = 0

	    def remover_se_indesejado(elemento, rotulo):
	        seletor = _seletor_correspondente(elemento, _SELETORES_IRMAOS_INDESEJADOS)
	        if seletor is None:
	            return 0
	        # Read name/classes before decompose(), which wipes the element.
	        classes = ' '.join(elemento.get('class', []))
	        logging.info(f" Removendo {rotulo} indesejado: <{elemento.name} class='{classes}'> (match com '{seletor}')")
	        elemento.decompose()
	        return 1

	    # find_all() returns a list, so decomposing while iterating is safe.
	    for filho in main_container.find_all(True, recursive=False):
	        if filho is target_element or any(anc is filho for anc in ancestrais):
	            continue
	        removidos += remover_se_indesejado(filho, "irmão/elemento")

	    # Complementary pass: when main_container is a higher ancestor, the
	    # target's own following siblings were not covered by the loop above.
	    for irmao in target_element.find_next_siblings():
	        removidos += remover_se_indesejado(irmao, "irmão SEGUINTE")

	    return removidos


	def _limpar_conteudo(target_element):
	    """Strip unwanted tags and attributes inside ``target_element`` in place."""
	    logging.info(f"Iniciando limpeza geral DENTRO do target_element: <{target_element.name}>")

	    removidas = 0
	    for tag in target_element.find_all(_TAGS_PARA_REMOVER):
	        # A tag nested inside one already decomposed is gone with its parent.
	        if getattr(tag, 'decomposed', False):
	            continue
	        tag.decompose()
	        removidas += 1
	    if removidas > 0:
	        logging.info(f"Removidas {removidas} tags gerais indesejadas dentro do target_element.")

	    for tag in target_element.find_all(True):
	        if tag.parent is None:
	            continue  # already detached from the tree
	        if tag.name not in _TAGS_PERMITIDAS:
	            tag.unwrap()  # drop the tag, keep its children
	            continue
	        permitidos = _ATRIBUTOS_PERMITIDOS.get(tag.name, ())
	        # Whitelist order puts href / src / alt first when present.
	        tag.attrs = {nome: tag.attrs[nome] for nome in permitidos if nome in tag.attrs}


	def extrair_limpar_html_v5(html_bruto):
	    """Extract the main post content from raw HTML and return it cleaned.

	    Steps: drop HTML comments; locate the main content element (preferring
	    ``.entry-content``, then ``.wp-block-post-content``, ``article``, ``main``,
	    ``<body>`` and finally the whole parsed fragment); remove unwanted sibling
	    blocks (tags, navigation, comments, related posts); remove non-content tags
	    inside the content; unwrap tags that are not allowed and strip attributes
	    that are not allowed.

	    :param html_bruto: String with the original HTML.
	    :return: String with the cleaned HTML of the main content, or ``""`` when
	        the input is empty or contains nothing to process.
	    """
	    if not html_bruto:
	        return ""

	    soup = BeautifulSoup(html_bruto, 'html.parser')

	    # --- 0. Remove HTML comments ---
	    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
	        comment.extract()

	    # --- 1. Locate the main content element and its container ---
	    target_element, main_container = _localizar_conteudo_principal(soup)
	    if target_element is None:
	        return ""  # nothing to process

	    # --- 2. Remove unwanted siblings (only when a distinct container exists) ---
	    if main_container is None:
	        logging.warning("Não foi possível determinar um container válido para remover irmãos.")
	    elif target_element is not main_container:
	        logging.info(f"Procurando irmãos indesejados de <{target_element.name}> dentro de <{main_container.name}>...")
	        removed_siblings_count = _remover_irmaos_indesejados(target_element, main_container)
	        if removed_siblings_count > 0:
	            logging.info(f"Removidos {removed_siblings_count} elementos/irmãos indesejados.")
	        else:
	            logging.info("Nenhum elemento/irmão indesejado conhecido foi encontrado ou removido após o conteúdo principal.")

	    # --- 3./4. Remove non-content tags, unwrap the rest, strip attributes ---
	    _limpar_conteudo(target_element)

	    # Normalise non-breaking spaces (U+00A0) to plain spaces. Written as an
	    # escape so the literal cannot silently degrade into a space-for-space
	    # no-op when the file is copied or re-encoded.
	    html_final = str(target_element).replace('\xa0', ' ')

	    # Drop <div>s left with neither text nor an image/line break.
	    soup_final = BeautifulSoup(html_final, 'html.parser')
	    for div in soup_final.find_all('div'):
	        if not div.get_text(strip=True) and not div.find(['img', 'br']):
	            div.decompose()
	    html_final = str(soup_final)

	    logging.info("Limpeza final do HTML concluída.")
	    return html_final


	def _pos_processar_markdown(markdown):
	    """Normalise converter output: one block per non-empty line, separated by
	    a blank line, with bare list markers dropped and space runs collapsed."""
	    linhas = (linha.strip() for linha in markdown.splitlines())
	    # A line holding only a bullet character is an empty list item left over
	    # from the conversion ('*' is html2text's default bullet). Filtering here
	    # is required: once lines are stripped and joined with exactly two
	    # newlines, a regex expecting "marker + whitespace + blank line" can no
	    # longer match anything.
	    linhas_filtradas = [linha for linha in linhas if linha and linha not in ('-', '+', '*')]
	    resultado = "\n\n".join(linhas_filtradas)
	    resultado = re.sub(r' +', ' ', resultado)  # collapse runs of spaces
	    return resultado.strip()


	def html_para_markdown_final_v5(html_input):
	    """Run the full V5 pipeline: extract and clean the main content, convert
	    it to Markdown and post-process the result.

	    :param html_input: String with the raw HTML of the page.
	    :return: The resulting Markdown. A user-facing message is returned instead
	        when the input is empty, when nothing textual survives the cleaning,
	        or when any step raises (the exception is logged, not propagated).
	    """
	    if not html_input:
	        return "Por favor, insira algum código HTML."

	    # Defined before the try block so the error handler can always report it.
	    html_processado = None
	    try:
	        # 1. Extract the main content, drop unwanted siblings, clean the HTML.
	        logging.info("--- Iniciando Extração e Limpeza V5 ---")
	        html_processado = extrair_limpar_html_v5(html_input)
	        logging.info("--- Extração e Limpeza V5 Concluída ---")

	        # Only parse for the text check when there is something to parse.
	        if not html_processado or not BeautifulSoup(html_processado, 'html.parser').get_text(strip=True):
	            logging.warning("HTML resultante V5 após limpeza está vazio ou sem texto.")
	            return "HTML resultante após extração e limpeza está vazio ou não contém texto."

	        # 2. Convert the cleaned HTML to Markdown.
	        logging.info("--- Iniciando Conversão para Markdown V5 ---")
	        converter = html2text.HTML2Text()
	        converter.body_width = 0  # do not hard-wrap lines
	        converter.ignore_links = False
	        converter.ignore_images = False
	        converter.ignore_emphasis = False
	        converter.use_automatic_links = True
	        converter.unicode_snob = True
	        converter.escape_snob = True
	        markdown_output = converter.handle(html_processado)
	        logging.info("--- Conversão para Markdown V5 Concluída ---")

	        # 3. Post-process the Markdown.
	        logging.info("--- Iniciando Pós-processamento do Markdown V5 ---")
	        markdown_output = _pos_processar_markdown(markdown_output)
	        logging.info("--- Pós-processamento do Markdown V5 Concluído ---")
	        return markdown_output

	    except Exception as e:
	        # Top-level UI boundary: report the failure to the user instead of
	        # letting it propagate, and keep the traceback in the logs.
	        logging.error(f"Erro durante o processo V5: {e}", exc_info=True)
	        html_on_error = html_processado if html_processado is not None else "(HTML não disponível)"
	        return (f"Ocorreu um erro V5: {str(e)}\n\n"
	                f"Verifique os logs do Space.\n\n"
	                f"HTML processado antes do erro:\n"
	                f"{html_on_error[:2000]}...")


	# --- Build the Gradio interface ---
	# Input: raw page HTML pasted by the user.
	_caixa_entrada_html = gr.Textbox(
	    lines=20,
	    label="Insira o HTML bruto aqui",
	    placeholder="Cole o código-fonte HTML completo da página...",
	)

	# Output: the resulting Markdown, with a copy button.
	_caixa_saida_markdown = gr.Textbox(
	    lines=20,
	    label="Markdown Resultante (Conteúdo Principal Limpo - V5)",
	    show_copy_button=True,
	)

	iface = gr.Interface(
	    fn=html_para_markdown_final_v5,  # V5 pipeline entry point
	    inputs=_caixa_entrada_html,
	    outputs=_caixa_saida_markdown,
	    title="Conversor HTML para Markdown (V5 - Específico para Estrutura WP)",
	    description="Cole o HTML. O script tenta isolar '.entry-content', remove tags/comentários/relacionados/nav que vêm depois dele, limpa o HTML restante e converte para Markdown (formatação V2/V4).",
	    allow_flagging='never',
	)

	# --- Launch the interface only when run as a script ---
	if __name__ == "__main__":
	    iface.launch()