Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from bs4 import BeautifulSoup | |
| from markdownify import MarkdownConverter | |
| from playwright.sync_api import sync_playwright | |
def md(soup, **options):
    """Convert a parsed BeautifulSoup node to Markdown.

    Every keyword argument is forwarded unchanged to ``MarkdownConverter``;
    the node itself is handed to the converter's ``convert_soup`` method and
    that method's result is returned.
    """
    converter = MarkdownConverter(**options)
    return converter.convert_soup(soup)
def main_fn(url: str, check: list[str], request: gr.Request) -> str:
    """Render *url* in headless Chromium and return the page as Markdown.

    Args:
        url: Address of the page to fetch. Only ``http://`` and ``https://``
            URLs are accepted.
        check: Tag names that are passed to the converter's ``strip`` option.
        request: The Gradio request; its ``user-agent`` header, when present,
            is forwarded to the browser context.

    Returns:
        The page title, a ``======`` underline, a blank line, and the
        Markdown conversion of the page's ``<main>`` element (falling back to
        ``<body>``, then to the whole document).

    Raises:
        gr.Error: If *url* is empty or does not use the http/https scheme.
    """
    # The URL comes straight from the UI and is opened by a server-side
    # browser, so refuse anything that is not a web URL (e.g. ``file://``
    # would expose local files).
    # NOTE(review): http(s) URLs pointing at internal hosts are still
    # reachable from here; add an allow/deny list if that matters for the
    # deployment.
    url = (url or "").strip()
    if not url.lower().startswith(("http://", "https://")):
        raise gr.Error(
            "URL inválida: informe um endereço começando com http:// ou https://."
        )

    # Reuse the visitor's user agent when available; ``None`` lets Playwright
    # keep its default instead of failing on a missing header or request.
    try:
        user_agent = request.headers["user-agent"]
    except (AttributeError, KeyError, TypeError):
        user_agent = None

    with sync_playwright() as p:
        browser = p.chromium.launch(
            args=[
                "--single-process",
                "--no-zygote",
                "--no-sandbox",
                "--disable-gpu",
                "--disable-dev-shm-usage",
                "--headless=new",
            ]
        )
        try:
            context = browser.new_context(user_agent=user_agent)
            page = context.new_page()
            page.goto(url=url)
            content = page.content()
            title = page.title()
        finally:
            # Close the browser even when navigation or extraction fails.
            browser.close()

    soup = BeautifulSoup(content, features="html.parser")

    # Drop <script> and <style> elements so their contents do not end up in
    # the generated Markdown.
    for tag in soup.find_all(["script", "style"]):
        tag.decompose()

    # Convert the main content; fall back to <body>, then to the whole
    # document when the page has neither element.
    root = soup.find("main") or soup.find("body") or soup

    # ``check`` holds the tags the user chose to strip from the output.
    markdown = md(root, strip=check)
    return f"{title}\n======\n\n{markdown}"
# Gradio UI: a URL text box plus a checkbox group of tags to strip, wired to
# ``main_fn``; the generated Markdown is shown in a copyable text area.
demo = gr.Interface(
    fn=main_fn,
    title="URL para Markdown V2",
    # The description is rendered as HTML, so the literal tag names shown to
    # the user must be entity-escaped; otherwise they become real (empty)
    # <a>/<img> elements and the text disappears.
    description="""
<div style="width: fit-content; margin: 0 auto;">
Este app acessa o HTML da URL informada e converte em Markdown.
Utiliza o Playwright, então funciona com páginas dinâmicas como React.
</div>
<div style="width: fit-content; margin: 0 auto;">
É possível ignorar links (<code>&lt;a&gt;</code>), imagens (<code>&lt;img&gt;</code>) e outros elementos.
</div>""",
    inputs=[
        gr.Text(label="URL", placeholder="https://*****"),
        gr.CheckboxGroup(
            label="Ignorar tags no Markdown gerado",
            choices=["a", "img", "noscript"],
            value=[],
        ),
    ],
    outputs=[
        gr.TextArea(label="Markdown gerado", show_copy_button=True)
    ],
    allow_flagging="never",
)

# Listen on all network interfaces rather than only on localhost.
demo.launch(server_name="0.0.0.0")