import logging

import gradio as gr
import requests
import trafilatura
from markdownify import markdownify as md


def extract(url):
    """Fetch a web page and return its main content as Markdown.

    The page is downloaded with ``requests``, the main content is isolated
    with ``trafilatura.extract`` (HTML output, tables included, recall
    favoured) and the resulting HTML fragment is converted to Markdown with
    ATX-style (``#``) headings by ``markdownify``.

    Args:
        url: Address of the page to fetch. Surrounding whitespace is ignored
            and ``https://`` is assumed when the value carries no scheme.

    Returns:
        The extracted Markdown on success. On failure, a short Korean
        message for the user is returned instead; exceptions are not
        propagated to the caller.
    """
    # Tolerate None / blank input and addresses typed without a scheme
    # (e.g. "example.com"), which requests would reject outright.
    url = (url or "").strip()
    if url and "://" not in url:
        url = f"https://{url}"

    # Some sites refuse requests that do not look like they come from a browser.
    headers = {"User-Agent": "Mozilla/5.0"}

    try:
        r = requests.get(url, headers=headers, timeout=10)
        r.raise_for_status()

        # When a text/* response declares no charset, requests decodes
        # ``r.text`` as ISO-8859-1, which garbles non-Latin pages. Use the
        # encoding detected from the body in that case.
        content_type = r.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            r.encoding = r.apparent_encoding

        html_content = trafilatura.extract(
            r.text,
            output_format="html",
            include_tables=True,
            favor_recall=True,
        )

        if html_content:
            return md(html_content, heading_style="ATX")

        # "Could not extract the main content."
        return "본문을 추출할 수 없습니다."
    except requests.exceptions.Timeout:
        # Must precede RequestException, of which Timeout is a subclass.
        # "The request timed out."
        return "요청이 시간 초과되었습니다."
    except requests.exceptions.RequestException as e:
        # "Request failed: ..."
        return f"요청 실패: {e}"
    except Exception as e:
        # UI boundary: keep the traceback in the log, show a short message.
        logging.getLogger(__name__).exception(
            "Unexpected error while extracting %s", url
        )
        # "An error occurred: ..."
        return f"에러 발생: {e}"


iface = gr.Interface( |
|
|
fn=extract, |
|
|
inputs=gr.Textbox(label="URL ์
๋ ฅ", placeholder="https://example.com"), |
|
|
outputs=gr.Markdown(label="์ถ์ถ๋ ๋ณธ๋ฌธ"), |
|
|
title="๋ณธ๋ฌธ ์ถ์ถ๊ธฐ (๋งํฌ๋ค์ด ์ง์)", |
|
|
description="์นํ์ด์ง URL์ ์
๋ ฅํ๋ฉด ๋ฆฌ๋๋ชจ๋์ฒ๋ผ ๊น๋ํ๊ฒ ๋งํฌ๋ค์ด์ผ๋ก ์ถ์ถํฉ๋๋ค." |
|
|
) |
|
|
|
|
|
if __name__ == "__main__":
    # Start the Gradio server only when run as a script, not on import.
    iface.launch()