# moro_text / app.py
# orgoflu's picture
# app.py
# 7aef82d verified
import gradio as gr
import trafilatura
import requests
from markdownify import markdownify as md
def extract(url):
headers = {"User-Agent": "Mozilla/5.0"}
try:
r = requests.get(url, headers=headers, timeout=10)
r.raise_for_status()
# HTML ํ˜•ํƒœ๋กœ ์ถ”์ถœ
html_content = trafilatura.extract(
r.text,
output_format="html",
include_tables=True,
favor_recall=True
)
if html_content:
# HTML โ†’ Markdown ๋ณ€ํ™˜
markdown_text = md(html_content, heading_style="ATX")
return markdown_text
return "๋ณธ๋ฌธ์„ ์ถ”์ถœํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
except requests.exceptions.Timeout:
return "์š”์ฒญ์ด ์‹œ๊ฐ„ ์ดˆ๊ณผ๋˜์—ˆ์Šต๋‹ˆ๋‹ค."
except requests.exceptions.RequestException as e:
return f"์š”์ฒญ ์‹คํŒจ: {e}"
except Exception as e:
return f"์—๋Ÿฌ ๋ฐœ์ƒ: {e}"
iface = gr.Interface(
fn=extract,
inputs=gr.Textbox(label="URL ์ž…๋ ฅ", placeholder="https://example.com"),
outputs=gr.Markdown(label="์ถ”์ถœ๋œ ๋ณธ๋ฌธ"),
title="๋ณธ๋ฌธ ์ถ”์ถœ๊ธฐ (๋งˆํฌ๋‹ค์šด ์ง€์›)",
description="์›นํŽ˜์ด์ง€ URL์„ ์ž…๋ ฅํ•˜๋ฉด ๋ฆฌ๋”๋ชจ๋“œ์ฒ˜๋Ÿผ ๊น”๋”ํ•˜๊ฒŒ ๋งˆํฌ๋‹ค์šด์œผ๋กœ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค."
)
if __name__ == "__main__":
iface.launch()