File size: 1,317 Bytes
908d904
 
 
7aef82d
908d904
 
 
 
 
 
7aef82d
 
f03ed3b
7aef82d
f03ed3b
 
 
7aef82d
 
 
 
 
908d904
 
 
 
 
 
 
 
 
 
7aef82d
 
 
908d904
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import gradio as gr
import trafilatura
import requests
from markdownify import markdownify as md

def extract(url):
    """Fetch a web page and return its main content as Markdown.

    The page is downloaded with ``requests``, the main body is isolated by
    ``trafilatura`` (HTML output, tables included, recall favoured), and the
    resulting HTML is converted to Markdown with ATX-style (``#``) headings.

    Args:
        url: Address of the page to fetch. Surrounding whitespace is ignored.

    Returns:
        The extracted content as Markdown text. If nothing could be
        extracted, the request timed out, the request failed, or any other
        exception occurred, a Korean user-facing message describing the
        problem is returned instead; exceptions are not propagated.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # Pasted URLs often carry stray whitespace; done inside the try so a
        # non-string input is reported through the generic error message.
        r = requests.get((url or "").strip(), headers=headers, timeout=10)
        r.raise_for_status()
        # Without an explicit charset, requests decodes text/* responses as
        # ISO-8859-1, which garbles non-Latin pages. Fall back to the
        # encoding detected from the body in that case.
        if "charset" not in r.headers.get("Content-Type", "").lower():
            r.encoding = r.apparent_encoding
        # Extract the main content as HTML so structure survives conversion.
        html_content = trafilatura.extract(
            r.text,
            output_format="html",
            include_tables=True,
            favor_recall=True
        )
        if html_content:
            # Convert HTML to Markdown.
            markdown_text = md(html_content, heading_style="ATX")
            return markdown_text
        return "본문을 추출할 수 없습니다."
    except requests.exceptions.Timeout:
        return "요청이 시간 초과되었습니다."
    except requests.exceptions.RequestException as e:
        return f"요청 실패: {e}"
    except Exception as e:
        # UI boundary: surface any unexpected failure as text, not a crash.
        return f"에러 발생: {e}"

iface = gr.Interface(
    fn=extract,
    inputs=gr.Textbox(label="URL ์ž…๋ ฅ", placeholder="https://example.com"),
    outputs=gr.Markdown(label="์ถ”์ถœ๋œ ๋ณธ๋ฌธ"),
    title="๋ณธ๋ฌธ ์ถ”์ถœ๊ธฐ (๋งˆํฌ๋‹ค์šด ์ง€์›)",
    description="์›นํŽ˜์ด์ง€ URL์„ ์ž…๋ ฅํ•˜๋ฉด ๋ฆฌ๋”๋ชจ๋“œ์ฒ˜๋Ÿผ ๊น”๋”ํ•˜๊ฒŒ ๋งˆํฌ๋‹ค์šด์œผ๋กœ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค."
)

if __name__ == "__main__":
    iface.launch()