orgoflu committed on
Commit
7aef82d
·
verified ·
1 Parent(s): f03ed3b
Files changed (1) hide show
  1. app.py +12 -8
app.py CHANGED
@@ -1,21 +1,25 @@
1
  import gradio as gr
2
  import trafilatura
3
  import requests
 
4
 
5
  def extract(url):
6
  headers = {"User-Agent": "Mozilla/5.0"}
7
  try:
8
  r = requests.get(url, headers=headers, timeout=10)
9
  r.raise_for_status()
10
- # full_text=True → 가능한 모든 텍스트 추출
11
- text = trafilatura.extract(
12
  r.text,
13
- include_comments=False,
14
  include_tables=True,
15
- no_fallback=False,
16
  favor_recall=True
17
  )
18
- return text or "본문을 추출할 수 없습니다."
 
 
 
 
19
  except requests.exceptions.Timeout:
20
  return "요청이 시간 초과되었습니다."
21
  except requests.exceptions.RequestException as e:
@@ -26,9 +30,9 @@ def extract(url):
26
  iface = gr.Interface(
27
  fn=extract,
28
  inputs=gr.Textbox(label="URL 입력", placeholder="https://example.com"),
29
- outputs=gr.Textbox(label="추출된 본문", lines=30),
30
- title="본문 추출기",
31
- description="์›นํŽ˜์ด์ง€ URL์„ ์ž…๋ ฅํ•˜๋ฉด ๊ฐ€๋Šฅํ•œ ๋งŽ์€ ๋ณธ๋ฌธ์„ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค."
32
  )
33
 
34
  if __name__ == "__main__":
 
1
  import gradio as gr
2
  import trafilatura
3
  import requests
4
+ from markdownify import markdownify as md
5
 
6
  def extract(url):
7
  headers = {"User-Agent": "Mozilla/5.0"}
8
  try:
9
  r = requests.get(url, headers=headers, timeout=10)
10
  r.raise_for_status()
11
+ # HTML 형태로 추출
12
+ html_content = trafilatura.extract(
13
  r.text,
14
+ output_format="html",
15
  include_tables=True,
 
16
  favor_recall=True
17
  )
18
+ if html_content:
19
+ # HTML → Markdown 변환
20
+ markdown_text = md(html_content, heading_style="ATX")
21
+ return markdown_text
22
+ return "본문을 추출할 수 없습니다."
23
  except requests.exceptions.Timeout:
24
  return "요청이 시간 초과되었습니다."
25
  except requests.exceptions.RequestException as e:
 
30
  iface = gr.Interface(
31
  fn=extract,
32
  inputs=gr.Textbox(label="URL 입력", placeholder="https://example.com"),
33
+ outputs=gr.Markdown(label="추출된 본문"),
34
+ title="본문 추출기 (마크다운 지원)",
35
+ description="์›นํŽ˜์ด์ง€ URL์„ ์ž…๋ ฅํ•˜๋ฉด ๋ฆฌ๋”๋ชจ๋“œ์ฒ˜๋Ÿผ ๊น”๋”ํ•˜๊ฒŒ ๋งˆํฌ๋‹ค์šด์œผ๋กœ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค."
36
  )
37
 
38
  if __name__ == "__main__":