orgoflu committed on
Commit
f6a9bc3
·
verified ·
1 Parent(s): df6d951

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -23
app.py CHANGED
@@ -4,8 +4,17 @@ import requests
4
  from markdownify import markdownify as md
5
  from bs4 import BeautifulSoup
6
  from urllib.parse import urljoin
 
 
 
7
 
8
- def extract(url):
 
 
 
 
 
 
9
  headers = {"User-Agent": "Mozilla/5.0"}
10
  try:
11
  r = requests.get(url, headers=headers, timeout=10)
@@ -20,40 +29,32 @@ def extract(url):
20
  )
21
 
22
  if not html_content:
23
- return "๋ณธ๋ฌธ์„ ์ถ”์ถœํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
24
-
25
- # ์ด๋ฏธ์ง€ ์ ˆ๋Œ€ ๊ฒฝ๋กœ ๋ณ€ํ™˜
26
- soup = BeautifulSoup(r.text, "lxml")
27
- images = []
28
- for img in soup.find_all("img"):
29
- src = img.get("src")
30
- if src:
31
- full_url = urljoin(url, src) # ์ƒ๋Œ€ ๊ฒฝ๋กœ โ†’ ์ ˆ๋Œ€ ๊ฒฝ๋กœ
32
- if full_url.startswith("http"):
33
- images.append(f"![์ด๋ฏธ์ง€]({full_url})")
34
 
35
  # HTML โ†’ Markdown ๋ณ€ํ™˜
36
  markdown_text = md(html_content, heading_style="ATX")
37
 
38
- # ์ด๋ฏธ์ง€๋“ค์„ ๋ณธ๋ฌธ ๋์— ์ถ”๊ฐ€
39
- if images:
40
- markdown_text += "\n\n---\n\n" + "\n\n".join(images)
41
 
42
- return markdown_text
43
 
44
  except requests.exceptions.Timeout:
45
- return "์š”์ฒญ์ด ์‹œ๊ฐ„ ์ดˆ๊ณผ๋˜์—ˆ์Šต๋‹ˆ๋‹ค."
46
  except requests.exceptions.RequestException as e:
47
- return f"์š”์ฒญ ์‹คํŒจ: {e}"
48
  except Exception as e:
49
- return f"์—๋Ÿฌ ๋ฐœ์ƒ: {e}"
50
 
51
  iface = gr.Interface(
52
- fn=extract,
53
  inputs=gr.Textbox(label="URL ์ž…๋ ฅ", placeholder="https://example.com"),
54
- outputs=gr.Markdown(label="์ถ”์ถœ๋œ ๋ณธ๋ฌธ"),
55
- title="๋ณธ๋ฌธ ์ถ”์ถœ๊ธฐ (๋ฆฌ๋”๋ชจ๋“œ + ์ด๋ฏธ์ง€ ํฌํ•จ)",
56
- description="์›นํŽ˜์ด์ง€ URL์„ ์ž…๋ ฅํ•˜๋ฉด ๋ฆฌ๋”๋ชจ๋“œ์ฒ˜๋Ÿผ ๊น”๋”ํ•˜๊ฒŒ ๋งˆํฌ๋‹ค์šด์œผ๋กœ ์ถ”์ถœํ•˜๊ณ , ๋ณธ๋ฌธ ์† ์ด๋ฏธ์ง€๋„ ํ•จ๊ป˜ ํ‘œ์‹œํ•ฉ๋‹ˆ๋‹ค."
 
 
 
57
  )
58
 
59
  if __name__ == "__main__":
 
4
  from markdownify import markdownify as md
5
  from bs4 import BeautifulSoup
6
  from urllib.parse import urljoin
7
+ from sumy.parsers.plaintext import PlaintextParser
8
+ from sumy.nlp.tokenizers import Tokenizer
9
+ from sumy.summarizers.text_rank import TextRankSummarizer
10
 
11
def summarize_text(text, sentence_count=3):
    """Summarize *text* into at most *sentence_count* sentences via TextRank.

    The input is parsed as plain text with sumy's Korean tokenizer
    ("korean"); the highest-ranked sentences are returned joined by
    newlines, in the order sumy yields them.
    """
    document = PlaintextParser.from_string(text, Tokenizer("korean")).document
    ranked_sentences = TextRankSummarizer()(document, sentence_count)
    lines = [str(sentence) for sentence in ranked_sentences]
    return "\n".join(lines)
16
+
17
+ def extract_and_summarize(url):
18
  headers = {"User-Agent": "Mozilla/5.0"}
19
  try:
20
  r = requests.get(url, headers=headers, timeout=10)
 
29
  )
30
 
31
  if not html_content:
32
+ return "๋ณธ๋ฌธ์„ ์ถ”์ถœํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.", ""
 
 
 
 
 
 
 
 
 
 
33
 
34
  # HTML โ†’ Markdown ๋ณ€ํ™˜
35
  markdown_text = md(html_content, heading_style="ATX")
36
 
37
+ # ์š”์•ฝ ์ƒ์„ฑ
38
+ summary = summarize_text(markdown_text, sentence_count=3)
 
39
 
40
+ return markdown_text, summary
41
 
42
  except requests.exceptions.Timeout:
43
+ return "์š”์ฒญ์ด ์‹œ๊ฐ„ ์ดˆ๊ณผ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.", ""
44
  except requests.exceptions.RequestException as e:
45
+ return f"์š”์ฒญ ์‹คํŒจ: {e}", ""
46
  except Exception as e:
47
+ return f"์—๋Ÿฌ ๋ฐœ์ƒ: {e}", ""
48
 
49
  iface = gr.Interface(
50
+ fn=extract_and_summarize,
51
  inputs=gr.Textbox(label="URL ์ž…๋ ฅ", placeholder="https://example.com"),
52
+ outputs=[
53
+ gr.Markdown(label="์ถ”์ถœ๋œ ๋ณธ๋ฌธ"),
54
+ gr.Textbox(label="์ž๋™ ์š”์•ฝ", lines=5)
55
+ ],
56
+ title="๋ณธ๋ฌธ ์ถ”์ถœ๊ธฐ + ์ž๋™ ์š”์•ฝ",
57
+ description="์›นํŽ˜์ด์ง€ URL์„ ์ž…๋ ฅํ•˜๋ฉด ๋ณธ๋ฌธ์„ ์ถ”์ถœํ•˜๊ณ , TextRank ์•Œ๊ณ ๋ฆฌ์ฆ˜์œผ๋กœ 3๋ฌธ์žฅ ์š”์•ฝ์„ ์ œ๊ณตํ•ฉ๋‹ˆ๋‹ค."
58
  )
59
 
60
  if __name__ == "__main__":