orgoflu committed on
Commit
00e61ba
·
verified ·
1 Parent(s): 0dc5312

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +47 -49
app.py CHANGED
@@ -1,67 +1,65 @@
1
- # app.py
2
- # HTML 파싱 → 자동요약(TextRank)만 수행
 
 
3
 
4
- import requests
5
- import trafilatura
6
  import gradio as gr
 
 
7
  from markdownify import markdownify as md
8
-
9
  from sumy.parsers.plaintext import PlaintextParser
10
  from sumy.nlp.tokenizers import Tokenizer
11
  from sumy.summarizers.text_rank import TextRankSummarizer
12
 
13
- # ===== 자동요약(TextRank) =====
14
- def auto_summarize(text: str, n_sentences: int = 3) -> str:
 
 
 
 
 
 
 
15
  try:
16
- parser = PlaintextParser.from_string(text, Tokenizer("korean"))
17
- summarizer = TextRankSummarizer()
18
- sents = [str(s) for s in summarizer(parser.document, n_sentences)]
19
- summary = " ".join(sents).strip()
20
- return summary or text[:500]
21
- except Exception:
22
- return text[:500]
 
 
 
 
 
 
23
 
24
- # ===== URL 처리 =====
25
- def process_url(url: str):
26
- # 1) HTML 파싱
27
- r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
28
- r.raise_for_status()
29
- plain = trafilatura.extract(
30
- r.text,
31
- output_format="txt",
32
- include_tables=False,
33
- include_comments=False,
34
- favor_recall=True
35
- ) or ""
36
- html = trafilatura.extract(
37
- r.text,
38
- output_format="html",
39
- include_tables=False,
40
- include_comments=False,
41
- favor_recall=True
42
- )
43
- md_preview = md(html or r.text, heading_style="ATX")
44
 
45
- # 2) 자동요약
46
- auto_sum = auto_summarize(plain, n_sentences=3)
47
 
48
- # 3) 결과 반환
49
- link_html = f'<a href="{url}" target="_blank">원문 보기</a>'
50
- return link_html + "<br><br>" + md_preview, auto_sum
 
 
 
 
 
51
 
52
- # ===== Gradio UI =====
53
  iface = gr.Interface(
54
- fn=process_url,
55
- inputs=[
56
- gr.Textbox(label="URL 입력", placeholder="https://n.news.naver.com/...")
57
- ],
58
  outputs=[
59
- gr.HTML(label="원문 링크 + 본문 미리보기"),
60
- gr.Textbox(label="자동요약", lines=4)
61
  ],
62
- title="HTML 파싱 → 자동요약",
63
- description="HTML์—์„œ ๋ณธ๋ฌธ์„ ์ถ”์ถœํ•˜๊ณ  TextRank๋กœ ์ž๋™์š”์•ฝ๋งŒ ์ˆ˜ํ–‰ํ•ฉ๋‹ˆ๋‹ค."
64
  )
65
 
66
  if __name__ == "__main__":
67
- iface.launch()
 
1
+ import nltk
2
+ # NLTK 토크나이저 리소스 자동 다운로드
3
+ nltk.download("punkt")
4
+ nltk.download("punkt_tab")
5
 
 
 
6
  import gradio as gr
7
+ import trafilatura
8
+ import requests
9
  from markdownify import markdownify as md
 
10
  from sumy.parsers.plaintext import PlaintextParser
11
  from sumy.nlp.tokenizers import Tokenizer
12
  from sumy.summarizers.text_rank import TextRankSummarizer
13
 
14
+ def summarize_text(text, sentence_count=3):
15
+ # 한국어도 문장 단위로 끊기 위해 english 토크나이저 사용
16
+ parser = PlaintextParser.from_string(text, Tokenizer("english"))
17
+ summarizer = TextRankSummarizer()
18
+ summary_sentences = summarizer(parser.document, sentence_count)
19
+ return "\n".join(str(sentence) for sentence in summary_sentences)
20
+
21
+ def extract_and_summarize(url):
22
+ headers = {"User-Agent": "Mozilla/5.0"}
23
  try:
24
+ r = requests.get(url, headers=headers, timeout=10)
25
+ r.raise_for_status()
26
+
27
+ # HTML 형태로 본문 추출
28
+ html_content = trafilatura.extract(
29
+ r.text,
30
+ output_format="html",
31
+ include_tables=True,
32
+ favor_recall=True
33
+ )
34
+
35
+ if not html_content:
36
+ return "본문을 추출할 수 없습니다.", ""
37
 
38
+ # HTML → Markdown 변환
39
+ markdown_text = md(html_content, heading_style="ATX")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
+ # 요약 생성
42
+ summary = summarize_text(markdown_text, sentence_count=3)
43
 
44
+ return markdown_text, summary
45
+
46
+ except requests.exceptions.Timeout:
47
+ return "요청이 시간 초과되었습니다.", ""
48
+ except requests.exceptions.RequestException as e:
49
+ return f"요청 실패: {e}", ""
50
+ except Exception as e:
51
+ return f"에러 발생: {e}", ""
52
 
 
53
  iface = gr.Interface(
54
+ fn=extract_and_summarize,
55
+ inputs=gr.Textbox(label="URL 입력", placeholder="https://example.com"),
 
 
56
  outputs=[
57
+ gr.Markdown(label="추출된 본문"),
58
+ gr.Textbox(label="자동 요약", lines=5)
59
  ],
60
+ title="본문 추출기 + 자동 요약",
61
+ description="์›นํŽ˜์ด์ง€ URL์„ ์ž…๋ ฅํ•˜๋ฉด ๋ณธ๋ฌธ์„ ์ถ”์ถœํ•˜๊ณ , TextRank ์•Œ๊ณ ๋ฆฌ์ฆ˜์œผ๋กœ 3๋ฌธ์žฅ ์š”์•ฝ์„ ์ œ๊ณตํ•ฉ๋‹ˆ๋‹ค."
62
  )
63
 
64
  if __name__ == "__main__":
65
+ iface.launch()