orgoflu committed on
Commit
bba79ad
·
verified ·
1 Parent(s): fcd51b7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +88 -9
app.py CHANGED
@@ -4,14 +4,32 @@ import trafilatura
4
  from bs4 import BeautifulSoup
5
  import gradio as gr
6
 
 
 
 
 
 
 
 
7
  DEFAULT_HEADERS = {"User-Agent": "Mozilla/5.0"}
8
 
 
 
 
9
  def fetch_html(url: str, timeout: int = 12) -> str:
10
  r = requests.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
11
  r.raise_for_status()
12
  return r.text
13
 
 
 
 
14
  def html_to_text_preserve_code(html: str) -> str:
 
 
 
 
 
15
  extracted = trafilatura.extract(
16
  html,
17
  output_format="html",
@@ -23,7 +41,7 @@ def html_to_text_preserve_code(html: str) -> str:
23
 
24
  soup = BeautifulSoup(extracted, "html.parser")
25
 
26
- # <pre><code> 및 <pre>를 fenced code block으로
27
  for pre in soup.find_all("pre"):
28
  code_tag = pre.find("code")
29
  code_text = code_tag.get_text() if code_tag else pre.get_text()
@@ -31,55 +49,116 @@ def html_to_text_preserve_code(html: str) -> str:
31
  code_text = re.sub(r"\n{3,}", "\n\n", code_text).strip("\n")
32
  pre.replace_with(f"\n```\n{code_text}\n```\n")
33
 
34
- # inline <code> → `...`
35
  for c in soup.find_all("code"):
36
  c_text = c.get_text().replace("`", "\\`")
37
  c.replace_with(f"`{c_text}`")
38
 
39
- # 태그 제거 + 줄바꿈 유지
40
  text_output = soup.get_text("\n")
41
  text_output = re.sub(r"\n{3,}", "\n\n", text_output).strip()
42
  return text_output
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  def handle_html(url: str) -> str:
45
  url = (url or "").strip()
46
  if not url:
47
  return "โŒ URL์„ ์ž…๋ ฅํ•˜์„ธ์š”."
48
  try:
49
- html = fetch_html(url)
50
- return html
51
  except Exception as e:
52
  return f"์—๋Ÿฌ: {e}"
53
 
54
  def handle_text(url: str) -> str:
 
 
 
 
 
 
 
 
 
 
55
  url = (url or "").strip()
56
  if not url:
57
  return "โŒ URL์„ ์ž…๋ ฅํ•˜์„ธ์š”."
58
  try:
59
  html = fetch_html(url)
60
  text = html_to_text_preserve_code(html)
61
- return text
 
 
 
 
 
62
  except Exception as e:
63
  return f"์—๋Ÿฌ: {e}"
64
 
 
 
 
65
  with gr.Blocks(css="""
66
  #container { max-width: 920px; margin: 0 auto; }
67
  .small { color:#666; font-size:14px; }
68
  """) as demo:
69
- gr.Markdown("## ๋งํฌ ์ž…๋ ฅ โ†’ ์›๋ณธ HTML / ํ…์ŠคํŠธ(์ฝ”๋“œ๋ธ”๋Ÿญ ๋ณด์กด) ๋ณด๊ธฐ", elem_id="container")
70
 
71
  with gr.Row():
72
  url_input = gr.Textbox(label="URL", placeholder="https://example.com", scale=4)
73
- gr.Markdown('<div class="small">URL์„ ์ž…๋ ฅํ•˜๊ณ  ์›ํ•˜๋Š” ๋ฒ„ํŠผ์„ ๋ˆŒ๋Ÿฌ ๊ฒฐ๊ณผ๋ฅผ ์•„๋ž˜์—์„œ ํ™•์ธํ•˜์„ธ์š”.</div>')
74
 
75
  with gr.Row():
76
  btn_html = gr.Button("์›๋ณธ HTML ๋ณด๊ธฐ", scale=1)
77
  btn_text = gr.Button("ํ…์ŠคํŠธ ๋ณด๊ธฐ (์ฝ”๋“œ๋ธ”๋Ÿญ ๋ณด์กด)", scale=1)
 
 
 
78
 
79
- output = gr.Textbox(label="๊ฒฐ๊ณผ", lines=24, show_copy_button=True)
80
 
81
  btn_html.click(fn=handle_html, inputs=url_input, outputs=output)
82
  btn_text.click(fn=handle_text, inputs=url_input, outputs=output)
 
83
 
84
  if __name__ == "__main__":
85
  demo.launch()
 
4
  from bs4 import BeautifulSoup
5
  import gradio as gr
6
 
7
# Optional TextRank summarizer (summa). HAS_SUMMA records whether it could be
# imported; textrank_summarize is always bound so later references never hit
# a NameError.
try:
    from summa.summarizer import summarize as textrank_summarize
    HAS_SUMMA = True
except Exception:
    # Best effort: any import-time failure just disables the TextRank path.
    textrank_summarize = None
    HAS_SUMMA = False
13
+
14
  DEFAULT_HEADERS = {"User-Agent": "Mozilla/5.0"}
15
 
16
# -----------------------------
# Fetch
# -----------------------------
def fetch_html(url: str, timeout: int = 12) -> str:
    """Download *url* and return the response body as text.

    Args:
        url: Address to fetch.
        timeout: Timeout in seconds handed to ``requests.get``.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            4xx/5xx status (via ``raise_for_status``).
    """
    r = requests.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
    r.raise_for_status()
    # When the server declares no charset, Requests decodes text/* bodies as
    # ISO-8859-1, which garbles UTF-8 (e.g. Korean) pages. Use the encoding
    # detected from the content instead.
    if "charset" not in r.headers.get("Content-Type", "").lower():
        r.encoding = r.apparent_encoding
    return r.text
23
 
24
+ # -----------------------------
25
+ # HTML -> Text (preserve code blocks)
26
+ # -----------------------------
27
  def html_to_text_preserve_code(html: str) -> str:
28
+ """
29
+ 1) trafilatura로 본문만 남긴 HTML을 얻는다.
30
+ 2) <pre>, <pre><code>, inline <code>를 Markdown 코드표기(```, `...`)로 바꾼다.
31
+ 3) 나머지 태그 제거 후 줄바꿈 보존한 텍스트를 반환한다.
32
+ """
33
  extracted = trafilatura.extract(
34
  html,
35
  output_format="html",
 
41
 
42
  soup = BeautifulSoup(extracted, "html.parser")
43
 
44
+ # <pre> (including nested <code>) -> fenced block
45
  for pre in soup.find_all("pre"):
46
  code_tag = pre.find("code")
47
  code_text = code_tag.get_text() if code_tag else pre.get_text()
 
49
  code_text = re.sub(r"\n{3,}", "\n\n", code_text).strip("\n")
50
  pre.replace_with(f"\n```\n{code_text}\n```\n")
51
 
52
+ # inline <code> -> `...`
53
  for c in soup.find_all("code"):
54
  c_text = c.get_text().replace("`", "\\`")
55
  c.replace_with(f"`{c_text}`")
56
 
57
+ # strip tags, keep newlines
58
  text_output = soup.get_text("\n")
59
  text_output = re.sub(r"\n{3,}", "\n\n", text_output).strip()
60
  return text_output
61
 
62
+ # -----------------------------
63
+ # Sentence splitting (basic, lang-agnostic-ish)
64
+ # -----------------------------
65
# Split on whitespace that follows a sentence terminator, or on any run of
# newlines. The CJK terminators (ideographic full stop, fullwidth ! and ?) are
# written as \u escapes so the pattern survives encoding mishaps in the file.
_SENT_SPLIT_REGEX = re.compile(r"(?<=[.!?\u3002\uff01\uff1f])\s+|\n+")


def split_sentences(text: str):
    """Split *text* into trimmed, non-empty sentence strings.

    A quick-and-dirty splitter: it breaks after ``.``, ``!``, ``?`` and their
    CJK counterparts when followed by whitespace, and at newlines. Terminators
    stay attached to the sentence they end.

    Args:
        text: Text to split.

    Returns:
        A list of sentences in their original order; empty when *text* holds
        only whitespace.
    """
    parts = _SENT_SPLIT_REGEX.split(text)
    return [s.strip() for s in parts if s.strip()]
71
+
72
+ # -----------------------------
73
+ # Summarize
74
+ # -----------------------------
75
def summarize_text(text: str, max_sentences: int = 3) -> str:
    """Summarize *text* as at most *max_sentences* newline-joined sentences.

    TextRank (summa) is tried first when it was importable. If it is missing,
    raises, or returns nothing, the first sentences of the text are used
    instead.

    Args:
        text: Text to summarize; ``None`` or whitespace-only gives ``""``.
        max_sentences: Upper bound on returned sentences. Values below 1 are
            treated as 1.

    Returns:
        The summary, or ``""`` for empty input.
    """
    text = (text or "").strip()
    if not text:
        return ""

    # A non-positive count would slice from the wrong end (or select nothing),
    # so always keep at least one sentence.
    limit = max(1, int(max_sentences))

    # Try TextRank via summa first.
    if HAS_SUMMA:
        try:
            # No ratio/words argument is passed, so summa's defaults decide how
            # many sentences come back. summa also works on Korean to some
            # degree, but quality varies with the text.
            candidate = textrank_summarize(text, split=True)
            if candidate:
                # Keep only the first `limit` sentences summa returned.
                return "\n".join(candidate[:limit]).strip()
        except Exception:
            # Deliberate best effort: any summa failure falls through to the
            # lead-N fallback below.
            pass

    # Fallback: lead-N sentences.
    sents = split_sentences(text)
    if not sents:
        # Last resort: truncate the raw text.
        return text[:800]
    return "\n".join(sents[:limit]).strip()
98
+
99
# -----------------------------
# Handlers
# -----------------------------
def handle_html(url: str) -> str:
    """Return the raw HTML of *url*, or a user-facing message on failure.

    Args:
        url: Address typed into the UI; ``None`` and surrounding whitespace
            are tolerated.

    Returns:
        The page HTML, a prompt asking for a URL when the input is empty, or
        an error message when fetching fails.
    """
    url = (url or "").strip()
    if not url:
        return "❌ URL을 입력하세요."
    try:
        return fetch_html(url)
    except Exception as e:
        # UI boundary: show the failure in the output box instead of raising.
        return f"에러: {e}"
110
 
111
  def handle_text(url: str) -> str:
112
+ url = (url or "").strip()
113
+ if not url:
114
+ return "โŒ URL์„ ์ž…๋ ฅํ•˜์„ธ์š”."
115
+ try:
116
+ html = fetch_html(url)
117
+ return html_to_text_preserve_code(html)
118
+ except Exception as e:
119
+ return f"์—๋Ÿฌ: {e}"
120
+
121
def handle_summary(url: str, sent_n: int) -> str:
    """Fetch *url*, convert it to text, and return a short summary.

    Args:
        url: Address typed into the UI; ``None`` and surrounding whitespace
            are tolerated.
        sent_n: Requested number of summary sentences (the slider value). It
            is coerced with ``int`` once, so the header and the summary length
            always agree.

    Returns:
        A header line plus the summary; the converted text unchanged when it
        is empty or starts with the extraction-failure message; a prompt when
        the URL is empty; or an error message when any step raises.
    """
    url = (url or "").strip()
    if not url:
        return "❌ URL을 입력하세요."
    try:
        n = int(sent_n)
        html = fetch_html(url)
        text = html_to_text_preserve_code(html)
        # NOTE(review): presumably this prefix is the message
        # html_to_text_preserve_code returns when extraction fails; that code
        # is not visible here, so confirm the two strings match exactly.
        if not text or text.startswith("본문을 추출할 수 없습니다."):
            return text
        summary = summarize_text(text, max_sentences=n)
        if not summary:
            return "요약을 생성할 수 없습니다."
        return f"📝 자동요약 ({n}문장)\n\n{summary}"
    except Exception as e:
        # UI boundary: show the failure in the output box instead of raising.
        return f"에러: {e}"
136
 
137
# -----------------------------
# UI
# -----------------------------
# One URL box, three actions (raw HTML, text, summary), one shared output box.
# NOTE(review): the nesting below is reconstructed; the extracted text carried
# no indentation, so confirm which widgets belong inside each gr.Row().
with gr.Blocks(css="""
#container { max-width: 920px; margin: 0 auto; }
.small { color:#666; font-size:14px; }
""") as demo:
    gr.Markdown("## 링크 입력 → 원본 HTML / 텍스트(코드블럭 보존) / 자동요약", elem_id="container")

    with gr.Row():
        url_input = gr.Textbox(label="URL", placeholder="https://example.com", scale=4)
    gr.Markdown('<div class="small">URL을 입력하고 원하는 동작 버튼을 누르세요.</div>')

    with gr.Row():
        btn_html = gr.Button("원본 HTML 보기", scale=1)
        btn_text = gr.Button("텍스트 보기 (코드블럭 보존)", scale=1)
    with gr.Row():
        sent_n = gr.Slider(1, 8, value=3, step=1, label="요약 문장 수")
        btn_sum = gr.Button("자동요약 보기", scale=1)

    output = gr.Textbox(label="결과", lines=26, show_copy_button=True)

    # All three buttons write into the same output box.
    btn_html.click(fn=handle_html, inputs=url_input, outputs=output)
    btn_text.click(fn=handle_text, inputs=url_input, outputs=output)
    btn_sum.click(fn=handle_summary, inputs=[url_input, sent_n], outputs=output)

if __name__ == "__main__":
    demo.launch()