orgoflu committed on
Commit
2382c2d
·
verified ·
1 Parent(s): 5736bd1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +73 -48
app.py CHANGED
@@ -1,60 +1,85 @@
1
- import gradio as gr
2
- import trafilatura
3
  import requests
4
- from markdownify import markdownify as md
5
- from sumy.parsers.plaintext import PlaintextParser
6
- from sumy.nlp.tokenizers import Tokenizer
7
- from sumy.summarizers.text_rank import TextRankSummarizer
8
 
9
# === Automatic summarization ===
def auto_summarize(text, sentence_count=3, language="english"):
    """Return an extractive TextRank summary of *text*.

    Args:
        text: Text to summarize.
        sentence_count: Number of sentences requested from the summarizer.
        language: Language name passed to sumy's ``Tokenizer``. Defaults to
            ``"english"``, the value that was previously hard-coded.

    Returns:
        The selected sentences joined by single spaces. Empty or ``None``
        input yields ``""``. If summarization raises, the first 500
        characters of *text* are returned instead.
    """
    if not text:
        # Nothing to summarize; also keeps the fallback slice below from
        # failing on None.
        return ""
    try:
        parser = PlaintextParser.from_string(text, Tokenizer(language))
        summarizer = TextRankSummarizer()
        sentences = summarizer(parser.document, sentence_count)
        return " ".join(str(sentence) for sentence in sentences).strip()
    except Exception:
        # Deliberate best-effort fallback: a truncated excerpt is more useful
        # to the caller than an error when the summarizer cannot run.
        return text[:500]
18
-
19
- # === URL ํŒŒ์‹ฑ + ๋ชจ๋“œ๋ณ„ ์ถœ๋ ฅ ===
20
- def parse_and_display(url, mode):
21
- if not url.strip():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  return "โŒ URL์„ ์ž…๋ ฅํ•˜์„ธ์š”."
23
  try:
24
- headers = {"User-Agent": "Mozilla/5.0"}
25
- r = requests.get(url, headers=headers, timeout=10)
26
- r.raise_for_status()
27
-
28
- html_content = trafilatura.extract(
29
- r.text,
30
- output_format="html",
31
- include_tables=True,
32
- favor_recall=True
33
- )
34
- if not html_content:
35
- return "๋ณธ๋ฌธ์„ ์ถ”์ถœํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
36
-
37
- markdown_text = md(html_content, heading_style="ATX")
38
- summary_text = auto_summarize(markdown_text, sentence_count=3)
39
-
40
- if mode == "์ž๋™์š”์•ฝ":
41
- return f"๐Ÿ“ **์ž๋™์š”์•ฝ**\n\n{summary_text}\n\n๐Ÿ”— [์›๋ฌธ ๋ณด๊ธฐ]({url})"
42
- else:
43
- return f"๐Ÿ“„ **์›๋ฌธ**\n\n{markdown_text}\n\n๐Ÿ”— [์›๋ฌธ ๋ณด๊ธฐ]({url})"
44
 
 
 
 
 
 
 
 
 
45
  except Exception as e:
46
- return f"์—๋Ÿฌ: {str(e)}"
 
 
 
 
 
 
 
 
 
 
47
 
48
# === Gradio UI ===
# NOTE(review): the source lost its indentation; the output pane and button
# are assumed to sit below the input row rather than inside it - confirm.
with gr.Blocks() as demo:
    gr.Markdown("## 🔗 링크 → 자동요약 / 원문 보기")
    with gr.Row():
        url_input = gr.Textbox(label="URL 입력", placeholder="https://example.com", scale=3)
        mode_select = gr.Radio(["자동요약", "원문"], value="자동요약", label="기본 모드", scale=1)
    output = gr.Markdown()
    run_btn = gr.Button("추가")

    # Render the fetched page into the Markdown pane when the button is clicked.
    run_btn.click(parse_and_display, inputs=[url_input, mode_select], outputs=output)

if __name__ == "__main__":
    demo.launch()
 
1
+ import re
 
2
  import requests
3
+ import trafilatura
4
+ from bs4 import BeautifulSoup
5
+ import gradio as gr
 
6
 
7
# Headers sent with every outgoing request.
DEFAULT_HEADERS = {"User-Agent": "Mozilla/5.0"}


def fetch_html(url: str, timeout: int = 12) -> str:
    """Download *url* and return the response body decoded as text.

    Args:
        url: Address to fetch.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        The decoded response body.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            4xx/5xx status (via ``raise_for_status``).
    """
    response = requests.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
    response.raise_for_status()

    # For a text/* response without an explicit charset, requests falls back
    # to ISO-8859-1, which garbles UTF-8 pages. Use the encoding detected
    # from the body whenever the server did not declare one.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = response.apparent_encoding
    return response.text
13
+
14
def html_to_text_preserve_code(html: str) -> str:
    """Convert a page's main content to plain text, keeping code readable.

    The main content is extracted with trafilatura as HTML. ``<pre>`` blocks
    become fenced code blocks, inline ``<code>`` becomes backtick-wrapped
    text, and all remaining tags are stripped with newlines between text
    nodes.

    Args:
        html: Raw HTML of the page.

    Returns:
        The converted text, or a user-facing message when trafilatura
        extracts nothing.
    """
    extracted = trafilatura.extract(
        html,
        output_format="html",
        include_tables=True,
        favor_recall=True,
    )
    if not extracted:
        return "본문을 추출할 수 없습니다."

    soup = BeautifulSoup(extracted, "html.parser")

    # Convert <pre><code> and bare <pre> into fenced code blocks.
    for pre in soup.find_all("pre"):
        code_tag = pre.find("code")
        code_text = (code_tag if code_tag else pre).get_text()
        code_text = code_text.replace("\r\n", "\n")
        code_text = re.sub(r"\n{3,}", "\n\n", code_text).strip("\n")
        # Use a fence longer than any backtick run inside the code so that a
        # literal ``` in the snippet cannot terminate the block early.
        longest_run = max((len(run) for run in re.findall(r"`+", code_text)), default=0)
        fence = "`" * max(3, longest_run + 1)
        pre.replace_with(f"\n{fence}\n{code_text}\n{fence}\n")

    # Inline <code> -> `...` (the <pre> blocks above are already plain text).
    for inline in soup.find_all("code"):
        inline_text = inline.get_text().replace("`", "\\`")
        inline.replace_with(f"`{inline_text}`")

    # Strip the remaining tags while keeping line breaks, then collapse runs
    # of three or more newlines into a single blank line.
    text_output = soup.get_text("\n")
    return re.sub(r"\n{3,}", "\n\n", text_output).strip()
43
+
44
+ def handle_html(url: str) -> str:
45
+ url = (url or "").strip()
46
+ if not url:
47
  return "โŒ URL์„ ์ž…๋ ฅํ•˜์„ธ์š”."
48
  try:
49
+ html = fetch_html(url)
50
+ return html
51
+ except Exception as e:
52
+ return f"์—๋Ÿฌ: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
+ def handle_text(url: str) -> str:
55
+ url = (url or "").strip()
56
+ if not url:
57
+ return "โŒ URL์„ ์ž…๋ ฅํ•˜์„ธ์š”."
58
+ try:
59
+ html = fetch_html(url)
60
+ text = html_to_text_preserve_code(html)
61
+ return text
62
  except Exception as e:
63
+ return f"์—๋Ÿฌ: {e}"
64
+
65
+ with gr.Blocks(css="""
66
+ #container { max-width: 920px; margin: 0 auto; }
67
+ .small { color:#666; font-size:14px; }
68
+ """) as demo:
69
+ gr.Markdown("## ๋งํฌ ์ž…๋ ฅ โ†’ ์›๋ณธ HTML / ํ…์ŠคํŠธ(์ฝ”๋“œ๋ธ”๋Ÿญ ๋ณด์กด) ๋ณด๊ธฐ", elem_id="container")
70
+
71
+ with gr.Row():
72
+ url_input = gr.Textbox(label="URL", placeholder="https://example.com", scale=4)
73
+ gr.Markdown('<div class="small">URL์„ ์ž…๋ ฅํ•˜๊ณ  ์›ํ•˜๋Š” ๋ฒ„ํŠผ์„ ๋ˆŒ๋Ÿฌ ๊ฒฐ๊ณผ๋ฅผ ์•„๋ž˜์—์„œ ํ™•์ธํ•˜์„ธ์š”.</div>')
74
 
 
 
 
75
  with gr.Row():
76
+ btn_html = gr.Button("์›๋ณธ HTML ๋ณด๊ธฐ", scale=1)
77
+ btn_text = gr.Button("ํ…์ŠคํŠธ ๋ณด๊ธฐ (์ฝ”๋“œ๋ธ”๋Ÿญ ๋ณด์กด)", scale=1)
78
+
79
+ output = gr.Textbox(label="๊ฒฐ๊ณผ", lines=24, show_copy_button=True)
80
 
81
+ btn_html.click(fn=handle_html, inputs=url_input, outputs=output)
82
+ btn_text.click(fn=handle_text, inputs=url_input, outputs=output)
83
 
84
  if __name__ == "__main__":
85
  demo.launch()