orgoflu committed on
Commit
e97007d
·
verified ·
1 Parent(s): 0f64d79

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -19
app.py CHANGED
@@ -1,5 +1,4 @@
1
  import nltk
2
- # NLTK 토크나이저 리소스 자동 다운로드
3
  nltk.download("punkt")
4
  nltk.download("punkt_tab")
5
 
@@ -10,21 +9,42 @@ from markdownify import markdownify as md
10
  from sumy.parsers.plaintext import PlaintextParser
11
  from sumy.nlp.tokenizers import Tokenizer
12
  from sumy.summarizers.text_rank import TextRankSummarizer
 
13
 
14
  def summarize_text(text, sentence_count=3):
15
- # 한국어도 문장 단위로 끊기 위해 english 토크나이저 사용
16
  parser = PlaintextParser.from_string(text, Tokenizer("english"))
17
  summarizer = TextRankSummarizer()
18
  summary_sentences = summarizer(parser.document, sentence_count)
19
- return "\n".join(str(sentence) for sentence in summary_sentences)
20
 
21
- def extract_and_summarize(url):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  headers = {"User-Agent": "Mozilla/5.0"}
23
  try:
24
  r = requests.get(url, headers=headers, timeout=10)
25
  r.raise_for_status()
26
 
27
- # HTML 형태로 본문 추출
28
  html_content = trafilatura.extract(
29
  r.text,
30
  output_format="html",
@@ -33,32 +53,28 @@ def extract_and_summarize(url):
33
  )
34
 
35
  if not html_content:
36
- return "๋ณธ๋ฌธ์„ ์ถ”์ถœํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.", ""
37
 
38
- # HTML → Markdown 변환
39
  markdown_text = md(html_content, heading_style="ATX")
40
 
41
- # 요약 생성
42
- summary = summarize_text(markdown_text, sentence_count=3)
43
 
44
- return markdown_text, summary
45
 
46
- except requests.exceptions.Timeout:
47
- return "์š”์ฒญ์ด ์‹œ๊ฐ„ ์ดˆ๊ณผ๋˜์—ˆ์Šต๋‹ˆ๋‹ค.", ""
48
- except requests.exceptions.RequestException as e:
49
- return f"์š”์ฒญ ์‹คํŒจ: {e}", ""
50
  except Exception as e:
51
- return f"์—๋Ÿฌ ๋ฐœ์ƒ: {e}", ""
52
 
53
  iface = gr.Interface(
54
- fn=extract_and_summarize,
55
  inputs=gr.Textbox(label="URL ์ž…๋ ฅ", placeholder="https://example.com"),
56
  outputs=[
57
  gr.Markdown(label="์ถ”์ถœ๋œ ๋ณธ๋ฌธ"),
58
- gr.Textbox(label="์ž๋™ ์š”์•ฝ", lines=5)
 
59
  ],
60
- title="๋ณธ๋ฌธ ์ถ”์ถœ๊ธฐ + ์ž๋™ ์š”์•ฝ",
61
- description="์›นํŽ˜์ด์ง€ URL์„ ์ž…๋ ฅํ•˜๋ฉด ๋ณธ๋ฌธ์„ ์ถ”์ถœํ•˜๊ณ , TextRank ์•Œ๊ณ ๋ฆฌ์ฆ˜์œผ๋กœ 3๋ฌธ์žฅ ์š”์•ฝ์„ ์ œ๊ณตํ•ฉ๋‹ˆ๋‹ค."
62
  )
63
 
64
  if __name__ == "__main__":
 
1
  import nltk
 
2
  nltk.download("punkt")
3
  nltk.download("punkt_tab")
4
 
 
9
  from sumy.parsers.plaintext import PlaintextParser
10
  from sumy.nlp.tokenizers import Tokenizer
11
  from sumy.summarizers.text_rank import TextRankSummarizer
12
+ import re
13
 
def summarize_text(text, sentence_count=3):
    """Return the sentences TextRank selects from *text*, as a list of strings.

    The text is parsed with sumy's ``PlaintextParser`` and ranked with
    ``TextRankSummarizer``.

    Args:
        text: The text to summarize.
        sentence_count: Number of sentences requested from the summarizer.

    Returns:
        A list containing ``str(sentence)`` for every sentence the
        summarizer returned, in the order it returned them.
    """
    # NOTE: the "english" tokenizer is used regardless of the input language.
    parsed = PlaintextParser.from_string(text, Tokenizer("english"))
    rank = TextRankSummarizer()
    return list(map(str, rank(parsed.document, sentence_count)))
19
 
def paraphrase_text(sentences, replacements=None):
    """Produce a crude, rule-based paraphrase of each sentence.

    This is a simple local paraphraser that works without an LLM, so the
    output quality is basic. Two steps are applied to every sentence:

    1. Literal synonym substitution using ``replacements``.
    2. A word-order tweak: the sentence is split at the first occurrence of
       the marker "는" and the two halves are swapped, joined by an em dash.
       The marker is matched as a plain substring, so it may also hit the
       middle of a word; this is a heuristic, not grammatical analysis.

    Args:
        sentences: Iterable of sentence strings.
        replacements: Optional mapping of ``{text_to_find: replacement}``.
            Keys are matched literally (not as regular expressions). When
            omitted, a small built-in Korean synonym table is used.

    Returns:
        A new list with one paraphrased string per input sentence.
    """
    if replacements is None:
        replacements = {
            "노조": "노동조합",
            "성과급": "성과 보상금",
            "요구": "요청",
            "불만": "이의 제기",
            "합의안": "협상 결과안",
        }

    topic_marker = "는"
    paraphrased = []
    for sentence in sentences:
        rewritten = sentence
        # str.replace keeps keys/values literal; a regex substitution would
        # misread metacharacters in keys and backslashes in values.
        for old, new in replacements.items():
            rewritten = rewritten.replace(old, new)

        head, marker, tail = rewritten.partition(topic_marker)
        head, tail = head.strip(), tail.strip()
        # Only reorder when there is text on both sides of the marker;
        # otherwise the swap would leave a dangling " — " fragment.
        if marker and head and tail:
            rewritten = f"{tail} — {head}{topic_marker}"
        paraphrased.append(rewritten)
    return paraphrased
41
+
42
+ def extract_summarize_paraphrase(url):
43
  headers = {"User-Agent": "Mozilla/5.0"}
44
  try:
45
  r = requests.get(url, headers=headers, timeout=10)
46
  r.raise_for_status()
47
 
 
48
  html_content = trafilatura.extract(
49
  r.text,
50
  output_format="html",
 
53
  )
54
 
55
  if not html_content:
56
+ return "๋ณธ๋ฌธ์„ ์ถ”์ถœํ•  ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.", "", ""
57
 
 
58
  markdown_text = md(html_content, heading_style="ATX")
59
 
60
+ summary_sentences = summarize_text(markdown_text, sentence_count=3)
61
+ paraphrased_sentences = paraphrase_text(summary_sentences)
62
 
63
+ return markdown_text, "\n".join(summary_sentences), "\n".join(paraphrased_sentences)
64
 
 
 
 
 
65
  except Exception as e:
66
+ return f"์—๋Ÿฌ ๋ฐœ์ƒ: {e}", "", ""
67
 
68
  iface = gr.Interface(
69
+ fn=extract_summarize_paraphrase,
70
  inputs=gr.Textbox(label="URL ์ž…๋ ฅ", placeholder="https://example.com"),
71
  outputs=[
72
  gr.Markdown(label="์ถ”์ถœ๋œ ๋ณธ๋ฌธ"),
73
+ gr.Textbox(label="์ž๋™ ์š”์•ฝ", lines=5),
74
+ gr.Textbox(label="์ž๋™ ์žฌ์ž‘์„ฑ", lines=5)
75
  ],
76
+ title="๋ณธ๋ฌธ ์ถ”์ถœ๊ธฐ + ์ž๋™ ์š”์•ฝ + ์ž๋™ ์žฌ์ž‘์„ฑ",
77
+ description="์›นํŽ˜์ด์ง€ URL์„ ์ž…๋ ฅํ•˜๋ฉด ๋ณธ๋ฌธ์„ ์ถ”์ถœํ•˜๊ณ , 3๋ฌธ์žฅ ์š”์•ฝ๊ณผ ์žฌ์ž‘์„ฑ(Paraphrasing) ๊ฒฐ๊ณผ๋ฅผ ์ œ๊ณตํ•ฉ๋‹ˆ๋‹ค."
78
  )
79
 
80
  if __name__ == "__main__":