Spaces:
Sleeping
Sleeping
File size: 2,736 Bytes
b454ab3 c221558 f6a9bc3 e97007d c221558 f6a9bc3 dbf0822 f6a9bc3 e97007d f6a9bc3 e97007d c221558 e97007d c221558 b454ab3 e97007d c221558 e97007d c221558 e97007d c221558 e97007d c221558 f6a9bc3 e97007d f6a9bc3 e97007d c221558 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 |
# The two nltk.download() calls run at import time, before the remaining
# imports, and fetch the "punkt" and "punkt_tab" resources.
# NOTE(review): presumably these are the sentence-tokenizer models that
# sumy's Tokenizer("english") needs — confirm against the pinned versions.
import nltk
nltk.download("punkt")
nltk.download("punkt_tab")
import gradio as gr
import trafilatura
import requests
from markdownify import markdownify as md
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
import re
def summarize_text(text, sentence_count=3):
    """Pick the highest-ranked sentences of *text* with TextRank.

    Args:
        text: Plain text to summarize; it is parsed with the "english"
            tokenizer.
        sentence_count: How many sentences to ask the summarizer for.

    Returns:
        A list with the selected sentences converted to ``str``.
    """
    parsed = PlaintextParser.from_string(text, Tokenizer("english"))
    rank_sentences = TextRankSummarizer()
    selected = rank_sentences(parsed.document, sentence_count)
    return list(map(str, selected))
def paraphrase_text(sentences):
    """Apply a simple rule-based paraphrase to each sentence.

    This is a local, LLM-free approach, so the output quality is basic.
    Every sentence goes through two steps:

    1. Literal synonym substitution using a fixed word table.
    2. A small word-order change: if the sentence contains "는", the text
       after its first occurrence is moved in front of the text before it.

    Args:
        sentences: Iterable of sentence strings. It is not modified.

    Returns:
        A new list holding one paraphrased string per input sentence, in
        the same order.
    """
    # NOTE(review): these literals were reconstructed from a mis-encoded
    # (UTF-8 read as a Thai codepage) copy of the file. "협의 결과안",
    # "보상금" and the "—" separator below lost bytes in that copy and are
    # best guesses — confirm them against the original source.
    replacements = {
        "노조": "노동조합",
        "성과급": "성과 보상금",
        "요구": "요청",
        "불만": "이의 제기",
        "합의안": "협의 결과안",
    }
    marker = "는"

    paraphrased = []
    for sentence in sentences:
        new_s = sentence
        for old, new in replacements.items():
            # Plain substring replacement: the table holds literal words,
            # so nothing should be interpreted as a regex pattern or as a
            # replacement template.
            new_s = new_s.replace(old, new)

        # Example of a slight word-order change: swap the parts around the
        # first marker and put the marker at the end.
        head, found, tail = new_s.partition(marker)
        if found:
            new_s = f"{tail.strip()} — {head.strip()}{marker}"
        paraphrased.append(new_s)
    return paraphrased
def extract_summarize_paraphrase(url):
    """Fetch a web page and return its body, a summary, and a paraphrase.

    The page is downloaded, its main content is extracted as HTML and
    converted to Markdown, then summarized to three sentences which are
    finally paraphrased.

    Args:
        url: Address of the page to process.

    Returns:
        A 3-tuple of strings ``(markdown_text, summary, paraphrase)``.
        Summary and paraphrase contain one sentence per line. When nothing
        can be extracted, or any step raises, the first item is a
        user-facing message and the other two are empty strings.
    """
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # When a text/* response declares no charset, requests decodes
        # .text as ISO-8859-1, which garbles non-Latin (e.g. Korean)
        # pages. Fall back to the encoding detected from the body.
        content_type = response.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            response.encoding = response.apparent_encoding

        html_content = trafilatura.extract(
            response.text,
            output_format="html",
            include_tables=True,
            favor_recall=True,
        )
        if not html_content:
            # Message: "Could not extract the main content."
            return "본문을 추출할 수 없습니다.", "", ""

        markdown_text = md(html_content, heading_style="ATX")
        summary_sentences = summarize_text(markdown_text, sentence_count=3)
        paraphrased_sentences = paraphrase_text(summary_sentences)
        return (
            markdown_text,
            "\n".join(summary_sentences),
            "\n".join(paraphrased_sentences),
        )
    except Exception as e:
        # UI boundary: surface the failure in the first output slot instead
        # of letting it propagate. Message: "Error occurred: ...".
        return f"에러 발생: {e}", "", ""
iface = gr.Interface(
fn=extract_summarize_paraphrase,
inputs=gr.Textbox(label="URL ์
๋ ฅ", placeholder="https://example.com"),
outputs=[
gr.Markdown(label="์ถ์ถ๋ ๋ณธ๋ฌธ"),
gr.Textbox(label="์๋ ์์ฝ", lines=5),
gr.Textbox(label="์๋ ์ฌ์์ฑ", lines=5)
],
title="๋ณธ๋ฌธ ์ถ์ถ๊ธฐ + ์๋ ์์ฝ + ์๋ ์ฌ์์ฑ",
description="์นํ์ด์ง URL์ ์
๋ ฅํ๋ฉด ๋ณธ๋ฌธ์ ์ถ์ถํ๊ณ , 3๋ฌธ์ฅ ์์ฝ๊ณผ ์ฌ์์ฑ(Paraphrasing) ๊ฒฐ๊ณผ๋ฅผ ์ ๊ณตํฉ๋๋ค."
)
if __name__ == "__main__":
iface.launch() |