Spaces:

orgoflu
/

moro_flask_proxy

Sleeping

App Files Files Community

moro_flask_proxy / app.py

orgoflu

Update app.py

bba79ad verified 5 months ago

raw

history blame contribute delete

5.55 kB

	import re
	import requests
	import trafilatura
	from bs4 import BeautifulSoup
	import gradio as gr

	# Optional dependency: the TextRank summarizer from the summa package.
	# HAS_SUMMA records whether the import succeeded.
	try:
	    from summa.summarizer import summarize as textrank_summarize
	except Exception:
	    HAS_SUMMA = False
	else:
	    HAS_SUMMA = True

	# Default HTTP request headers.
	DEFAULT_HEADERS = {"User-Agent": "Mozilla/5.0"}

	# -----------------------------
	# Fetch
	# -----------------------------
	def fetch_html(url: str, timeout: int = 12) -> str:
	    """Download *url* and return the response body decoded as text.

	    Args:
	        url: Address to request.
	        timeout: Value passed to ``requests.get`` as its ``timeout``.

	    Returns:
	        The response body as a string.

	    Raises:
	        requests.RequestException: If the request fails, or from
	            ``raise_for_status()`` when the server answers 4xx/5xx.
	    """
	    response = requests.get(url, headers=DEFAULT_HEADERS, timeout=timeout)
	    response.raise_for_status()

	    # When the server declares no charset, requests assumes ISO-8859-1 for
	    # text/* responses, which garbles UTF-8 (e.g. Korean) pages. In that case
	    # let the encoding be detected from the bytes instead.
	    declared_type = response.headers.get("Content-Type", "")
	    if "charset" not in declared_type.lower():
	        response.encoding = response.apparent_encoding
	    return response.text

	# -----------------------------
	# HTML -> Text (preserve code blocks)
	# -----------------------------
	def html_to_text_preserve_code(html: str) -> str:
	    """Convert a page's HTML to plain text while keeping code readable.

	    Steps:
	      1. Ask trafilatura for an HTML rendering of the extracted content.
	      2. Rewrite ``<pre>`` (with or without a nested ``<code>``) as a fenced
	         block and remaining inline ``<code>`` as a backtick span.
	      3. Drop all other tags and return newline-separated text.

	    Returns the fixed message "본문을 추출할 수 없습니다." when trafilatura
	    yields nothing.
	    """
	    main_html = trafilatura.extract(
	        html,
	        output_format="html",
	        include_tables=True,
	        favor_recall=True,
	    )
	    if not main_html:
	        return "본문을 추출할 수 없습니다."

	    tree = BeautifulSoup(main_html, "html.parser")

	    # Block code first, so that a <code> nested in <pre> is consumed here
	    # and not wrapped a second time by the inline pass below.
	    for block in tree.find_all("pre"):
	        inner = block.find("code")
	        body = (inner or block).get_text().replace("\r\n", "\n")
	        # Collapse runs of blank lines and trim newlines at both ends.
	        body = re.sub(r"\n{3,}", "\n\n", body).strip("\n")
	        block.replace_with("\n```\n" + body + "\n```\n")

	    # Whatever <code> is left is inline; backticks inside it are escaped.
	    for span in tree.find_all("code"):
	        escaped = span.get_text().replace("`", "\\`")
	        span.replace_with("`" + escaped + "`")

	    # Flatten to text with a newline between nodes, then cap blank-line runs.
	    flattened = tree.get_text("\n")
	    return re.sub(r"\n{3,}", "\n\n", flattened).strip()

	# -----------------------------
	# Sentence splitting (basic, lang-agnostic-ish)
	# -----------------------------
	# Split after sentence-ending punctuation (ASCII or CJK) followed by
	# whitespace, OR on any run of newlines. The "|" must be an unescaped
	# alternation; written as "\|" it would only match a literal pipe character.
	_SENT_SPLIT_REGEX = re.compile(r"(?<=[.!?。！？])\s+|\n+")

	def split_sentences(text: str):
	    """Split *text* into a list of non-empty, stripped sentences.

	    A simple splitter: it breaks after ``.``, ``!``, ``?`` and their CJK
	    counterparts when followed by whitespace, and at line breaks. Empty
	    input yields an empty list.
	    """
	    pieces = _SENT_SPLIT_REGEX.split(text)
	    return [piece.strip() for piece in pieces if piece.strip()]

	# -----------------------------
	# Summarize
	# -----------------------------
	def summarize_text(text: str, max_sentences: int = 3) -> str:
	    """Summarize *text* as newline-joined sentences.

	    If summa was imported (``HAS_SUMMA``), TextRank is tried first and the
	    first ``max_sentences`` items of its result are used. If that path is
	    unavailable, raises, or returns nothing, the leading ``max_sentences``
	    sentences from ``split_sentences`` are used instead. If no sentences are
	    found at all, the first 800 characters are returned. Blank or ``None``
	    input gives an empty string.
	    """
	    body = (text or "").strip()
	    if not body:
	        return ""

	    if HAS_SUMMA:
	        try:
	            # split=True so the result can be sliced per sentence; summa's
	            # own length settings are left at their defaults.
	            ranked = textrank_summarize(body, split=True)
	            if ranked:
	                return "\n".join(ranked[:max_sentences]).strip()
	        except Exception:
	            # Best effort only: any TextRank failure drops to the fallback.
	            pass

	    # Fallback: lead-N sentences, or a plain truncation as a last resort.
	    leading = split_sentences(body)
	    if leading:
	        return "\n".join(leading[:max_sentences]).strip()
	    return body[:800]

	# -----------------------------
	# Handlers
	# -----------------------------
	def handle_html(url: str) -> str:
	    """Return the raw HTML for *url*, or a message string on failure.

	    A blank or missing URL produces a prompt to enter one; any exception
	    from the fetch is returned as an error message instead of being raised.
	    """
	    target = url.strip() if url else ""
	    if target == "":
	        return "❌ URL을 입력하세요."
	    try:
	        page = fetch_html(target)
	    except Exception as exc:
	        return f"에러: {exc}"
	    return page

	def handle_text(url: str) -> str:
	    """Return the extracted text for *url*, or a message string on failure.

	    A blank or missing URL produces a prompt to enter one; any exception
	    from fetching or converting is returned as an error message.
	    """
	    target = url.strip() if url else ""
	    if target == "":
	        return "❌ URL을 입력하세요."
	    try:
	        page = fetch_html(target)
	        extracted = html_to_text_preserve_code(page)
	    except Exception as exc:
	        return f"에러: {exc}"
	    return extracted

	def handle_summary(url: str, sent_n: int) -> str:
	    """Return a summary of the page at *url* with a header line.

	    ``sent_n`` is converted with ``int()`` and passed on as the sentence
	    limit. A blank or missing URL produces a prompt to enter one. If text
	    extraction returns nothing or its failure message, that value is
	    returned unchanged. Any exception is returned as an error message.
	    """
	    target = url.strip() if url else ""
	    if target == "":
	        return "❌ URL을 입력하세요."
	    try:
	        page_text = html_to_text_preserve_code(fetch_html(target))
	        # Pass the extractor's empty result or failure message through as-is.
	        if not page_text:
	            return page_text
	        if page_text.startswith("본문을 추출할 수 없습니다."):
	            return page_text
	        digest = summarize_text(page_text, max_sentences=int(sent_n))
	        if digest:
	            return f"📝 자동요약 ({sent_n}문장)\n\n{digest}"
	        return "요약을 생성할 수 없습니다."
	    except Exception as exc:
	        return f"에러: {exc}"

	# -----------------------------
	# UI
	# -----------------------------
	# Build the Gradio Blocks app bound to `demo`, with two small CSS rules.
	# NOTE(review): the nested indentation of this block was lost in this copy,
	# so which statements sit inside each `with gr.Row():` (in particular the
	# hint Markdown after the URL textbox) cannot be determined here. Confirm
	# against the original file before re-indenting.
	with gr.Blocks(css="""
	#container { max-width: 920px; margin: 0 auto; }
	.small { color:#666; font-size:14px; }
	""") as demo:
	# Page heading; elem_id matches the #container CSS rule above.
	gr.Markdown("## 링크 입력 → 원본 HTML / 텍스트(코드블럭 보존) / 자동요약", elem_id="container")

	# URL input and a short usage hint styled by the .small CSS class.
	with gr.Row():
	url_input = gr.Textbox(label="URL", placeholder="https://example.com", scale=4)
	gr.Markdown('<div class="small">URL을 입력하고 원하는 동작 버튼을 누르세요.</div>')

	# Action buttons, plus a slider (1-8, default 3) for the summary length.
	with gr.Row():
	btn_html = gr.Button("원본 HTML 보기", scale=1)
	btn_text = gr.Button("텍스트 보기 (코드블럭 보존)", scale=1)
	with gr.Row():
	sent_n = gr.Slider(1, 8, value=3, step=1, label="요약 문장 수")
	btn_sum = gr.Button("자동요약 보기", scale=1)

	# Single result box shared by all three actions.
	output = gr.Textbox(label="결과", lines=26, show_copy_button=True)

	# Wire each button to its handler; every handler writes to `output`.
	btn_html.click(fn=handle_html, inputs=url_input, outputs=output)
	btn_text.click(fn=handle_text, inputs=url_input, outputs=output)
	btn_sum.click(fn=handle_summary, inputs=[url_input, sent_n], outputs=output)

	# Launch the app only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    demo.launch()