orgoflu committed on
Commit
eebc78a
·
verified ·
1 Parent(s): 574b3b3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +35 -36
app.py CHANGED
@@ -4,15 +4,12 @@ nltk.download("punkt")
4
  import gradio as gr
5
  import trafilatura, requests, re
6
  from markdownify import markdownify as md
7
- from sumy.parsers.plaintext import PlaintextParser
8
- from sumy.nlp.tokenizers import Tokenizer
9
- from sumy.summarizers.text_rank import TextRankSummarizer
10
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
11
 
12
  # ===== λͺ¨λΈ λͺ©λ‘ =====
13
  MODEL_OPTIONS = {
14
  "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
15
- "CLOVA-Text(λŒ€μ²΄)": "skt/kogpt2-base-v2"
16
  }
17
 
18
  # ===== ν…μŠ€νŠΈ λͺ¨λΈ λ‘œλ“œ =====
@@ -23,38 +20,41 @@ def load_text_model(model_choice):
23
  return pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
24
 
25
  # ===== ν…μŠ€νŠΈ μ „μ²˜λ¦¬ =====
26
- def clean_text(text):
27
  return re.sub(r'\s+', ' ', text).strip()
28
 
29
- # ===== μžλ™ μš”μ•½ =====
30
- def summarize_text(text):
31
  text = clean_text(text)
32
- length = len(text)
33
- sentence_count = 1 if length < 300 else 2 if length < 800 else 3 if length < 1500 else 4
34
- try:
35
- parser = PlaintextParser.from_string(text, Tokenizer("korean"))
36
- if not parser.document.sentences: raise ValueError
37
- except:
38
- try:
39
- parser = PlaintextParser.from_string(text, Tokenizer("english"))
40
- if not parser.document.sentences: raise ValueError
41
- except:
42
- return re.split(r'(?<=[.!?])\s+', text)[:sentence_count]
43
- summarizer = TextRankSummarizer()
44
- return [str(s) for s in summarizer(parser.document, sentence_count)]
 
 
 
45
 
46
  # ===== μž¬μž‘μ„± =====
47
- def rewrite_with_llm(sentences, model_choice):
48
- llm_pipeline = load_text_model(model_choice)
49
- joined_text = "\n".join(sentences)
50
  prompt = f"""λ‹€μŒ λ¬Έμž₯을 μ˜λ―ΈλŠ” μœ μ§€ν•˜λ˜, 원문에 μ—†λŠ” λ‚΄μš©μ€ μ ˆλŒ€ μΆ”κ°€ν•˜μ§€ 말고,
51
- λ¬Έμž₯만 더 μžμ—°μŠ€λŸ½κ²Œ λ°”κΏ”μ£Όμ„Έμš”. λ‹€λ₯Έ μ„€λͺ…μ΄λ‚˜ λΆ€μ—° λ¬Έμž₯은 μ“°μ§€ λ§ˆμ„Έμš”.
52
 
53
  λ¬Έμž₯:
54
- {joined_text}
55
  """
56
- result = llm_pipeline(prompt, max_new_tokens=150, do_sample=False, temperature=0)
57
- return result[0]["generated_text"].replace(prompt, "").strip()
 
58
 
59
  # ===== URL 처리 =====
60
  def process_url(url, model_choice):
@@ -62,23 +62,22 @@ def process_url(url, model_choice):
62
  r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
63
  r.raise_for_status()
64
 
65
- # 원문 순수 ν…μŠ€νŠΈ μΆ”μΆœ (μš”μ•½μš©)
66
  plain_text = trafilatura.extract(r.text, output_format="txt", include_tables=False, favor_recall=True) or ""
67
- # HTML β†’ λ§ˆν¬λ‹€μš΄ (좜λ ₯용)
68
  html_content = trafilatura.extract(r.text, output_format="html", include_tables=False, favor_recall=True)
69
  markdown_text = md(html_content or r.text, heading_style="ATX")
70
 
71
- # 첫 쀄 β†’ 툴팁
72
  first_line = plain_text.strip().split("\n")[0].strip()
73
  link_html = f'<a href="{url}" title="{first_line}" target="_blank">원문 보기</a>'
74
 
75
- # μš”μ•½
76
- summary_sentences = summarize_text(plain_text) or ["μš”μ•½ μ—†μŒ"]
77
 
78
  # μž¬μž‘μ„±
79
- paraphrased_text = rewrite_with_llm(summary_sentences, model_choice)
80
 
81
- return link_html + "<br><br>" + markdown_text, "\n".join(summary_sentences), paraphrased_text
82
  except Exception as e:
83
  return f"μ—λŸ¬ λ°œμƒ: {e}", "μš”μ•½ μ—†μŒ", "μž¬μž‘μ„± μ—†μŒ"
84
 
@@ -94,8 +93,8 @@ iface = gr.Interface(
94
  gr.Textbox(label="μžλ™ μš”μ•½", lines=5),
95
  gr.Textbox(label="μžλ™ μž¬μž‘μ„± (LLM)", lines=5)
96
  ],
97
- title="ν•œκ΅­μ–΄ λ³Έλ¬Έ μΆ”μΆœ + μžλ™ μš”μ•½ + LLM μž¬μž‘μ„±",
98
- description="원문 ν…μŠ€νŠΈμ—μ„œ λ°”λ‘œ μš”μ•½ ν›„, μ„ νƒν•œ λͺ¨λΈ(Qwen λ˜λŠ” KoGPT2)둜 μž¬μž‘μ„±ν•©λ‹ˆλ‹€."
99
  )
100
 
101
  if __name__ == "__main__":
 
4
  import gradio as gr
5
  import trafilatura, requests, re
6
  from markdownify import markdownify as md
 
 
 
7
  from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
8
 
9
  # ===== λͺ¨λΈ λͺ©λ‘ =====
10
  MODEL_OPTIONS = {
11
  "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
12
+ "CLOVA-Text(λŒ€μ²΄)": "skt/kogpt2-base-v2" # ν—ˆκ°€ 없이 μ‚¬μš© κ°€λŠ₯
13
  }
14
 
15
  # ===== ν…μŠ€νŠΈ λͺ¨λΈ λ‘œλ“œ =====
 
20
  return pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
21
 
22
  # ===== ν…μŠ€νŠΈ μ „μ²˜λ¦¬ =====
23
def clean_text(text):
    """Normalize whitespace: collapse every run of whitespace to one space, trim ends."""
    collapsed = re.sub(r"\s+", " ", text)
    return collapsed.strip()
25
 
26
+ # ===== ν…μŠ€νŠΈ λΆ„ν•  =====
27
def chunk_text(text, chunk_size=500):
    """Split the whitespace-normalized text into fixed-size character chunks.

    The final chunk may be shorter than *chunk_size*; empty input yields [].
    """
    normalized = clean_text(text)
    chunks = []
    for start in range(0, len(normalized), chunk_size):
        chunks.append(normalized[start:start + chunk_size])
    return chunks
30
+
31
+ # ===== LLM μš”μ•½ =====
32
def llm_summary(text, model_choice):
    """Summarize *text* with the selected causal LM.

    Args:
        text: Plain text to summarize.
        model_choice: Key into MODEL_OPTIONS naming the model to load.

    Returns:
        The generated summary string with the echoed prompt removed.
    """
    llm = load_text_model(model_choice)
    prompt = f"λ‹€μŒ 글을 3λ¬Έμž₯ μ΄λ‚΄λ‘œ μš”μ•½:\n{text}"
    # do_sample=False means greedy decoding; passing temperature alongside it
    # is contradictory (the value is ignored and transformers emits a warning),
    # so the temperature kwarg was dropped.
    out = llm(prompt, max_new_tokens=150, do_sample=False,
              repetition_penalty=1.2, no_repeat_ngram_size=3)
    generated = out[0]["generated_text"]
    # Strip the prompt only as a leading prefix. The previous
    # `.replace(prompt, "")` deleted *every* occurrence of the prompt text,
    # which could also mangle a generation that happens to repeat it.
    if generated.startswith(prompt):
        generated = generated[len(prompt):]
    return generated.strip()
38
+
39
+ # ===== λΆ„ν•  μš”μ•½ β†’ 톡합 μš”μ•½ =====
40
def multi_stage_summary(text, model_choice):
    """Map-reduce summarization: summarize each chunk, then summarize the merge.

    Args:
        text: Full plain text (may be long).
        model_choice: Key into MODEL_OPTIONS naming the model to use.

    Returns:
        A single summary string; empty string for empty input.
    """
    chunks = chunk_text(text)
    # Empty input: don't prompt the model with an empty string.
    if not chunks:
        return ""
    # A single chunk needs only one pass; the old code summarized the
    # summary a second time, degrading quality and doubling latency.
    if len(chunks) == 1:
        return llm_summary(chunks[0], model_choice)
    partial_summaries = [llm_summary(chunk, model_choice) for chunk in chunks]
    combined_summary = " ".join(partial_summaries)
    return llm_summary(combined_summary, model_choice)
45
 
46
  # ===== μž¬μž‘μ„± =====
47
def rewrite_with_llm(text, model_choice):
    """Paraphrase *text* with the selected LM, preserving its meaning.

    Args:
        text: The text (typically a summary) to rewrite.
        model_choice: Key into MODEL_OPTIONS naming the model to load.

    Returns:
        The rewritten text with the echoed prompt removed.
    """
    llm = load_text_model(model_choice)
    prompt = f"""λ‹€μŒ λ¬Έμž₯을 μ˜λ―ΈλŠ” μœ μ§€ν•˜λ˜, 원문에 μ—†λŠ” λ‚΄μš©μ€ μ ˆλŒ€ μΆ”κ°€ν•˜μ§€ 말고,
반볡 없이 κ°„κ²°ν•˜κ³  λ§€λ„λŸ½κ²Œ λ°”κΏ”μ£Όμ„Έμš”.

λ¬Έμž₯:
{text}
"""
    # Greedy decoding: temperature has no effect when do_sample=False and
    # only triggers a transformers warning, so it was removed.
    out = llm(prompt, max_new_tokens=200, do_sample=False,
              repetition_penalty=1.2, no_repeat_ngram_size=3)
    generated = out[0]["generated_text"]
    # Remove the prompt only when it is the leading prefix of the output;
    # a blanket `.replace(prompt, "")` also deletes later repetitions.
    if generated.startswith(prompt):
        generated = generated[len(prompt):]
    return generated.strip()
58
 
59
  # ===== URL 처리 =====
60
  def process_url(url, model_choice):
 
62
  r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
63
  r.raise_for_status()
64
 
65
+ # 원문 μΆ”μΆœ
66
  plain_text = trafilatura.extract(r.text, output_format="txt", include_tables=False, favor_recall=True) or ""
 
67
  html_content = trafilatura.extract(r.text, output_format="html", include_tables=False, favor_recall=True)
68
  markdown_text = md(html_content or r.text, heading_style="ATX")
69
 
70
+ # 첫 쀄 툴팁
71
  first_line = plain_text.strip().split("\n")[0].strip()
72
  link_html = f'<a href="{url}" title="{first_line}" target="_blank">원문 보기</a>'
73
 
74
+ # λΆ„ν•  μš”μ•½ β†’ 톡합 μš”μ•½
75
+ final_summary = multi_stage_summary(plain_text, model_choice)
76
 
77
  # μž¬μž‘μ„±
78
+ paraphrased_text = rewrite_with_llm(final_summary, model_choice)
79
 
80
+ return link_html + "<br><br>" + markdown_text, final_summary, paraphrased_text
81
  except Exception as e:
82
  return f"μ—λŸ¬ λ°œμƒ: {e}", "μš”μ•½ μ—†μŒ", "μž¬μž‘μ„± μ—†μŒ"
83
 
 
93
  gr.Textbox(label="μžλ™ μš”μ•½", lines=5),
94
  gr.Textbox(label="μžλ™ μž¬μž‘μ„± (LLM)", lines=5)
95
  ],
96
+ title="ν•œκ΅­μ–΄ λ³Έλ¬Έ μΆ”μΆœ + λΆ„ν•  μš”μ•½ + LLM μž¬μž‘μ„±",
97
+ description="κΈ΄ 원문도 λΆ„ν•  μš”μ•½ ν›„ 톡합 μž¬μž‘μ„±μœΌλ‘œ ν’ˆμ§ˆ μœ μ§€"
98
  )
99
 
100
  if __name__ == "__main__":