orgoflu committed on
Commit
5f01545
·
verified ·
1 Parent(s): cc4c1e3

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +83 -0
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import nltk
2
# Fetch the sentence-tokenizer data at import time. Recent NLTK releases look
# up the "punkt_tab" resource while older ones use "punkt", so request both;
# nltk.download reports a failed download instead of raising.
for _nltk_resource in ("punkt", "punkt_tab"):
    nltk.download(_nltk_resource)
3
+
4
+ import gradio as gr
5
+ import trafilatura, requests, re
6
+ from markdownify import markdownify as md
7
+ from sumy.parsers.plaintext import PlaintextParser
8
+ from sumy.nlp.tokenizers import Tokenizer
9
+ from sumy.summarizers.text_rank import TextRankSummarizer
10
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoModelForVision2Seq
11
+
12
# Maps the label shown in the model dropdown to its Hugging Face model id.
# Insertion order is the order in which the choices are listed.
MODEL_OPTIONS = {
    "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
    "CLOVA-Donut-CORDv2": "naver-clova-ix/donut-base-finetuned-cord-v2",
}
16
+
17
# Pipelines already built in this process, keyed by Hugging Face model id.
_PIPELINE_CACHE = {}


def load_model(model_name):
    """Return a transformers pipeline for ``model_name``, building it at most once.

    Loading the tokenizer and weights is expensive, so the finished pipeline
    is kept in ``_PIPELINE_CACHE`` and reused by later calls with the same id.

    Args:
        model_name: Hugging Face model id (a value of ``MODEL_OPTIONS``).

    Returns:
        An ``image-to-text`` pipeline for the Donut model id, otherwise a
        ``text-generation`` pipeline placed on the CPU.
    """
    if model_name in _PIPELINE_CACHE:
        return _PIPELINE_CACHE[model_name]

    if model_name == MODEL_OPTIONS["CLOVA-Donut-CORDv2"]:
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForVision2Seq.from_pretrained(model_name)
        built = pipeline("image-to-text", model=model, tokenizer=tokenizer)
    else:
        tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_name, trust_remote_code=True
        ).to("cpu")
        # device=-1 keeps the pipeline itself on the CPU as well.
        built = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)

    _PIPELINE_CACHE[model_name] = built
    return built
26
+
27
def clean_text(text):
    """Collapse every run of whitespace in ``text`` into one space and trim the ends."""
    # split() with no separator cuts on whitespace runs and drops the empty
    # pieces at both ends, so re-joining with a single space normalizes the text.
    words = text.split()
    return " ".join(words)
29
+
30
def summarize_text(text):
    """Pick a few representative sentences from ``text``.

    The text is whitespace-normalized first. The number of sentences requested
    grows with the cleaned length: 1 below 300 characters, 2 below 800,
    3 below 1500, otherwise 4. A Korean tokenizer is tried first, then an
    English one; if neither yields any sentences, the text is split on
    sentence-ending punctuation and the leading sentences are returned.

    Args:
        text: Raw text to summarize.

    Returns:
        A list of sentence strings. The list is empty when ``text`` has no
        non-whitespace content, so callers can substitute a placeholder.
    """
    text = clean_text(text)
    if not text:
        # Without this guard the regex fallback would return [''], which is
        # truthy and would defeat a caller's `or [...]` placeholder.
        return []

    length = len(text)
    if length < 300:
        sentence_count = 1
    elif length < 800:
        sentence_count = 2
    elif length < 1500:
        sentence_count = 3
    else:
        sentence_count = 4

    for language in ("korean", "english"):
        try:
            parser = PlaintextParser.from_string(text, Tokenizer(language))
            # Reading the sentences stays inside the try so that a tokenizer
            # failure here also falls through to the next language.
            has_sentences = bool(parser.document.sentences)
        except Exception:
            # Best effort: an unusable tokenizer just means "try the next one".
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
            # propagate.
            continue
        if has_sentences:
            summarizer = TextRankSummarizer()
            return [str(s) for s in summarizer(parser.document, sentence_count)]

    # Neither tokenizer produced sentences: split after ., ! or ? instead.
    return re.split(r'(?<=[.!?])\s+', text)[:sentence_count]
45
+
46
def rewrite_with_llm(sentences, model_choice):
    """Paraphrase the summary sentences with the selected model.

    Args:
        sentences: Summary sentences to rewrite.
        model_choice: A key of ``MODEL_OPTIONS``.

    Returns:
        The generated rewrite with surrounding whitespace removed. For
        ``"CLOVA-Donut-CORDv2"`` the sentences are returned unchanged, joined
        by newlines, because that option is loaded as an image-to-text
        pipeline and is not used for text rewriting.

    Raises:
        KeyError: If ``model_choice`` is not a key of ``MODEL_OPTIONS``.
    """
    joined_text = "\n".join(sentences)
    if model_choice == "CLOVA-Donut-CORDv2":
        return joined_text

    llm_pipeline = load_model(MODEL_OPTIONS[model_choice])
    prompt = f"λ‹€μŒ λ¬Έμž₯을 μ˜λ―ΈλŠ” μœ μ§€ν•˜λ˜, 원문에 μ—†λŠ” λ‚΄μš©μ€ μ ˆλŒ€ μΆ”κ°€ν•˜μ§€ 말고 μžμ—°μŠ€λŸ½κ²Œ λ°”κΏ”μ£Όμ„Έμš”.\n\nλ¬Έμž₯:\n{joined_text}"
    # Greedy decoding (do_sample=False) does not use a temperature, so none is
    # passed. return_full_text=False makes the pipeline return only the newly
    # generated text, which avoids stripping the prompt by string replacement
    # (that would also delete any prompt text echoed inside the generation).
    result = llm_pipeline(
        prompt,
        max_new_tokens=150,
        do_sample=False,
        return_full_text=False,
    )
    return result[0]["generated_text"].strip()
54
+
55
def process_url(url, model_choice):
    """Fetch ``url``, extract its main content, summarize it and rewrite the summary.

    Args:
        url: Address of the page to process.
        model_choice: A key of ``MODEL_OPTIONS`` selecting the rewrite model.

    Returns:
        A 3-tuple ``(markdown_text, summary, rewritten_text)``. Any failure is
        caught and reported in the first element, with placeholder strings in
        the other two, so this function never raises.
    """
    try:
        # Pasted URLs often carry stray whitespace or a trailing newline.
        url = (url or "").strip()
        r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        r.raise_for_status()
        # When a text response declares no charset, requests falls back to
        # ISO-8859-1, which garbles non-Latin pages. Use the encoding detected
        # from the body in that case.
        if not r.encoding or r.encoding.lower() == "iso-8859-1":
            r.encoding = r.apparent_encoding
        page_html = r.text
        html_content = trafilatura.extract(
            page_html,
            output_format="html",
            include_tables=False,
            favor_recall=True,
        )
        # Fall back to the whole page when extraction returns nothing.
        markdown_text = md(html_content or page_html, heading_style="ATX")
        summary_sentences = summarize_text(markdown_text) or ["μš”μ•½ μ—†μŒ"]
        paraphrased_text = rewrite_with_llm(summary_sentences, model_choice)
        return markdown_text, "\n".join(summary_sentences), paraphrased_text
    except Exception as e:
        # UI boundary: show the error in the first output instead of crashing.
        return f"μ—λŸ¬ λ°œμƒ: {e}", "μš”μ•½ μ—†μŒ", "μž¬μž‘μ„± μ—†μŒ"
66
+
67
# Input widgets: the page URL and the model used for the rewrite step.
_url_input = gr.Textbox(label="URL μž…λ ₯", placeholder="https://example.com")
_model_input = gr.Dropdown(
    choices=list(MODEL_OPTIONS),
    value="Qwen2.5-1.5B-Instruct",
    label="μž¬μž‘μ„± λͺ¨λΈ 선택",
)

# Output widgets, in the order process_url returns its three values.
_body_output = gr.Markdown(label="μΆ”μΆœλœ λ³Έλ¬Έ")
_summary_output = gr.Textbox(label="μžλ™ μš”μ•½", lines=5)
_rewrite_output = gr.Textbox(label="μžλ™ μž¬μž‘μ„± (LLM)", lines=5)

# Gradio app that wires the widgets to process_url.
iface = gr.Interface(
    fn=process_url,
    inputs=[_url_input, _model_input],
    outputs=[_body_output, _summary_output, _rewrite_output],
    title="ν•œκ΅­μ–΄ λ³Έλ¬Έ μΆ”μΆœ + μžλ™ μš”μ•½ + LLM μž¬μž‘μ„±",
    description="URLμ—μ„œ 본문을 μΆ”μΆœν•˜κ³ , μžλ™ μš”μ•½ ν›„ μ„ νƒν•œ λͺ¨λΈλ‘œ μž¬μž‘μ„±ν•©λ‹ˆλ‹€.",
)

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    iface.launch()