orgoflu committed on
Commit
26bd648
·
verified ·
1 Parent(s): 208dd23

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -23
app.py CHANGED
@@ -1,23 +1,30 @@
 
 
 
1
  import gradio as gr
 
 
 
 
 
 
 
2
  import torch
3
- from transformers import pipeline, AutoTokenizer, AutoModelForVision2Seq
4
 
5
- # ===== ๋ชจ๋ธ ๋ชฉ๋ก =====
6
  MODEL_OPTIONS = {
7
  "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
8
- "Gemma-3-4B-it": "google/gemma-3-4b-it",
9
  "CLOVA-Donut-CORDv2": "naver-clova-ix/donut-base-finetuned-cord-v2"
10
  }
11
 
12
  # ===== ๋ชจ๋ธ ๋กœ๋“œ =====
13
  def load_model(model_name):
14
  if model_name == "naver-clova-ix/donut-base-finetuned-cord-v2":
15
- # Vision2Seq ๋ชจ๋ธ ๋กœ๋“œ
16
  tokenizer = AutoTokenizer.from_pretrained(model_name)
17
  model = AutoModelForVision2Seq.from_pretrained(model_name)
18
  return pipeline("image-to-text", model=model, tokenizer=tokenizer)
19
  else:
20
- from transformers import AutoModelForCausalLM, AutoTokenizer
21
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
22
  model = AutoModelForCausalLM.from_pretrained(
23
  model_name,
@@ -26,25 +33,122 @@ def load_model(model_name):
26
  ).to("cpu")
27
  return pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
28
 
29
- # ===== CLOVA ์ด๋ฏธ์ง€ ์ฒ˜๋ฆฌ =====
30
- def process_image_with_clova(image):
31
- pipe = load_model("naver-clova-ix/donut-base-finetuned-cord-v2")
32
- result = pipe(image)
33
- return result[0]["generated_text"]
34
 
35
- # ===== Gradio UI =====
36
- with gr.Blocks() as iface:
37
- gr.Markdown("## Qwen / Gemma / CLOVA Donut ํ…Œ์ŠคํŠธ")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
- with gr.Tab("ํ…์ŠคํŠธ URL ์š”์•ฝ/์žฌ์ž‘์„ฑ"):
40
- url_input = gr.Textbox(label="URL ์ž…๋ ฅ")
41
- model_choice = gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), value="Qwen2.5-1.5B-Instruct")
42
- output_text = gr.Textbox(label="์ถœ๋ ฅ")
43
- # ์—ฌ๊ธฐ์— ๊ธฐ์กด URL ์ฒ˜๋ฆฌ ํ•จ์ˆ˜ ์—ฐ๊ฒฐ
44
 
45
- with gr.Tab("CLOVA ์ด๋ฏธ์ง€ โ†’ ํ…์ŠคํŠธ"):
46
- image_input = gr.Image(type="pil", label="์ด๋ฏธ์ง€ ์—…๋กœ๋“œ")
47
- clova_output = gr.Textbox(label="์ธ์‹ ๊ฒฐ๊ณผ")
48
- image_input.change(process_image_with_clova, inputs=image_input, outputs=clova_output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
- iface.launch()
 
 
1
+ import nltk
2
+ nltk.download("punkt")
3
+
4
  import gradio as gr
5
+ import trafilatura
6
+ import requests
7
+ from markdownify import markdownify as md
8
+ from sumy.parsers.plaintext import PlaintextParser
9
+ from sumy.nlp.tokenizers import Tokenizer
10
+ from sumy.summarizers.text_rank import TextRankSummarizer
11
+ import re
12
  import torch
13
+ from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoModelForVision2Seq
14
 
# ===== Available models (UI display name -> Hugging Face model id) =====
# Qwen is a text-generation LLM; the Donut id is special-cased by
# load_model(), which builds an image-to-text pipeline for it.
MODEL_OPTIONS = {
    "Qwen2.5-1.5B-Instruct": "Qwen/Qwen2.5-1.5B-Instruct",
    "CLOVA-Donut-CORDv2": "naver-clova-ix/donut-base-finetuned-cord-v2"
}
20
 
21
  # ===== ๋ชจ๋ธ ๋กœ๋“œ =====
22
  def load_model(model_name):
23
  if model_name == "naver-clova-ix/donut-base-finetuned-cord-v2":
 
24
  tokenizer = AutoTokenizer.from_pretrained(model_name)
25
  model = AutoModelForVision2Seq.from_pretrained(model_name)
26
  return pipeline("image-to-text", model=model, tokenizer=tokenizer)
27
  else:
 
28
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
29
  model = AutoModelForCausalLM.from_pretrained(
30
  model_name,
 
33
  ).to("cpu")
34
  return pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)
35
 
36
# ===== Text preprocessing =====
def clean_text(text: str) -> str:
    """Collapse every run of whitespace into a single space and trim the ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()
 
 
39
 
40
def remove_duplicates(sentences):
    """Strip each sentence and drop blanks and repeats, keeping first-seen order."""
    stripped = (s.strip() for s in sentences)
    # dict.fromkeys preserves insertion order, so the first occurrence wins.
    return list(dict.fromkeys(s for s in stripped if s))
48
+
49
# ===== Automatic summarization =====
def summarize_text(text):
    """Extractively summarize *text* with TextRank.

    The number of summary sentences scales with the cleaned text length.
    Tokenization tries Korean first, then English; if neither yields any
    sentences (or raises), a naive regex sentence split is the last resort.
    Returns a list of sentences restored to original document order.
    """
    text = clean_text(text)
    length = len(text)
    # Longer inputs get proportionally more summary sentences.
    if length < 300:
        sentence_count = 1
    elif length < 800:
        sentence_count = 2
    elif length < 1500:
        sentence_count = 3
    else:
        sentence_count = 4

    # FIX: the original used bare `except:` twice, which also swallows
    # KeyboardInterrupt/SystemExit; narrowed to Exception and collapsed the
    # duplicated try/except ladder into a tokenizer-fallback loop.
    parser = None
    for language in ("korean", "english"):
        try:
            candidate = PlaintextParser.from_string(text, Tokenizer(language))
            if len(candidate.document.sentences) > 0:
                parser = candidate
                break
        except Exception:
            continue

    if parser is None:
        # Fallback: split on sentence-final punctuation followed by whitespace.
        sentences = re.split(r'(?<=[.!?])\s+', text)
        return sentences[:sentence_count]

    summarizer = TextRankSummarizer()
    summary_sentences = summarizer(parser.document, sentence_count)
    summary_list = [str(sentence) for sentence in summary_sentences]
    summary_list = remove_duplicates(summary_list)
    # Restore document order; str.find returns -1 for sentences whose stripped
    # form no longer matches the text exactly, which sorts those first.
    summary_list.sort(key=lambda s: text.find(s))
    return summary_list
81
+
82
# ===== LLM rewriting =====
def rewrite_with_llm(sentences, model_choice):
    """Rewrite the summary *sentences* more fluently with the selected model.

    CLOVA Donut is an image-to-text model with no text-rewriting ability, so
    its branch returns the joined input unchanged; otherwise the sentences are
    wrapped in a Korean rewrite prompt and passed to the text-generation
    pipeline produced by load_model().
    """
    model_name = MODEL_OPTIONS[model_choice]
    llm_pipeline = load_model(model_name)

    joined_text = "\n".join(sentences)

    if model_choice == "CLOVA-Donut-CORDv2":
        # Donut is image-only; pass the text through untouched.
        return joined_text

    prompt = f"""๋‹ค์Œ ๋ฌธ์žฅ์„ ์˜๋ฏธ๋Š” ์œ ์ง€ํ•˜๋˜, ์›๋ฌธ์— ์—†๋Š” ๋‚ด์šฉ์€ ์ ˆ๋Œ€ ์ถ”๊ฐ€ํ•˜์ง€ ๋ง๊ณ ,
๋ฌธ์žฅ๋งŒ ๋” ์ž์—ฐ์Šค๋Ÿฝ๊ฒŒ ๋ฐ”๊ฟ”์ฃผ์„ธ์š”. ๋‹ค๋ฅธ ์„ค๋ช…์ด๋‚˜ ๋ถ€์—ฐ ๋ฌธ์žฅ์€ ์“ฐ์ง€ ๋งˆ์„ธ์š”.

๋ฌธ์žฅ:
{joined_text}
"""
    # FIX: the original passed temperature=0 alongside do_sample=False — a
    # contradictory pair that transformers' generation-config validation
    # warns about (temperature is ignored when sampling is off), so the dead
    # argument is dropped. Greedy decoding behavior is unchanged.
    result = llm_pipeline(prompt, max_new_tokens=150, do_sample=False)
    # text-generation pipelines echo the prompt; strip it from the output.
    return result[0]["generated_text"].replace(prompt, "").strip()
102
+
103
# ===== Full pipeline =====
def extract_summarize_paraphrase(url, model_choice):
    """Fetch *url*, extract its main text, summarize it, then rewrite via LLM.

    Returns a (markdown body, summary, paraphrase) triple; any failure is
    reported through the three outputs instead of raising.
    """
    try:
        r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=10)
        r.raise_for_status()

        # Prefer trafilatura's main-content extraction; fall back to the raw
        # page HTML when it finds nothing.
        html_content = trafilatura.extract(
            r.text,
            output_format="html",
            include_tables=False,
            favor_recall=True
        )
        markdown_text = md(html_content or r.text, heading_style="ATX")

        summary_sentences = summarize_text(markdown_text) or ["์š”์•ฝ ์—†์Œ"]
        paraphrased_text = rewrite_with_llm(summary_sentences, model_choice)

        return (
            markdown_text or "๋ณธ๋ฌธ ์—†์Œ",
            "\n".join(summary_sentences),
            paraphrased_text
        )

    except Exception as e:
        # Surface failures as UI text rather than crashing the Gradio app.
        return f"์—๋Ÿฌ ๋ฐœ์ƒ: {e}", "์š”์•ฝ ์—†์Œ", "์žฌ์ž‘์„ฑ ์—†์Œ"
136
+
137
# ===== Gradio UI =====
# Single-page interface: URL + model choice in; extracted body, summary, and
# LLM paraphrase out. Labels/title are Korean runtime strings and are kept
# byte-identical.
iface = gr.Interface(
    fn=extract_summarize_paraphrase,
    inputs=[
        gr.Textbox(label="URL ์ž…๋ ฅ", placeholder="https://example.com"),
        gr.Dropdown(choices=list(MODEL_OPTIONS.keys()), value="Qwen2.5-1.5B-Instruct", label="์žฌ์ž‘์„ฑ ๋ชจ๋ธ ์„ ํƒ")
    ],
    outputs=[
        gr.Markdown(label="์ถ”์ถœ๋œ ๋ณธ๋ฌธ"),
        gr.Textbox(label="์ž๋™ ์š”์•ฝ", lines=5),
        gr.Textbox(label="์ž๋™ ์žฌ์ž‘์„ฑ (LLM)", lines=5)
    ],
    title="ํ•œ๊ตญ์–ด ๋ณธ๋ฌธ ์ถ”์ถœ + ์ž๋™ ์š”์•ฝ + LLM ์žฌ์ž‘์„ฑ",
    description="Qwen 1.5B ๋˜๋Š” CLOVA Donut(CORDv2)๋กœ ์žฌ์ž‘์„ฑ"
)

# Launch the web app only when run as a script (not when imported).
if __name__ == "__main__":
    iface.launch()