sdfdsfads23333
Browse files
app.py
CHANGED
|
@@ -6,34 +6,31 @@ import gradio as gr
|
|
| 6 |
import spaces
|
| 7 |
import torch
|
| 8 |
from PIL import Image
|
| 9 |
-
from transformers import
|
| 10 |
|
| 11 |
-
# Stage 1: OCR 모델 (문서에서 텍스트 추출)
|
| 12 |
-
OCR_MODEL_ID = "
|
| 13 |
|
| 14 |
# Stage 2: LLM 모델 (텍스트에서 약 이름 추출)
|
| 15 |
LLM_MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
|
| 16 |
|
| 17 |
|
| 18 |
def _load_ocr_model():
|
| 19 |
-
"""
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
model = AutoModel.from_pretrained(
|
| 23 |
-
OCR_MODEL_ID,
|
| 24 |
-
trust_remote_code=True
|
| 25 |
-
).to(device)
|
| 26 |
-
|
| 27 |
-
processor = AutoProcessor.from_pretrained(
|
| 28 |
OCR_MODEL_ID,
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
| 30 |
)
|
| 31 |
|
|
|
|
| 32 |
return model, processor
|
| 33 |
|
| 34 |
|
| 35 |
def _load_llm_model():
|
| 36 |
-
"""
|
| 37 |
model = AutoModelForCausalLM.from_pretrained(
|
| 38 |
LLM_MODEL_ID,
|
| 39 |
device_map="auto",
|
|
@@ -46,7 +43,7 @@ def _load_llm_model():
|
|
| 46 |
return model, tokenizer
|
| 47 |
|
| 48 |
|
| 49 |
-
print("๐ Loading
|
| 50 |
OCR_MODEL, OCR_PROCESSOR = _load_ocr_model()
|
| 51 |
print("โ
OCR model loaded!")
|
| 52 |
|
|
@@ -73,15 +70,39 @@ def _extract_json_block(text: str) -> Optional[str]:
|
|
| 73 |
|
| 74 |
|
| 75 |
def extract_text_from_image(image: Image.Image) -> str:
|
| 76 |
-
"""Stage 1:
|
| 77 |
try:
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
with torch.no_grad():
|
| 81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
-
extracted_text
|
| 84 |
-
return extracted_text.strip()
|
| 85 |
|
| 86 |
except Exception as e:
|
| 87 |
raise Exception(f"OCR ์ค๋ฅ: {str(e)}")
|
|
@@ -308,7 +329,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
|
|
| 308 |
---
|
| 309 |
|
| 310 |
**ℹ️ 2단계 파이프라인**
|
| 311 |
-
- **Stage 1**:
|
| 312 |
- **Stage 2**: Qwen2.5 7B (LLM) - 추출된 텍스트에서 약 이름만 식별
|
| 313 |
|
| 314 |
실제 복약은 의사·약사의 지시를 따르세요.
|
|
|
|
| 6 |
import spaces
|
| 7 |
import torch
|
| 8 |
from PIL import Image
|
| 9 |
+
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoTokenizer, AutoModelForCausalLM
|
| 10 |
|
| 11 |
+
# Stage 1: OCR ๋ชจ๋ธ (Qwen2-VL๋ก ๋ฌธ์์์ ํ
์คํธ ์ถ์ถ)
|
| 12 |
+
OCR_MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"
|
| 13 |
|
| 14 |
# Stage 2: LLM 모델 (텍스트에서 약 이름 추출)
|
| 15 |
LLM_MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
|
| 16 |
|
| 17 |
|
| 18 |
def _load_ocr_model():
    """Load the Qwen2-VL model and its processor for the OCR stage.

    The weights identified by ``OCR_MODEL_ID`` are loaded with 8-bit
    quantization, ``torch.float16`` for the non-quantized parts, and
    ``device_map="auto"`` placement.

    Returns:
        A ``(model, processor)`` tuple.
    """
    # Imported locally so this block does not depend on the file-level
    # ``from transformers import ...`` line listing this name.
    from transformers import BitsAndBytesConfig

    model = Qwen2VLForConditionalGeneration.from_pretrained(
        OCR_MODEL_ID,
        device_map="auto",
        # Passing ``load_in_8bit=True`` straight to ``from_pretrained`` is
        # deprecated; the quantization settings belong in a config object.
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        torch_dtype=torch.float16,
        trust_remote_code=True,
    )

    processor = AutoProcessor.from_pretrained(OCR_MODEL_ID, trust_remote_code=True)
    return model, processor
|
| 30 |
|
| 31 |
|
| 32 |
def _load_llm_model():
|
| 33 |
+
"""Qwen2.5 7B ๋ชจ๋ธ ๋ก๋ (8bit ์์ํ)"""
|
| 34 |
model = AutoModelForCausalLM.from_pretrained(
|
| 35 |
LLM_MODEL_ID,
|
| 36 |
device_map="auto",
|
|
|
|
| 43 |
return model, tokenizer
|
| 44 |
|
| 45 |
|
| 46 |
+
print("๐ Loading Qwen2-VL OCR model...")
|
| 47 |
OCR_MODEL, OCR_PROCESSOR = _load_ocr_model()
|
| 48 |
print("โ
OCR model loaded!")
|
| 49 |
|
|
|
|
| 70 |
|
| 71 |
|
| 72 |
def extract_text_from_image(image: Image.Image) -> str:
    """Stage 1: extract the text in an image with Qwen2-VL (OCR).

    A single-turn chat prompt asking for a verbatim transcription is built,
    run through ``OCR_MODEL`` with greedy decoding, and only the newly
    generated tokens are decoded and returned.

    Args:
        image: The PIL image to transcribe.

    Returns:
        The model's reply with special tokens removed and surrounding
        whitespace stripped.

    Raises:
        Exception: Any failure is re-raised with an "OCR 오류" (OCR error)
            prefix, chained to the original exception.
    """
    try:
        # Prompt (Korean): "Extract all text in this image accurately and
        # output it as-is. Output only the OCR result."
        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "이 이미지의 모든 텍스트를 정확히 추출해서 그대로 출력해주세요. OCR 결과만 출력하세요.",
                    },
                    {"type": "image"},
                ],
            }
        ]

        chat_text = OCR_PROCESSOR.apply_chat_template(messages, add_generation_prompt=True)
        inputs = OCR_PROCESSOR(
            text=[chat_text], images=[image], return_tensors="pt"
        ).to(OCR_MODEL.device)

        with torch.no_grad():
            # Greedy decoding for deterministic output. ``temperature`` is
            # deliberately not passed: it is ignored when sampling is off
            # and only produces a generation warning.
            output_ids = OCR_MODEL.generate(
                **inputs,
                max_new_tokens=1024,
                do_sample=False,
            )

        # ``generate`` returns prompt + completion; keep only the completion
        # so the reply does not have to be recovered by splitting on chat
        # markers that the transcribed text itself might contain.
        prompt_length = inputs["input_ids"].shape[1]
        new_token_ids = output_ids[:, prompt_length:]
        decoded = OCR_PROCESSOR.batch_decode(new_token_ids, skip_special_tokens=True)[0]

        return decoded.strip()

    except Exception as e:
        # Same exception type as before so existing callers keep working;
        # ``from e`` preserves the original traceback.
        raise Exception(f"OCR 오류: {str(e)}") from e
|
|
|
|
| 329 |
---
|
| 330 |
|
| 331 |
**ℹ️ 2단계 파이프라인**
|
| 332 |
+
- **Stage 1**: Qwen2-VL 7B (OCR) - 이미지에서 모든 텍스트 추출
|
| 333 |
- **Stage 2**: Qwen2.5 7B (LLM) - 추출된 텍스트에서 약 이름만 식별
|
| 334 |
|
| 335 |
실제 복약은 의사·약사의 지시를 따르세요.
|