Spaces:

LLDDWW
/

MedCard

Sleeping

LLDDWW Claude committed on Oct 1, 2025

Commit

6a2327c

1 Parent(s): e94e117

chore: switch OCR model to TrOCR

- Replace Qwen2-VL with microsoft/trocr-large-printed for OCR
- Update model loading and inference code for TrOCR architecture
- Simplify OCR processing logic

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (1) hide show

app.py +15 -40

app.py CHANGED Viewed

@@ -6,26 +6,24 @@ import gradio as gr
 import spaces
 import torch
 from PIL import Image
-from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoTokenizer, AutoModelForCausalLM
-# Stage 1: OCR 모델 (Qwen2-VL로 문서에서 텍스트 추출)
-OCR_MODEL_ID = "Qwen/Qwen2-VL-7B-Instruct"
 # Stage 2: LLM 모델 (텍스트에서 약 이름 추출)
 LLM_MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
 def _load_ocr_model():
-    """Qwen2-VL OCR 모델 로드"""
-    model = Qwen2VLForConditionalGeneration.from_pretrained(
         OCR_MODEL_ID,
         device_map="auto",
-        load_in_8bit=True,
         torch_dtype=torch.float16,
-        trust_remote_code=True,
     )
-    processor = AutoProcessor.from_pretrained(OCR_MODEL_ID, trust_remote_code=True)
     return model, processor
@@ -43,9 +41,9 @@ def _load_llm_model():
     return model, tokenizer
-print("🔄 Loading Qwen2-VL OCR model...")
 OCR_MODEL, OCR_PROCESSOR = _load_ocr_model()
-print("✅ OCR model loaded!")
 print("🔄 Loading Qwen2.5-7B-Instruct...")
 LLM_MODEL, LLM_TOKENIZER = _load_llm_model()
@@ -70,39 +68,16 @@ def _extract_json_block(text: str) -> Optional[str]:
 def extract_text_from_image(image: Image.Image) -> str:
-    """Stage 1: Qwen2-VL로 이미지에서 텍스트 추출 (OCR)"""
     try:
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "이 이미지의 모든 텍스트를 정확히 추출해서 그대로 출력해주세요. OCR 결과만 출력하세요."},
-                    {"type": "image"},
-                ],
-            }
-        ]
-        chat_text = OCR_PROCESSOR.apply_chat_template(messages, add_generation_prompt=True)
-        inputs = OCR_PROCESSOR(text=[chat_text], images=[image], return_tensors="pt").to(OCR_MODEL.device)
         with torch.no_grad():
-            output_ids = OCR_MODEL.generate(
-                **inputs,
-                max_new_tokens=1024,
-                temperature=0.1,  # 정확한 OCR을 위해 낮은 temperature
-                do_sample=False,  # 결정적 출력
-            )
-        output_text = OCR_PROCESSOR.batch_decode(output_ids, skip_special_tokens=False)[0]
-        # Extract assistant response
-        if "<|im_start|>assistant" in output_text:
-            extracted_text = output_text.split("<|im_start|>assistant")[-1]
-            extracted_text = extracted_text.replace("<|im_end|>", "").strip()
-        else:
-            extracted_text = output_text.strip()
-        return extracted_text
     except Exception as e:
         raise Exception(f"OCR 오류: {str(e)}")
@@ -329,7 +304,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
     ---
     **ℹ️ 2단계 파이프라인**
-    - **Stage 1**: Qwen2-VL 7B (OCR) - 이미지에서 모든 텍스트 추출
     - **Stage 2**: Qwen2.5 7B (LLM) - 추출된 텍스트에서 약 이름만 식별
     실제 복약은 의사·약사의 지시를 따르세요.

 import spaces
 import torch
 from PIL import Image
+from transformers import VisionEncoderDecoderModel, TrOCRProcessor, AutoTokenizer, AutoModelForCausalLM
+# Stage 1: OCR 모델 (TrOCR로 문서에서 텍스트 추출)
+OCR_MODEL_ID = "microsoft/trocr-large-printed"
 # Stage 2: LLM 모델 (텍스트에서 약 이름 추출)
 LLM_MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
 def _load_ocr_model():
+    """TrOCR 모델 로드"""
+    model = VisionEncoderDecoderModel.from_pretrained(
         OCR_MODEL_ID,
         device_map="auto",
         torch_dtype=torch.float16,
     )
+    processor = TrOCRProcessor.from_pretrained(OCR_MODEL_ID)
     return model, processor
     return model, tokenizer
+print("🔄 Loading TrOCR model...")
 OCR_MODEL, OCR_PROCESSOR = _load_ocr_model()
+print("✅ TrOCR model loaded!")
 print("🔄 Loading Qwen2.5-7B-Instruct...")
 LLM_MODEL, LLM_TOKENIZER = _load_llm_model()
 def extract_text_from_image(image: Image.Image) -> str:
+    """Stage 1: TrOCR로 이미지에서 텍스트 추출 (OCR)"""
     try:
+        # TrOCR은 이미지 전체를 한 번에 처리
+        pixel_values = OCR_PROCESSOR(image, return_tensors="pt").pixel_values.to(OCR_MODEL.device)
         with torch.no_grad():
+            generated_ids = OCR_MODEL.generate(pixel_values)
+        extracted_text = OCR_PROCESSOR.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        return extracted_text.strip()
     except Exception as e:
         raise Exception(f"OCR 오류: {str(e)}")
     ---
     **ℹ️ 2단계 파이프라인**
+    - **Stage 1**: TrOCR (OCR) - 이미지에서 모든 텍스트 추출
     - **Stage 2**: Qwen2.5 7B (LLM) - 추출된 텍스트에서 약 이름만 식별
     실제 복약은 의사·약사의 지시를 따르세요.