LLDDWW committed on
Commit
92bb45b
·
1 Parent(s): 64c57fb

sdfdsfads23333

Browse files
Files changed (1) hide show
  1. app.py +24 -20
app.py CHANGED
@@ -12,7 +12,7 @@ from transformers import AutoModel, AutoProcessor, AutoTokenizer, AutoModelForCa
12
  OCR_MODEL_ID = "ibm-granite/granite-docling-258M"
13
 
14
  # Stage 2: LLM 모델 (텍스트에서 약 이름 추출)
15
- LLM_MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
16
 
17
 
18
  def _load_ocr_model():
@@ -50,7 +50,7 @@ print("🔄 Loading Granite Docling OCR model...")
50
  OCR_MODEL, OCR_PROCESSOR = _load_ocr_model()
51
  print("✅ OCR model loaded!")
52
 
53
- print("🔄 Loading Llama-3.1-8B-Instruct...")
54
  LLM_MODEL, LLM_TOKENIZER = _load_llm_model()
55
  print("✅ LLM model loaded!")
56
 
@@ -88,21 +88,24 @@ def extract_text_from_image(image: Image.Image) -> str:
88
 
89
 
90
  def extract_medications_from_text(text: str) -> List[str]:
91
- """Stage 2: Llama 3.1로 텍스트에서 약 이름만 추출"""
92
  try:
93
- prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
94
-
95
- You are a medical text analyzer. Extract only medication names from the given text and return them as a JSON array.
96
- Return ONLY valid JSON format: {{"medications": ["name1", "name2"]}}
97
- <|eot_id|><|start_header_id|>user<|end_header_id|>
98
-
99
- Extract all medication names from this text:
100
-
101
- {text}
102
-
103
- Return only the JSON array of medication names.<|eot_id|><|start_header_id|>assistant<|end_header_id|>
104
-
105
- """
 
 
 
106
 
107
  inputs = LLM_TOKENIZER(prompt, return_tensors="pt").to(LLM_MODEL.device)
108
 
@@ -118,9 +121,10 @@ Return only the JSON array of medication names.<|eot_id|><|start_header_id|>assi
118
 
119
  response = LLM_TOKENIZER.decode(outputs[0], skip_special_tokens=True)
120
 
121
- # Extract assistant response
122
- if "<|start_header_id|>assistant<|end_header_id|>" in response:
123
- response = response.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
 
124
 
125
  # Parse JSON
126
  json_match = re.search(r'\{.*?\}', response, re.DOTALL)
@@ -305,7 +309,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
305
 
306
  **ℹ️ 2단계 파이프라인**
307
  - **Stage 1**: Granite Docling (OCR) - 이미지에서 모든 텍스트 추출
308
- - **Stage 2**: Llama 3.1 8B (LLM) - 추출된 텍스트에서 약 이름만 식별
309
 
310
  실제 복약은 의사·약사의 지시를 따르세요.
311
  """)
 
12
  OCR_MODEL_ID = "ibm-granite/granite-docling-258M"
13
 
14
  # Stage 2: LLM 모델 (텍스트에서 약 이름 추출)
15
+ LLM_MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
16
 
17
 
18
  def _load_ocr_model():
 
50
  OCR_MODEL, OCR_PROCESSOR = _load_ocr_model()
51
  print("✅ OCR model loaded!")
52
 
53
+ print("🔄 Loading Qwen2.5-7B-Instruct...")
54
  LLM_MODEL, LLM_TOKENIZER = _load_llm_model()
55
  print("✅ LLM model loaded!")
56
 
 
88
 
89
 
90
  def extract_medications_from_text(text: str) -> List[str]:
91
+ """Stage 2: Qwen2.5로 텍스트에서 약 이름만 추출"""
92
  try:
93
+ messages = [
94
+ {
95
+ "role": "system",
96
+ "content": "You are a medical text analyzer. Extract only medication names from the given text and return them as a JSON array. Return ONLY valid JSON format."
97
+ },
98
+ {
99
+ "role": "user",
100
+ "content": f"Extract all medication names from this text:\n\n{text}\n\nReturn format: {{\"medications\": [\"name1\", \"name2\"]}}"
101
+ }
102
+ ]
103
+
104
+ prompt = LLM_TOKENIZER.apply_chat_template(
105
+ messages,
106
+ tokenize=False,
107
+ add_generation_prompt=True
108
+ )
109
 
110
  inputs = LLM_TOKENIZER(prompt, return_tensors="pt").to(LLM_MODEL.device)
111
 
 
121
 
122
  response = LLM_TOKENIZER.decode(outputs[0], skip_special_tokens=True)
123
 
124
+ # Extract assistant response (Qwen format)
125
+ if "<|im_start|>assistant" in response:
126
+ response = response.split("<|im_start|>assistant")[-1]
127
+ response = response.replace("<|im_end|>", "").strip()
128
 
129
  # Parse JSON
130
  json_match = re.search(r'\{.*?\}', response, re.DOTALL)
 
309
 
310
  **ℹ️ 2단계 파이프라인**
311
  - **Stage 1**: Granite Docling (OCR) - 이미지에서 모든 텍스트 추출
312
+ - **Stage 2**: Qwen2.5 7B (LLM) - 추출된 텍스트에서 약 이름만 식별
313
 
314
  실제 복약은 의사·약사의 지시를 따르세요.
315
  """)