LLDDWW committed on
Commit
e53f54d
·
1 Parent(s): dbf7d32

feat: add qwen vl narratives and cartoon generation

Browse files
Files changed (2) hide show
  1. app.py +169 -73
  2. requirements.txt +2 -1
app.py CHANGED
@@ -3,12 +3,20 @@ import re
3
  from typing import Any, Dict, List, Optional
4
 
5
  import gradio as gr
6
- import torch
7
  import spaces
 
 
8
  from PIL import Image, ImageDraw
9
- from transformers import AutoModelForVision2Seq, AutoProcessor
 
 
 
 
 
10
 
11
  VL_MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
 
 
12
 
13
 
14
  def _load_vl_model():
@@ -29,6 +37,39 @@ def _load_vl_model():
29
  VL_MODEL, VL_PROCESSOR = _load_vl_model()
30
 
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  def _extract_assistant_content(decoded: str) -> str:
33
  if "<|im_start|>assistant" in decoded:
34
  content = decoded.split("<|im_start|>assistant")[-1]
@@ -44,40 +85,34 @@ def _extract_json_block(text: str) -> Optional[str]:
44
  return match.group(0)
45
 
46
 
47
- def _sanitize_medication(item: Dict[str, Any]) -> Dict[str, Any]:
48
- def _as_str(value: Any) -> str:
49
- if value is None:
50
- return ""
51
- return str(value).strip()
 
52
 
53
- name = _as_str(item.get("name"))
54
- dose = _as_str(item.get("dose_per_intake"))
 
 
55
 
56
  times = item.get("times_per_day")
57
  if isinstance(times, (int, float)):
58
  times_str = str(int(times)) if float(times).is_integer() else str(times)
59
  else:
60
- times_str = _as_str(times)
61
-
62
- time_slots_raw = item.get("time_slots")
63
- if isinstance(time_slots_raw, (list, tuple)):
64
- time_slots = [str(t).strip() for t in time_slots_raw if str(t).strip()]
65
- elif isinstance(time_slots_raw, str):
66
- slots = [s.strip() for s in re.split(r"[,;]\s*", time_slots_raw) if s.strip()]
67
- time_slots = slots
68
- else:
69
- time_slots = []
70
 
71
  return {
72
- "name": name,
73
- "dose_per_intake": dose,
74
  "times_per_day": times_str,
75
- "time_slots": time_slots,
76
- "description": _as_str(item.get("description")),
77
- "usage_example": _as_str(item.get("usage_example")),
78
- "dosage_example": _as_str(item.get("dosage_example")),
79
- "side_effects": _as_str(item.get("side_effects")),
80
- "warnings": _as_str(item.get("warnings")),
81
  }
82
 
83
 
@@ -87,7 +122,7 @@ def _parse_vl_response(text: str) -> Dict[str, Any]:
87
  return {
88
  "raw_text": "",
89
  "medications": [],
90
- "warnings": ["LLM ์‘๋‹ต์—์„œ JSON์„ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค.", text.strip()],
91
  }
92
  try:
93
  data = json.loads(json_block)
@@ -95,11 +130,9 @@ def _parse_vl_response(text: str) -> Dict[str, Any]:
95
  return {
96
  "raw_text": "",
97
  "medications": [],
98
- "warnings": ["LLM JSON ํŒŒ์‹ฑ ์‹คํŒจ", text.strip()],
99
  }
100
 
101
- raw_text = str(data.get("raw_text", "")).strip()
102
-
103
  meds_raw = data.get("medications") or []
104
  medications: List[Dict[str, Any]] = []
105
  if isinstance(meds_raw, list):
@@ -116,7 +149,7 @@ def _parse_vl_response(text: str) -> Dict[str, Any]:
116
  warnings = []
117
 
118
  return {
119
- "raw_text": raw_text,
120
  "medications": medications,
121
  "warnings": warnings,
122
  }
@@ -135,27 +168,26 @@ def analyze_image_with_qwen(image: Image.Image) -> Dict[str, Any]:
135
  " {\n"
136
  " \"name\": \"์•ฝ ์ด๋ฆ„\",\n"
137
  " \"dose_per_intake\": \"1ํšŒ ์šฉ๋Ÿ‰ (์˜ˆ: 1์ •, 5mL)\",\n"
138
- " \"times_per_day\": \"ํ•˜๋ฃจ ๋ณต์šฉ ํšŸ์ˆ˜ (๋ชจ๋ฅด๋ฉด ๋นˆ ๋ฌธ์ž์—ด)\",\n"
139
  " \"time_slots\": [\"๋ณต์šฉ ์‹œ๊ฐ„๋Œ€\"],\n"
140
- " \"description\": \"์–ด๋–ค ์•ฝ์ธ์ง€ ํ•œ ์ค„ ์„ค๋ช…\",\n"
141
- " \"usage_example\": \"์–ธ์ œ ๋ณต์šฉํ•˜๋ฉด ์ข‹์€์ง€ ์˜ˆ์‹œ\",\n"
142
- " \"dosage_example\": \"๋ณต์šฉ ๋ฐฉ๋ฒ• ์˜ˆ์‹œ(์˜ˆ: ์‹ํ›„ 30๋ถ„, 1ํšŒ 1์ •)\",\n"
143
- " \"side_effects\": \"์ฃผ์š” ๋ถ€์ž‘์šฉ ๋˜๋Š” ์ฃผ์˜์‚ฌํ•ญ\",\n"
144
- " \"warnings\": \"์ถ”๊ฐ€ ์ฃผ์˜ ๋ฌธ๊ตฌ\"\n"
145
  " }\n"
146
  " ],\n"
147
- " \"warnings\": [\"์ „์ฒด์ ์ธ ๊ฒฝ๊ณ  ๋ฌธ๊ตฌ\"]\n"
148
  "}"
149
  )
150
  user_prompt = (
151
- "์œ„ JSON ์Šคํ‚ค๋งˆ๋ฅผ ๊ทธ๋Œ€๋กœ ๋”ฐ๋ฅด์„ธ์š”. ๋นˆ ๊ฐ’์€ ๋นˆ ๋ฌธ์ž์—ด๋กœ ๋‘ก๋‹ˆ๋‹ค. "
152
- "๋ชจ๋“  ๊ฐ’์€ ํ•œ๊ตญ์–ด๋กœ ์ž‘์„ฑํ•˜๊ณ , ์ค‘ํ•™์ƒ๋„ ์ดํ•ดํ•  ์ˆ˜ ์žˆ๋Š” ๋งํˆฌ๋กœ ์„ค๋ช…ํ•˜์„ธ์š”."
153
  )
154
 
155
  messages = [
156
  {
157
  "role": "system",
158
- "content": "๋‹น์‹ ์€ ์•ฝ์‚ฌ ์„ ์ƒ๋‹˜์œผ๋กœ์„œ ์•ฝ๋ด‰ํˆฌ ์ด๋ฏธ์ง€๋ฅผ ํ•ด์„ํ•˜๊ณ  ์นœ์ ˆํ•˜๊ฒŒ ์„ค๋ช…ํ•ฉ๋‹ˆ๋‹ค.",
159
  },
160
  {
161
  "role": "user",
@@ -169,11 +201,7 @@ def analyze_image_with_qwen(image: Image.Image) -> Dict[str, Any]:
169
  ]
170
 
171
  chat_text = VL_PROCESSOR.apply_chat_template(messages, add_generation_prompt=True)
172
- inputs = VL_PROCESSOR(
173
- text=[chat_text],
174
- images=[image],
175
- return_tensors="pt",
176
- ).to(VL_MODEL.device)
177
 
178
  output_ids = VL_MODEL.generate(
179
  **inputs,
@@ -188,6 +216,85 @@ def analyze_image_with_qwen(image: Image.Image) -> Dict[str, Any]:
188
  return _parse_vl_response(assistant_text)
189
 
190
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
  def render_card(primary: Dict[str, Any]) -> Image.Image:
192
  width, height = 720, 400
193
  canvas = Image.new("RGB", (width, height), "white")
@@ -231,28 +338,6 @@ def medications_to_csv(medications: List[Dict[str, Any]]) -> str:
231
  return ",".join(row)
232
 
233
 
234
- def build_markdown(medications: List[Dict[str, Any]]) -> str:
235
- if not medications:
236
- return "### ์•ฝ ์„ค๋ช…\n- ์•ฝ ์ •๋ณด๋ฅผ ์ธ์‹ํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. ์•ฝ์‚ฌ์—๊ฒŒ ์ง์ ‘ ํ™•์ธํ•ด ์ฃผ์„ธ์š”."
237
-
238
- lines: List[str] = ["### ์‰ฝ๊ฒŒ ์•Œ์•„๋ณด๋Š” ์•ฝ ์„ค๋ช…"]
239
- for med in medications:
240
- lines.append(f"- **{med.get('name') or '์ด๋ฆ„ ๋ฏธํ™•์ธ'}**")
241
- if med.get("description"):
242
- lines.append(f" - ํ•˜๋Š” ์ผ: {med['description']}")
243
- if med.get("usage_example"):
244
- lines.append(f" - ๋ณต์šฉ ์˜ˆ์‹œ: {med['usage_example']}")
245
- if med.get("dosage_example"):
246
- lines.append(f" - ๋ณต์šฉ ๋ฐฉ๋ฒ• ์˜ˆ์‹œ: {med['dosage_example']}")
247
- if med.get("side_effects"):
248
- lines.append(f" - ๋ถ€์ž‘์šฉ/์ฃผ์˜: {med['side_effects']}")
249
- if med.get("warnings"):
250
- lines.append(f" - ์ถ”๊ฐ€ ์ฃผ์˜: {med['warnings']}")
251
-
252
- lines.append("\n> โš ๏ธ ์‹ค์ œ ๋ณต์•ฝ์€ ์˜์‚ฌยท์•ฝ์‚ฌ์˜ ์ง€์‹œ์— ๋ฐ˜๋“œ์‹œ ๋”ฐ๋ฅด์„ธ์š”.")
253
- return "\n".join(lines)
254
-
255
-
256
  def format_warnings(warnings: List[str]) -> str:
257
  if not warnings:
258
  return "โœ… ์ธ์‹๋œ ์ •๋ณด๊ฐ€ ์ถฉ๋ถ„ํ•ด์š”. ๋ณต์•ฝ ์‹œ๊ฐ„๋งŒ ์ž˜ ์ง€์ผœ ์ฃผ์„ธ์š”."
@@ -272,6 +357,7 @@ def run_pipeline(image: Optional[Image.Image]):
272
  "์ด๋ฏธ์ง€๋ฅผ ๋จผ์ € ์—…๋กœ๋“œํ•ด ์ฃผ์„ธ์š”.",
273
  "๐Ÿ“ท ์•ฝ ๋ด‰ํˆฌ ์‚ฌ์ง„์„ ์˜ฌ๋ฆฌ๋ฉด ์ธ์‹์ด ์‹œ์ž‘๋ผ์š”.",
274
  "",
 
275
  )
276
 
277
  result = analyze_image_with_qwen(image)
@@ -284,14 +370,23 @@ def run_pipeline(image: Optional[Image.Image]):
284
  "time_slots": [],
285
  }
286
 
 
 
287
  card_img = render_card(primary)
288
  csv_row = medications_to_csv(medications)
289
- markdown = build_markdown(medications)
 
 
 
 
 
 
290
  warnings_md = format_warnings(result.get("warnings", []))
291
  raw_text = result.get("raw_text", "")
292
  json_text = json.dumps(result, ensure_ascii=False, indent=2)
 
293
 
294
- return json_text, card_img, csv_row, markdown, warnings_md, raw_text
295
 
296
 
297
  CUSTOM_CSS = """
@@ -319,7 +414,7 @@ body {background: radial-gradient(circle at top left, #f5f0ff 0%, #fff7ec 60%, #
319
  HERO_HTML = """
320
  <div class="hero">
321
  <h1>MedCard-KR ยท ์•ฝ๋ด‰ํˆฌ ํ•œ ์ปท์œผ๋กœ ์ดํ•ดํ•˜๋Š” ๋ณต์šฉ ์•ˆ๋‚ด</h1>
322
- <p>Qwen2.5-VL์ด ์‚ฌ์ง„ ์† ๊ธ€์ž๋ฅผ ์ง์ ‘ ์ฝ๊ณ , ์•ฝ ์„ค๋ช…ยท๋ณต์šฉ ์˜ˆ์‹œยท๋ถ€์ž‘์šฉ๊นŒ์ง€ ํ•œ ๋ฒˆ์— ์ •๋ฆฌํ•ด ๋“œ๋ฆฝ๋‹ˆ๋‹ค.</p>
323
  </div>
324
  """
325
 
@@ -336,6 +431,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
336
  gr.Markdown("### 2. ๊ฒฐ๊ณผ๋ฅผ ํ™•์ธํ•˜์„ธ์š”")
337
  explain_md = gr.Markdown("์—ฌ๊ธฐ์— ์•ฝ ์„ค๋ช…์ด ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค.", elem_classes=["output-card"])
338
  raw_box = gr.Textbox(label="๋ชจ๋ธ์ด ์ฝ์€ ์›๋ฌธ ํ…์ŠคํŠธ", lines=5, interactive=False)
 
339
  card_out = gr.Image(type="pil", label="์ผ์ • ์นด๋“œ(๋ฏธ๋ฆฌ๋ณด๊ธฐ)")
340
  csv_box = gr.Textbox(label="CSV(์•ฝ๋ช…,1ํšŒ์šฉ๋Ÿ‰,1์ผํšŸ์ˆ˜,์‹œ๊ฐ„๋Œ€)", lines=2, elem_classes=["csv-box"])
341
  with gr.Accordion("์„ธ๋ถ€ JSON ๊ฒฐ๊ณผ", open=False, elem_classes=["accordion"]):
@@ -344,7 +440,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=CUSTOM_CSS) as demo:
344
  btn.click(
345
  run_pipeline,
346
  inputs=img_in,
347
- outputs=[json_out, card_out, csv_box, explain_md, warn_md, raw_box],
348
  )
349
 
350
  gr.Markdown(
 
3
  from typing import Any, Dict, List, Optional
4
 
5
  import gradio as gr
 
6
  import spaces
7
+ import torch
8
+ from diffusers import AutoPipelineForText2Image
9
  from PIL import Image, ImageDraw
10
+ from transformers import (
11
+ AutoModelForCausalLM,
12
+ AutoModelForVision2Seq,
13
+ AutoProcessor,
14
+ AutoTokenizer,
15
+ )
16
 
17
  VL_MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
18
+ TEXT_MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
19
+ IMAGE_MODEL_ID = "stabilityai/stable-diffusion-2-1"
20
 
21
 
22
  def _load_vl_model():
 
37
  VL_MODEL, VL_PROCESSOR = _load_vl_model()
38
 
39
 
40
+ def _load_text_model():
41
+ device_map = "auto" if torch.cuda.is_available() else None
42
+ dtype = torch.float16 if torch.cuda.is_available() else torch.float32
43
+ model = AutoModelForCausalLM.from_pretrained(
44
+ TEXT_MODEL_ID,
45
+ device_map=device_map,
46
+ torch_dtype=dtype,
47
+ trust_remote_code=True,
48
+ )
49
+ if device_map is None:
50
+ model = model.to(torch.device("cpu"))
51
+ tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL_ID, trust_remote_code=True)
52
+ return model, tokenizer
53
+
54
+
55
+ TEXT_MODEL, TEXT_TOKENIZER = _load_text_model()
56
+
57
+
58
+ def _load_image_pipeline():
59
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
60
+ dtype = torch.float16 if torch.cuda.is_available() else torch.float32
61
+ pipe = AutoPipelineForText2Image.from_pretrained(
62
+ IMAGE_MODEL_ID,
63
+ torch_dtype=dtype,
64
+ safety_checker=None,
65
+ )
66
+ pipe.to(device)
67
+ return pipe
68
+
69
+
70
+ IMAGE_PIPELINE = _load_image_pipeline()
71
+
72
+
73
  def _extract_assistant_content(decoded: str) -> str:
74
  if "<|im_start|>assistant" in decoded:
75
  content = decoded.split("<|im_start|>assistant")[-1]
 
85
  return match.group(0)
86
 
87
 
88
+ def _sanitize_list(value: Any) -> List[str]:
89
+ if isinstance(value, (list, tuple)):
90
+ return [str(v).strip() for v in value if str(v).strip()]
91
+ if isinstance(value, str):
92
+ return [v.strip() for v in re.split(r"[,;]", value) if v.strip()]
93
+ return []
94
 
95
+
96
+ def _sanitize_medication(item: Dict[str, Any]) -> Dict[str, Any]:
97
+ def _to_str(val: Any) -> str:
98
+ return "" if val is None else str(val).strip()
99
 
100
  times = item.get("times_per_day")
101
  if isinstance(times, (int, float)):
102
  times_str = str(int(times)) if float(times).is_integer() else str(times)
103
  else:
104
+ times_str = _to_str(times)
 
 
 
 
 
 
 
 
 
105
 
106
  return {
107
+ "name": _to_str(item.get("name")),
108
+ "dose_per_intake": _to_str(item.get("dose_per_intake")),
109
  "times_per_day": times_str,
110
+ "time_slots": _sanitize_list(item.get("time_slots")),
111
+ "description": _to_str(item.get("description")),
112
+ "usage_example": _to_str(item.get("usage_example")),
113
+ "dosage_example": _to_str(item.get("dosage_example")),
114
+ "side_effects": _to_str(item.get("side_effects")),
115
+ "warnings": _to_str(item.get("warnings")),
116
  }
117
 
118
 
 
122
  return {
123
  "raw_text": "",
124
  "medications": [],
125
+ "warnings": ["๋ชจ๋ธ ์‘๋‹ต์—์„œ JSON ํ˜•์‹์„ ์ฐพ์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค."] + ([text.strip()] if text.strip() else []),
126
  }
127
  try:
128
  data = json.loads(json_block)
 
130
  return {
131
  "raw_text": "",
132
  "medications": [],
133
+ "warnings": ["๋ชจ๋ธ JSON ํŒŒ์‹ฑ ์‹คํŒจ", text.strip()],
134
  }
135
 
 
 
136
  meds_raw = data.get("medications") or []
137
  medications: List[Dict[str, Any]] = []
138
  if isinstance(meds_raw, list):
 
149
  warnings = []
150
 
151
  return {
152
+ "raw_text": str(data.get("raw_text", "")).strip(),
153
  "medications": medications,
154
  "warnings": warnings,
155
  }
 
168
  " {\n"
169
  " \"name\": \"์•ฝ ์ด๋ฆ„\",\n"
170
  " \"dose_per_intake\": \"1ํšŒ ์šฉ๋Ÿ‰ (์˜ˆ: 1์ •, 5mL)\",\n"
171
+ " \"times_per_day\": \"ํ•˜๋ฃจ ๋ณต์šฉ ํšŸ์ˆ˜\",\n"
172
  " \"time_slots\": [\"๋ณต์šฉ ์‹œ๊ฐ„๋Œ€\"],\n"
173
+ " \"description\": \"์•ฝ ์„ค๋ช…\",\n"
174
+ " \"usage_example\": \"๋ณต์šฉ ์˜ˆ์‹œ\",\n"
175
+ " \"dosage_example\": \"๋ณต์šฉ ๋ฐฉ๋ฒ• ์˜ˆ์‹œ\",\n"
176
+ " \"side_effects\": \"์ฃผ์š” ๋ถ€์ž‘์šฉ\",\n"
177
+ " \"warnings\": \"์ฃผ์˜ ๋ฌธ๊ตฌ\"\n"
178
  " }\n"
179
  " ],\n"
180
+ " \"warnings\": [\"์ „์ฒด ๊ฒฝ๊ณ \"]\n"
181
  "}"
182
  )
183
  user_prompt = (
184
+ "์œ„ JSON ์Šคํ‚ค๋งˆ๋ฅผ ๋ฐ˜๋“œ์‹œ ๋”ฐ๋ฅด์„ธ์š”. ๋ชจ๋“  ๊ฐ’์€ ํ•œ๊ตญ์–ด๋กœ ์ž‘์„ฑํ•˜๊ณ , ๋นˆ ์ •๋ณด๋Š” ๋นˆ ๋ฌธ์ž์—ด๋กœ ๋‘์„ธ์š”."
 
185
  )
186
 
187
  messages = [
188
  {
189
  "role": "system",
190
+ "content": "๋‹น์‹ ์€ ์•ฝ์‚ฌ ์„ ์ƒ๋‹˜์ž…๋‹ˆ๋‹ค. ์ •ํ™•ํ•˜๊ณ  ์นœ์ ˆํ•˜๊ฒŒ ์ •๋ณด๋ฅผ ์ •๋ฆฌํ•˜์„ธ์š”.",
191
  },
192
  {
193
  "role": "user",
 
201
  ]
202
 
203
  chat_text = VL_PROCESSOR.apply_chat_template(messages, add_generation_prompt=True)
204
+ inputs = VL_PROCESSOR(text=[chat_text], images=[image], return_tensors="pt").to(VL_MODEL.device)
 
 
 
 
205
 
206
  output_ids = VL_MODEL.generate(
207
  **inputs,
 
216
  return _parse_vl_response(assistant_text)
217
 
218
 
219
+ @spaces.GPU(enable_queue=True)
220
+ def generate_explanations(raw_text: str, medications: List[Dict[str, Any]]) -> Dict[str, str]:
221
+ med_summary_lines = []
222
+ for med in medications:
223
+ summary = f"- {med.get('name', '์ด๋ฆ„ ๋ฏธํ™•์ธ')} {med.get('dose_per_intake', '')}"
224
+ med_summary_lines.append(summary.strip())
225
+ med_summary = "\n".join(med_summary_lines)
226
+
227
+ system_prompt = "์•ฝ์‚ฌ ์„ ์ƒ๋‹˜์ฒ˜๋Ÿผ ์–ด๋ฅด์‹ ๊ณผ ์–ด๋ฆฐ์ด์—๊ฒŒ ๊ฐ๊ฐ ์‰ฝ๊ฒŒ ์„ค๋ช…ํ•˜์„ธ์š”."
228
+ user_prompt = (
229
+ "๋‹ค์Œ์€ ์•ฝ ๋ด‰ํˆฌ์—์„œ ์ฝ์€ ์›๋ฌธ๊ณผ ์•ฝ ๋ชฉ๋ก์ž…๋‹ˆ๋‹ค. \n"
230
+ "JSON์œผ๋กœ ๋‹ต๋ณ€ํ•˜์„ธ์š”. ํ˜•์‹์€ {\"elderly\": {\"narrative\": ..., \"image_prompt\": ...}, \"child\": {\"narrative\": ..., \"image_prompt\": ...}} ์ž…๋‹ˆ๋‹ค.\n"
231
+ "narrative๋Š” ํ•œ๊ตญ์–ด, image_prompt๋Š” ์˜์–ด๋กœ ํ•œ ์ปท ๋งŒํ™” ์Šคํƒ€์ผ์„ ๋ฌ˜์‚ฌํ•˜์„ธ์š”.\n"
232
+ f"์•ฝ ๋ชฉ๋ก:\n{med_summary}\n\n์›๋ฌธ:\n{raw_text}\n"
233
+ )
234
+
235
+ messages = [
236
+ {"role": "system", "content": system_prompt},
237
+ {"role": "user", "content": user_prompt},
238
+ ]
239
+
240
+ input_ids = TEXT_TOKENIZER.apply_chat_template(
241
+ messages,
242
+ add_generation_prompt=True,
243
+ return_tensors="pt",
244
+ ).to(TEXT_MODEL.device)
245
+
246
+ with torch.no_grad():
247
+ output_ids = TEXT_MODEL.generate(
248
+ input_ids,
249
+ max_new_tokens=512,
250
+ temperature=0.3,
251
+ top_p=0.8,
252
+ )
253
+
254
+ generated_ids = output_ids[0][input_ids.shape[1]:]
255
+ text = TEXT_TOKENIZER.decode(generated_ids, skip_special_tokens=True).strip()
256
+
257
+ json_block = _extract_json_block(text)
258
+ if not json_block:
259
+ return {
260
+ "elderly_narrative": "์„ค๋ช…์„ ์ค€๋น„ํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. ์•ฝ์‚ฌ์—๊ฒŒ ์ง์ ‘ ๋ฌธ์˜ํ•˜์„ธ์š”.",
261
+ "child_narrative": "์„ค๋ช…์„ ์ค€๋น„ํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. ์•ฝ์‚ฌ์—๊ฒŒ ์ง์ ‘ ๋ฌธ์˜ํ•˜์„ธ์š”.",
262
+ "image_prompt": "single panel cartoon pharmacist helping family, soft colors",
263
+ }
264
+
265
+ try:
266
+ data = json.loads(json_block)
267
+ except json.JSONDecodeError:
268
+ return {
269
+ "elderly_narrative": "์„ค๋ช…์„ ์ค€๋น„ํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. ์•ฝ์‚ฌ์—๊ฒŒ ์ง์ ‘ ๋ฌธ์˜ํ•˜์„ธ์š”.",
270
+ "child_narrative": "์„ค๋ช…์„ ์ค€๋น„ํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค. ์•ฝ์‚ฌ์—๊ฒŒ ์ง์ ‘ ๋ฌธ์˜ํ•˜์„ธ์š”.",
271
+ "image_prompt": "single panel cartoon pharmacist helping family, soft colors",
272
+ }
273
+
274
+ elderly = data.get("elderly", {})
275
+ child = data.get("child", {})
276
+
277
+ return {
278
+ "elderly_narrative": str(elderly.get("narrative", "")).strip(),
279
+ "child_narrative": str(child.get("narrative", "")).strip(),
280
+ "image_prompt": str(child.get("image_prompt") or elderly.get("image_prompt") or "single panel cartoon pharmacist helping family, pastel colors").strip(),
281
+ }
282
+
283
+
284
@spaces.GPU(enable_queue=True)
def generate_cartoon_image(prompt: str) -> Image.Image:
    """Render a single-panel cartoon for *prompt* with the diffusion pipeline.

    Falls back to a generic wholesome-pharmacist prompt when *prompt* is
    empty, and always applies a fixed negative prompt to suppress text,
    watermarks and blur.
    """
    effective_prompt = prompt or (
        "single panel wholesome cartoon, pharmacist gently explaining "
        "medicine to family, warm pastel colors"
    )
    result = IMAGE_PIPELINE(
        prompt=effective_prompt,
        negative_prompt="text, watermark, logo, blurry",
        num_inference_steps=30,
        guidance_scale=7.5,
    )
    return result.images[0]
296
+
297
+
298
  def render_card(primary: Dict[str, Any]) -> Image.Image:
299
  width, height = 720, 400
300
  canvas = Image.new("RGB", (width, height), "white")
 
338
  return ",".join(row)
339
 
340
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
341
  def format_warnings(warnings: List[str]) -> str:
342
  if not warnings:
343
  return "โœ… ์ธ์‹๋œ ์ •๋ณด๊ฐ€ ์ถฉ๋ถ„ํ•ด์š”. ๋ณต์•ฝ ์‹œ๊ฐ„๋งŒ ์ž˜ ์ง€์ผœ ์ฃผ์„ธ์š”."
 
357
  "์ด๋ฏธ์ง€๋ฅผ ๋จผ์ € ์—…๋กœ๋“œํ•ด ์ฃผ์„ธ์š”.",
358
  "๐Ÿ“ท ์•ฝ ๋ด‰ํˆฌ ์‚ฌ์ง„์„ ์˜ฌ๋ฆฌ๋ฉด ์ธ์‹์ด ์‹œ์ž‘๋ผ์š”.",
359
  "",
360
+ None,
361
  )
362
 
363
  result = analyze_image_with_qwen(image)
 
370
  "time_slots": [],
371
  }
372
 
373
+ narratives = generate_explanations(result.get("raw_text", ""), medications)
374
+
375
  card_img = render_card(primary)
376
  csv_row = medications_to_csv(medications)
377
+ markdown = (
378
+ "## ์–ด๋ฅด์‹ ์„ ์œ„ํ•œ ์„ค๋ช…\n"
379
+ + (narratives.get("elderly_narrative") or "- ์„ค๋ช…์„ ์ค€๋น„ํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค.")
380
+ + "\n\n## ์–ด๋ฆฐ์ด๋ฅผ ์œ„ํ•œ ์„ค๋ช…\n"
381
+ + (narratives.get("child_narrative") or "- ์„ค๋ช…์„ ์ค€๋น„ํ•˜์ง€ ๋ชปํ–ˆ์Šต๋‹ˆ๋‹ค.")
382
+ + "\n\n> ํ•ญ์ƒ ์˜๋ฃŒ์ง„์˜ ์•ˆ๋‚ด๋ฅผ ์šฐ์„ ํ•˜์„ธ์š”."
383
+ )
384
  warnings_md = format_warnings(result.get("warnings", []))
385
  raw_text = result.get("raw_text", "")
386
  json_text = json.dumps(result, ensure_ascii=False, indent=2)
387
+ cartoon_image = generate_cartoon_image(narratives.get("image_prompt"))
388
 
389
+ return json_text, card_img, csv_row, markdown, warnings_md, raw_text, cartoon_image
390
 
391
 
392
  CUSTOM_CSS = """
 
414
  HERO_HTML = """
415
  <div class="hero">
416
  <h1>MedCard-KR ยท ์•ฝ๋ด‰ํˆฌ ํ•œ ์ปท์œผ๋กœ ์ดํ•ดํ•˜๋Š” ๋ณต์šฉ ์•ˆ๋‚ด</h1>
417
+ <p>Qwen2.5-VL์ด ์•ฝ ๋ด‰ํˆฌ๋ฅผ ์ง์ ‘ ์ฝ๊ณ , ์•ฝ์‚ฌ์ฒ˜๋Ÿผ ์‰ฝ๊ฒŒ ์„ค๋ช…๊ณผ ํ•œ ์ปท ๋งŒํ™”๋ฅผ ํ•จ๊ป˜ ์ œ๊ณตํ•ฉ๋‹ˆ๋‹ค.</p>
418
  </div>
419
  """
420
 
 
431
  gr.Markdown("### 2. ๊ฒฐ๊ณผ๋ฅผ ํ™•์ธํ•˜์„ธ์š”")
432
  explain_md = gr.Markdown("์—ฌ๊ธฐ์— ์•ฝ ์„ค๋ช…์ด ํ‘œ์‹œ๋ฉ๋‹ˆ๋‹ค.", elem_classes=["output-card"])
433
  raw_box = gr.Textbox(label="๋ชจ๋ธ์ด ์ฝ์€ ์›๋ฌธ ํ…์ŠคํŠธ", lines=5, interactive=False)
434
+ cartoon_img = gr.Image(type="pil", label="ํ•œ ์ปท ๋งŒํ™”")
435
  card_out = gr.Image(type="pil", label="์ผ์ • ์นด๋“œ(๋ฏธ๋ฆฌ๋ณด๊ธฐ)")
436
  csv_box = gr.Textbox(label="CSV(์•ฝ๋ช…,1ํšŒ์šฉ๋Ÿ‰,1์ผํšŸ์ˆ˜,์‹œ๊ฐ„๋Œ€)", lines=2, elem_classes=["csv-box"])
437
  with gr.Accordion("์„ธ๋ถ€ JSON ๊ฒฐ๊ณผ", open=False, elem_classes=["accordion"]):
 
440
  btn.click(
441
  run_pipeline,
442
  inputs=img_in,
443
+ outputs=[json_out, card_out, csv_box, explain_md, warn_md, raw_box, cartoon_img],
444
  )
445
 
446
  gr.Markdown(
requirements.txt CHANGED
@@ -2,7 +2,8 @@ transformers
2
  torch
3
  accelerate
4
  einops
 
 
5
  gradio
6
  Pillow
7
  sentencepiece
8
- torchvision
 
2
  torch
3
  accelerate
4
  einops
5
+ diffusers
6
+ safetensors
7
  gradio
8
  Pillow
9
  sentencepiece