feat: explain meds in plain language
Browse files
app.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import json
|
| 2 |
import re
|
| 3 |
-
from typing import Any, Dict, List, Optional
|
| 4 |
|
| 5 |
import gradio as gr
|
| 6 |
from PIL import Image, ImageDraw
|
|
@@ -23,6 +23,45 @@ TIME_KEYWORDS = [
|
|
| 23 |
"๊ธฐ์",
|
| 24 |
]
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
def _extract_time_slots(text: str) -> List[str]:
|
| 28 |
slots = []
|
|
@@ -39,40 +78,54 @@ def _extract_time_slots(text: str) -> List[str]:
|
|
| 39 |
return slots
|
| 40 |
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
def parse_fields(raw: str) -> Dict[str, Any]:
|
| 43 |
"""Extract drug name and dosage information from OCR text."""
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
fallback = re.search(r"([๊ฐ-ํฃA-Za-z]{2,})", text)
|
| 54 |
-
drug_name = fallback.group(1) if fallback else None
|
| 55 |
-
|
| 56 |
-
# 2) 1ํ ์ฉ๋: "1ํ 1์ ", "1์ ", "5 mL" ๋ฑ
|
| 57 |
-
dose_per_intake: Optional[str] = None
|
| 58 |
-
dose_match = re.search(r"(1ํ\s*)?(\d+[\./]?\d*)\s*([๊ฐ-ํฃA-Za-z]+|mL|ml|mg|์ )", text)
|
| 59 |
-
if dose_match:
|
| 60 |
-
dose_per_intake = f"{dose_match.group(2)} {dose_match.group(3)}".strip()
|
| 61 |
-
|
| 62 |
-
# 3) 1์ผ ๋ณต์ฉ ํ์: "1์ผ 3ํ", "ํ๋ฃจ 2ํ"
|
| 63 |
times_per_day: Optional[int] = None
|
| 64 |
-
times_match = re.search(r"(?:1์ผ|ํ๋ฃจ)\s*(\d+)\s*ํ",
|
| 65 |
if times_match:
|
| 66 |
times_per_day = int(times_match.group(1))
|
| 67 |
|
| 68 |
-
|
| 69 |
-
time_slots = _extract_time_slots(text)
|
| 70 |
|
| 71 |
return {
|
| 72 |
"drug_name": drug_name,
|
| 73 |
"dose_per_intake": dose_per_intake,
|
| 74 |
"times_per_day": times_per_day,
|
| 75 |
"time_slots": time_slots or None,
|
|
|
|
| 76 |
}
|
| 77 |
|
| 78 |
|
|
@@ -130,15 +183,60 @@ def to_csv_row(output: Dict[str, Any]) -> str:
|
|
| 130 |
return ",".join(row)
|
| 131 |
|
| 132 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
def run_pipeline(image: Optional[Image.Image]):
|
| 134 |
if image is None:
|
| 135 |
-
return "์ด๋ฏธ์ง๋ฅผ ์
๋ก๋ํ์ธ์.", None, None
|
| 136 |
|
| 137 |
output = ocr_and_parse(image)
|
| 138 |
card = render_card(output["fields"])
|
| 139 |
csv_row = to_csv_row(output)
|
| 140 |
json_text = json.dumps(output, ensure_ascii=False, indent=2)
|
| 141 |
-
|
|
|
|
| 142 |
|
| 143 |
|
| 144 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
@@ -151,7 +249,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 151 |
with gr.Column():
|
| 152 |
json_out = gr.Code(label="์ธ์ ๊ฒฐ๊ณผ(JSON)")
|
| 153 |
card_out = gr.Image(type="pil", label="์ผ์ ์นด๋(๋ฏธ๋ฆฌ๋ณด๊ธฐ)")
|
| 154 |
-
|
|
|
|
| 155 |
|
| 156 |
|
| 157 |
if __name__ == "__main__":
|
|
|
|
| 1 |
import json
|
| 2 |
import re
|
| 3 |
+
from typing import Any, Dict, List, Optional, Sequence
|
| 4 |
|
| 5 |
import gradio as gr
|
| 6 |
from PIL import Image, ImageDraw
|
|
|
|
| 23 |
"๊ธฐ์",
|
| 24 |
]
|
| 25 |
|
| 26 |
+
# Very small knowledge base for common Korean OTC medications.
|
| 27 |
+
MED_KNOWLEDGE: Sequence[Dict[str, Any]] = [
|
| 28 |
+
{
|
| 29 |
+
"keywords": ["ํ์ด๋ ๋", "์์ธํธ์๋ฏธ๋
ธํ", "acetaminophen"],
|
| 30 |
+
"category": "์งํตยทํด์ด์ ",
|
| 31 |
+
"what_it_does": "๋ชธ์ด์ด๋ ๊ฐ๊ธฐ๋ก ์ด์ด ๋๊ฑฐ๋ ๋จธ๋ฆฌ๊ฐ ์ํ ๋ ํต์ฆ๊ณผ ์ด์ ๋ฎ์ถฐ ์ค๋๋ค.",
|
| 32 |
+
"example": "์: ์ํ์ํ ์ค๋น๋ก ๊ธด์ฅํ๋๋ฐ ๋จธ๋ฆฌ๊ฐ ์ง๋๊ฑฐ๋ฆด ๋, ํ ์ ๋ณต์ฉํ๋ฉด ํต์ฆ์ด ์ค์ด๋ญ๋๋ค.",
|
| 33 |
+
"tip": "์์ ๋ถ๋ด์ ์ค์ด๊ธฐ ์ํด ๊ฐ๋จํ ๊ฐ์๊ณผ ํจ๊ป ๋ฌผ๊ณผ ๋ณต์ฉํ๊ณ , ํ๋ฃจ ์ด ๋ณต์ฉ ํ์(์ผ๋ฐ์ ์ผ๋ก 4ํ ์ดํ)๋ฅผ ๋๊ธฐ์ง ๋ง์ธ์.",
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"keywords": ["์ด๋ถํ๋กํ", "๋ถ๋ฃจํ", "ibuprofen"],
|
| 37 |
+
"category": "์งํตยท์์ผ์ ",
|
| 38 |
+
"what_it_does": "๋ชธ์ ์ผ์ฆ์ ๊ฐ๋ผ์ํ๊ณ ํต์ฆ์ ์ํํด์ ๊ทผ์กํต์ด๋ ์นํต์ ์์ฃผ ์ฌ์ฉ๋ฉ๋๋ค.",
|
| 39 |
+
"example": "์: ์ฒด์ก ์๊ฐ์ ๋ฌด๋ฆ์ ์ด์ง ์์์ ๋ ๋ถ๊ธฐ์ ์ํ์ ์ค์ฌ ์ค๋๋ค.",
|
| 40 |
+
"tip": "์ํ์ ๋ณต์ฉํ๋ฉด ์ ์ฐ๋ฆผ์ ์ค์ผ ์ ์๊ณ , ๋ค๋ฅธ ์์ผ์งํต์ ์๋ ์๊ฐ ๊ฐ๊ฒฉ์ ๋์ธ์.",
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"keywords": ["์์", "์ธํฐ๋ฆฌ์ง", "cetirizine", "์ง๋ฅดํ
"],
|
| 44 |
+
"category": "์๋ ๋ฅด๊ธฐ ์ํ์ ",
|
| 45 |
+
"what_it_does": "์ฝ๊ฐ ๊ฐ์ง๊ฑฐ๋ฆฌ๊ฑฐ๋ ํผ๋ถ๊ฐ ๊ฐ๋ ค์ธ ๋ ์๋ ๋ฅด๊ธฐ ๋ฐ์์ ๊ฐ๋ผ์ํ ์ค๋๋ค.",
|
| 46 |
+
"example": "์: ๋ด์ฒ ๊ฝ๊ฐ๋ฃจ ๋๋ฌธ์ ๊ธฐ์นจ๊ณผ ์ฝง๋ฌผ์ด ๋์ฌ ๋ ์ฆ์์ ์ค์ฌ ์ค๋๋ค.",
|
| 47 |
+
"tip": "์กธ๋ฆด ์ ์์ผ๋ ์ฒซ ๋ณต์ฉ ํ์๋ ์ด์ ์ด๋ ์ง์ค์ด ํ์ํ ํ๋์ ํผํ์ธ์.",
|
| 48 |
+
},
|
| 49 |
+
{
|
| 50 |
+
"keywords": ["ํผ์คํ", "pancreatin", "์์ฅ", "์ํ์ "],
|
| 51 |
+
"category": "์ํ์ ",
|
| 52 |
+
"what_it_does": "๊ธฐ๋ฆ์ง ์์์ ๋จน๊ณ ๋ฐฐ๊ฐ ๋๋ถ๋ฃฉํ ๋ ์ํ๋ฅผ ๋์ ์์ ํธํ๊ฒ ํด ์ค๋๋ค.",
|
| 53 |
+
"example": "์: ์นํจ์ ๋ง์ด ๋จน์ด ์์ด ๋๋ถ๋ฃฉํ ๋ ์์ ๊ฐ๋ณ๊ฒ ํด ์ค๋๋ค.",
|
| 54 |
+
"tip": "์ํ์ ๋ณต์ฉํ๋ฉด ํจ๊ณผ๊ฐ ์ข์ผ๋ฉฐ, ๋ณตํต์ด ๊ณ์๋๋ฉด ๋ณ์์ ๋ฐฉ๋ฌธํ์ธ์.",
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"keywords": ["๋นํ๋ฏผ", "multivitamin", "vitamin"],
|
| 58 |
+
"category": "์์์ ",
|
| 59 |
+
"what_it_does": "๋ชธ์ ํ์ํ ๋นํ๋ฏผ์ ์ฑ์ ํผ๊ณคํจ์ ์ค์ด๊ณ ๋ฉด์ญ๋ ฅ์ ๋์ต๋๋ค.",
|
| 60 |
+
"example": "์: ์ํ ์ค๋น๋ก ์ ์ ์ค์์ ๋ ๋ชธ์ด ์ง์น์ง ์๋๋ก ๋์์ค๋๋ค.",
|
| 61 |
+
"tip": "ํ๋ฃจ ๊ถ์ฅ๋์ ์ง์ผ ๊พธ์คํ ๋ณต์ฉํ๋ฉด ๋ ํจ๊ณผ์ ์ด๋ฉฐ, ๋ฌผ๊ณผ ํจ๊ป ์ผํค์ธ์.",
|
| 62 |
+
},
|
| 63 |
+
]
|
| 64 |
+
|
| 65 |
|
| 66 |
def _extract_time_slots(text: str) -> List[str]:
|
| 67 |
slots = []
|
|
|
|
| 78 |
return slots
|
| 79 |
|
| 80 |
|
| 81 |
+
# Generic label words that the name pattern also picks up but that are not
# drug names.  Written as \u escapes so the Hangul literals survive any
# encoding round-trip of this file.
STOPWORDS = {
    "\uC6A9\uBC95",  # "yongbeop": directions for use
    "\uC6A9\uB7C9",  # "yongnyang": dosage
    "\uBCF5\uC6A9",  # "bogyong": taking (a medicine)
    "\uBC29\uBC95",  # "bangbeop": method
    "\uC57D",  # "yak": medicine
    "\uC815",  # "jeong": tablet
}

# Unit words that can follow a number; on their own they are never a drug name.
_DOSE_UNIT_NAMES = {"mg", "ml", "\uCEA1\uC290"}  # last entry: "kaepsyul" (capsule)

# Group 1: a run of 2+ Hangul syllables (U+AC00-U+D7A3) or Latin letters.
# Then optional whitespace / middle dots (U+00B7).
# Group 2 (optional): "<number><unit>", unit being mg, mL/ML/ml, tablet or capsule.
_MEDICATION_PATTERN = re.compile(
    r"([\uAC00-\uD7A3A-Za-z]{2,})[\s\u00B7]*"
    r"(\d+[\./]?\d*\s*(?:mg|mL|ML|ml|\uC815|\uCEA1\uC290))?"
)


def _extract_medications(text: str) -> List[Dict[str, Optional[str]]]:
    """Collect candidate medication names (with optional dose) from OCR text.

    Args:
        text: OCR text, ideally already collapsed onto one line.  ``None`` or
            an empty string yields an empty list.

    Returns:
        One ``{"name": ..., "dose": ...}`` dict per distinct name, in order of
        first appearance.  ``dose`` is the "<number><unit>" string that directly
        follows the name, or ``None`` when there is none.
    """
    meds: List[Dict[str, Optional[str]]] = []
    seen = set()  # names already emitted; only the first occurrence is kept
    for match in _MEDICATION_PATTERN.finditer(text or ""):
        name = match.group(1)
        # Exact stop words and bare unit tokens ("mg" after a number that did
        # not attach to a name) are label noise, not medications.
        if name in STOPWORDS or name.lower() in _DOSE_UNIT_NAMES:
            continue
        # Compound label words such as "<taking><method>" are dropped too, but
        # only multi-syllable stop words are checked as substrings: the
        # single-syllable ones ("medicine", "tablet") are common suffixes of
        # real product names and would otherwise discard them.
        if any(len(sw) > 1 and sw in name for sw in STOPWORDS):
            continue
        if name in seen:
            continue
        seen.add(name)
        dose = match.group(2)
        meds.append({"name": name, "dose": dose.strip() if dose else None})
    return meds
|
| 103 |
+
|
| 104 |
+
|
| 105 |
def parse_fields(raw: str) -> Dict[str, Any]:
    """Extract drug name and dosage information from OCR text.

    Args:
        raw: Raw OCR text, possibly spanning several lines.  ``None`` is
            treated like an empty string.

    Returns:
        A dict with the keys:
            ``drug_name`` / ``dose_per_intake``: name and dose of the first
                entry found by ``_extract_medications`` (``None`` if none).
            ``times_per_day``: integer from a "<per day> N <times>" phrase,
                or ``None`` when no such phrase is present.
            ``time_slots``: result of ``_extract_time_slots``, or ``None``
                when it is empty.
            ``medications``: the full list from ``_extract_medications``.
    """
    # Collapse every whitespace run (newlines included) so that phrases split
    # across OCR lines can still be matched by the patterns below.
    collapsed = re.sub(r"\s+", " ", raw or "")

    medications = _extract_medications(collapsed)

    # The first detected medication doubles as the single-drug summary fields.
    first = medications[0] if medications else {}
    drug_name = first.get("name")
    dose_per_intake = first.get("dose")

    times_per_day: Optional[int] = None
    # "1<il>" (1 day) or "<haru>" (a day), then a number, then "<hoe>" (times).
    # Hangul is written as \u escapes so the pattern survives re-encoding.
    times_match = re.search(
        r"(?:1\uC77C|\uD558\uB8E8)\s*(\d+)\s*\uD68C", collapsed
    )
    if times_match:
        times_per_day = int(times_match.group(1))

    time_slots = _extract_time_slots(collapsed)

    return {
        "drug_name": drug_name,
        "dose_per_intake": dose_per_intake,
        "times_per_day": times_per_day,
        "time_slots": time_slots or None,
        "medications": medications,
    }
|
| 130 |
|
| 131 |
|
|
|
|
| 183 |
return ",".join(row)
|
| 184 |
|
| 185 |
|
| 186 |
+
def _match_knowledge(
    name: str,
    knowledge: Optional[Sequence[Dict[str, Any]]] = None,
    min_partial_len: int = 3,
) -> Optional[Dict[str, Any]]:
    """Find the knowledge-base entry that describes the medication *name*.

    Matching is case-insensitive.  An entry matches when one of its keywords
    occurs inside *name*, or when *name* is a fragment of a keyword (e.g. a
    truncated OCR read) that is at least *min_partial_len* characters long.

    Args:
        name: Medication name as extracted from the OCR text.
        knowledge: Entries to search, each with a ``"keywords"`` list.
            Defaults to the module-level ``MED_KNOWLEDGE``.
        min_partial_len: Minimum length of *name* before it may match as a
            fragment of a keyword.

    Returns:
        The first matching entry, or ``None`` when nothing matches or *name*
        is empty/blank.
    """
    needle = (name or "").strip().lower()
    # An empty string is a substring of every keyword; without this guard a
    # blank name would be "explained" as the first medication in the table.
    if not needle:
        return None

    entries = MED_KNOWLEDGE if knowledge is None else knowledge
    # Very short fragments ("in", "en") occur inside unrelated keywords, so
    # the fragment direction is only trusted for sufficiently long names.
    allow_partial = len(needle) >= min_partial_len

    for info in entries:
        for kw in info["keywords"]:
            keyword = kw.lower()
            if not keyword:
                continue
            if keyword in needle or (allow_partial and needle in keyword):
                return info
    return None
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
def build_explanations(output: Dict[str, Any]) -> str:
|
| 196 |
+
meds = output["fields"].get("medications") or []
|
| 197 |
+
if not meds:
|
| 198 |
+
return (
|
| 199 |
+
"### ์ฝ ์ค๋ช
\n"
|
| 200 |
+
"- ์ฝ ์ด๋ฆ์ ์ ํํ ์ธ์ํ์ง ๋ชปํ์ด์. ์ฌ์ง์ ๋ค์ ์ฐ๊ฑฐ๋ ์ฝ์ฌ์๊ฒ ์ง์ ํ์ธํด ์ฃผ์ธ์.\n"
|
| 201 |
+
"\n> โ ๏ธ ์๋ฃ์ง ์ฒ๋ฐฉ๊ณผ ๋ณต์ฝ ์ง์๊ฐ ๊ฐ์ฅ ์ฐ์ ์
๋๋ค."
|
| 202 |
+
)
|
| 203 |
+
|
| 204 |
+
lines = ["### ์ฝ๊ฒ ์์๋ณด๋ ์ฝ ์ค๋ช
"]
|
| 205 |
+
for med in meds:
|
| 206 |
+
name = med.get("name") or "์ด๋ฆ ๋ฏธํ์ธ"
|
| 207 |
+
info = _match_knowledge(name) if name else None
|
| 208 |
+
dose = med.get("dose")
|
| 209 |
+
if info:
|
| 210 |
+
lines.append(
|
| 211 |
+
f"- **{name}** ({info['category']})"
|
| 212 |
+
)
|
| 213 |
+
if dose:
|
| 214 |
+
lines.append(f" - ์ฝ ๋ดํฌ์ ์ ํ ์ฉ๋: `{dose}`")
|
| 215 |
+
lines.append(f" - ํ๋ ์ผ: {info['what_it_does']}")
|
| 216 |
+
lines.append(f" - ์คํ์ ์์: {info['example']}")
|
| 217 |
+
lines.append(f" - ๋ณต์ฉ ํ: {info['tip']}")
|
| 218 |
+
else:
|
| 219 |
+
lines.append(f"- **{name}**")
|
| 220 |
+
if dose:
|
| 221 |
+
lines.append(f" - ์ฝ ๋ดํฌ ์ฉ๋: `{dose}`")
|
| 222 |
+
lines.append(
|
| 223 |
+
" - ์์ง ๋ฐ์ดํฐ๊ฐ ์์ด์. ์ฝ ์ด๋ฆ์ ๋ค์ ํ์ธํ๊ฑฐ๋ ์ฝ์ฌ์๊ฒ ๋ฌผ์ด๋ณด์ธ์."
|
| 224 |
+
)
|
| 225 |
+
|
| 226 |
+
lines.append("\n> โ ๏ธ ์ค์ ๋ณต์ฝ์ ์์ฌยท์ฝ์ฌ์ ์ง์์ ๋ฐ๋์ ๋ฐ๋ฅด์ธ์.")
|
| 227 |
+
return "\n".join(lines)
|
| 228 |
+
|
| 229 |
+
|
| 230 |
def run_pipeline(image: Optional[Image.Image]):
|
| 231 |
if image is None:
|
| 232 |
+
return "์ด๋ฏธ์ง๋ฅผ ์
๋ก๋ํ์ธ์.", None, None, "์ด๋ฏธ์ง๋ฅผ ๋จผ์ ์
๋ก๋ํด ์ฃผ์ธ์."
|
| 233 |
|
| 234 |
output = ocr_and_parse(image)
|
| 235 |
card = render_card(output["fields"])
|
| 236 |
csv_row = to_csv_row(output)
|
| 237 |
json_text = json.dumps(output, ensure_ascii=False, indent=2)
|
| 238 |
+
explanations = build_explanations(output)
|
| 239 |
+
return json_text, card, csv_row, explanations
|
| 240 |
|
| 241 |
|
| 242 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
|
|
| 249 |
with gr.Column():
|
| 250 |
json_out = gr.Code(label="์ธ์ ๊ฒฐ๊ณผ(JSON)")
|
| 251 |
card_out = gr.Image(type="pil", label="์ผ์ ์นด๋(๋ฏธ๋ฆฌ๋ณด๊ธฐ)")
|
| 252 |
+
explain_md = gr.Markdown(label="์ฝ๊ฒ ์์๋ณด๋ ์ฝ ์ค๋ช
")
|
| 253 |
+
btn.click(run_pipeline, inputs=img_in, outputs=[json_out, card_out, csv_box, explain_md])
|
| 254 |
|
| 255 |
|
| 256 |
if __name__ == "__main__":
|