LLDDWW committed on
Commit
149f48c
·
1 Parent(s): 19edf2f

feat: explain meds in plain language

Browse files
Files changed (1) hide show
  1. app.py +125 -26
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import json
2
  import re
3
- from typing import Any, Dict, List, Optional
4
 
5
  import gradio as gr
6
  from PIL import Image, ImageDraw
@@ -23,6 +23,45 @@ TIME_KEYWORDS = [
23
  "๊ธฐ์ƒ",
24
  ]
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
  def _extract_time_slots(text: str) -> List[str]:
28
  slots = []
@@ -39,40 +78,54 @@ def _extract_time_slots(text: str) -> List[str]:
39
  return slots
40
 
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  def parse_fields(raw: str) -> Dict[str, Any]:
43
  """Extract drug name and dosage information from OCR text."""
44
- text = raw.replace("\n", " ")
45
- text = re.sub(r"\s+", " ", text)
46
-
47
- # 1) ์•ฝ ์ด๋ฆ„: ๋‹จ์–ด + ์šฉ๋Ÿ‰ ํŒจํ„ด ์ฃผ๋ณ€์—์„œ ์ฐพ๊ธฐ
48
- drug_name: Optional[str] = None
49
- drug_match = re.search(r"([๊ฐ€-ํžฃA-Za-z]+)\s*(\d+)\s*(mg|mL|ML|์ •)", text)
50
- if drug_match:
51
- drug_name = drug_match.group(1)
52
- else:
53
- fallback = re.search(r"([๊ฐ€-ํžฃA-Za-z]{2,})", text)
54
- drug_name = fallback.group(1) if fallback else None
55
-
56
- # 2) 1ํšŒ ์šฉ๋Ÿ‰: "1ํšŒ 1์ •", "1์ •", "5 mL" ๋“ฑ
57
- dose_per_intake: Optional[str] = None
58
- dose_match = re.search(r"(1ํšŒ\s*)?(\d+[\./]?\d*)\s*([๊ฐ€-ํžฃA-Za-z]+|mL|ml|mg|์ •)", text)
59
- if dose_match:
60
- dose_per_intake = f"{dose_match.group(2)} {dose_match.group(3)}".strip()
61
-
62
- # 3) 1์ผ ๋ณต์šฉ ํšŸ์ˆ˜: "1์ผ 3ํšŒ", "ํ•˜๋ฃจ 2ํšŒ"
63
  times_per_day: Optional[int] = None
64
- times_match = re.search(r"(?:1์ผ|ํ•˜๋ฃจ)\s*(\d+)\s*ํšŒ", text)
65
  if times_match:
66
  times_per_day = int(times_match.group(1))
67
 
68
- # 4) ์‹œ๊ฐ„๋Œ€ ํ‚ค์›Œ๋“œ/์‹œ๊ฐ ์ถ”์ถœ
69
- time_slots = _extract_time_slots(text)
70
 
71
  return {
72
  "drug_name": drug_name,
73
  "dose_per_intake": dose_per_intake,
74
  "times_per_day": times_per_day,
75
  "time_slots": time_slots or None,
 
76
  }
77
 
78
 
@@ -130,15 +183,60 @@ def to_csv_row(output: Dict[str, Any]) -> str:
130
  return ",".join(row)
131
 
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  def run_pipeline(image: Optional[Image.Image]):
134
  if image is None:
135
- return "์ด๋ฏธ์ง€๋ฅผ ์—…๋กœ๋“œํ•˜์„ธ์š”.", None, None
136
 
137
  output = ocr_and_parse(image)
138
  card = render_card(output["fields"])
139
  csv_row = to_csv_row(output)
140
  json_text = json.dumps(output, ensure_ascii=False, indent=2)
141
- return json_text, card, csv_row
 
142
 
143
 
144
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
@@ -151,7 +249,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
151
  with gr.Column():
152
  json_out = gr.Code(label="์ธ์‹ ๊ฒฐ๊ณผ(JSON)")
153
  card_out = gr.Image(type="pil", label="์ผ์ • ์นด๋“œ(๋ฏธ๋ฆฌ๋ณด๊ธฐ)")
154
- btn.click(run_pipeline, inputs=img_in, outputs=[json_out, card_out, csv_box])
 
155
 
156
 
157
  if __name__ == "__main__":
 
1
  import json
2
  import re
3
+ from typing import Any, Dict, List, Optional, Sequence
4
 
5
  import gradio as gr
6
  from PIL import Image, ImageDraw
 
23
  "๊ธฐ์ƒ",
24
  ]
25
 
26
+ # Very small knowledge base for common Korean OTC medications.
27
+ MED_KNOWLEDGE: Sequence[Dict[str, Any]] = [
28
+ {
29
+ "keywords": ["ํƒ€์ด๋ ˆ๋†€", "์•„์„ธํŠธ์•„๋ฏธ๋…ธํŽœ", "acetaminophen"],
30
+ "category": "์ง„ํ†ตยทํ•ด์—ด์ œ",
31
+ "what_it_does": "๋ชธ์‚ด์ด๋‚˜ ๊ฐ๊ธฐ๋กœ ์—ด์ด ๋‚˜๊ฑฐ๋‚˜ ๋จธ๋ฆฌ๊ฐ€ ์•„ํ”Œ ๋•Œ ํ†ต์ฆ๊ณผ ์—ด์„ ๋‚ฎ์ถฐ ์ค๋‹ˆ๋‹ค.",
32
+ "example": "์˜ˆ: ์ˆ˜ํ•™์‹œํ—˜ ์ค€๋น„๋กœ ๊ธด์žฅํ–ˆ๋Š”๋ฐ ๋จธ๋ฆฌ๊ฐ€ ์ง€๋ˆ๊ฑฐ๋ฆด ๋•Œ, ํ•œ ์•Œ ๋ณต์šฉํ•˜๋ฉด ํ†ต์ฆ์ด ์ค„์–ด๋“ญ๋‹ˆ๋‹ค.",
33
+ "tip": "์œ„์— ๋ถ€๋‹ด์„ ์ค„์ด๊ธฐ ์œ„ํ•ด ๊ฐ„๋‹จํ•œ ๊ฐ„์‹๊ณผ ํ•จ๊ป˜ ๋ฌผ๊ณผ ๋ณต์šฉํ•˜๊ณ , ํ•˜๋ฃจ ์ด ๋ณต์šฉ ํšŸ์ˆ˜(์ผ๋ฐ˜์ ์œผ๋กœ 4ํšŒ ์ดํ•˜)๋ฅผ ๋„˜๊ธฐ์ง€ ๋งˆ์„ธ์š”.",
34
+ },
35
+ {
36
+ "keywords": ["์ด๋ถ€ํ”„๋กœํŽœ", "๋ถ€๋ฃจํŽœ", "ibuprofen"],
37
+ "category": "์ง„ํ†ตยท์†Œ์—ผ์ œ",
38
+ "what_it_does": "๋ชธ์† ์—ผ์ฆ์„ ๊ฐ€๋ผ์•‰ํžˆ๊ณ  ํ†ต์ฆ์„ ์™„ํ™”ํ•ด์„œ ๊ทผ์œกํ†ต์ด๋‚˜ ์น˜ํ†ต์— ์ž์ฃผ ์‚ฌ์šฉ๋ฉ๋‹ˆ๋‹ค.",
39
+ "example": "์˜ˆ: ์ฒด์œก ์‹œ๊ฐ„์— ๋ฌด๋ฆŽ์„ ์‚ด์ง ์‚์—ˆ์„ ๋•Œ ๋ถ“๊ธฐ์™€ ์•„ํ””์„ ์ค„์—ฌ ์ค๋‹ˆ๋‹ค.",
40
+ "tip": "์‹ํ›„์— ๋ณต์šฉํ•˜๋ฉด ์† ์“ฐ๋ฆผ์„ ์ค„์ผ ์ˆ˜ ์žˆ๊ณ , ๋‹ค๋ฅธ ์†Œ์—ผ์ง„ํ†ต์ œ์™€๋Š” ์‹œ๊ฐ„ ๊ฐ„๊ฒฉ์„ ๋‘์„ธ์š”.",
41
+ },
42
+ {
43
+ "keywords": ["์‹œ์ž˜", "์„ธํ‹ฐ๋ฆฌ์ง„", "cetirizine", "์ง€๋ฅดํ…"],
44
+ "category": "์•Œ๋ ˆ๋ฅด๊ธฐ ์™„ํ™”์ œ",
45
+ "what_it_does": "์ฝ”๊ฐ€ ๊ฐ„์งˆ๊ฑฐ๋ฆฌ๊ฑฐ๋‚˜ ํ”ผ๋ถ€๊ฐ€ ๊ฐ€๋ ค์šธ ๋•Œ ์•Œ๋ ˆ๋ฅด๊ธฐ ๋ฐ˜์‘์„ ๊ฐ€๋ผ์•‰ํ˜€ ์ค๋‹ˆ๋‹ค.",
46
+ "example": "์˜ˆ: ๋ด„์ฒ  ๊ฝƒ๊ฐ€๋ฃจ ๋•Œ๋ฌธ์— ๊ธฐ์นจ๊ณผ ์ฝง๋ฌผ์ด ๋‚˜์˜ฌ ๋•Œ ์ฆ์ƒ์„ ์ค„์—ฌ ์ค๋‹ˆ๋‹ค.",
47
+ "tip": "์กธ๋ฆด ์ˆ˜ ์žˆ์œผ๋‹ˆ ์ฒซ ๋ณต์šฉ ํ›„์—๋Š” ์šด์ „์ด๋‚˜ ์ง‘์ค‘์ด ํ•„์š”ํ•œ ํ™œ๋™์€ ํ”ผํ•˜์„ธ์š”.",
48
+ },
49
+ {
50
+ "keywords": ["ํ›ผ์Šคํƒˆ", "pancreatin", "์œ„์žฅ", "์†Œํ™”์ œ"],
51
+ "category": "์†Œํ™”์ œ",
52
+ "what_it_does": "๊ธฐ๋ฆ„์ง„ ์Œ์‹์„ ๋จน๊ณ  ๋ฐฐ๊ฐ€ ๋”๋ถ€๋ฃฉํ•  ๋•Œ ์†Œํ™”๋ฅผ ๋„์™€ ์†์„ ํŽธํ•˜๊ฒŒ ํ•ด ์ค๋‹ˆ๋‹ค.",
53
+ "example": "์˜ˆ: ์น˜ํ‚จ์„ ๋งŽ์ด ๋จน์–ด ์†์ด ๋”๋ถ€๋ฃฉํ•  ๋•Œ ์†์„ ๊ฐ€๋ณ๊ฒŒ ํ•ด ์ค๋‹ˆ๋‹ค.",
54
+ "tip": "์‹ํ›„์— ๋ณต์šฉํ•˜๋ฉด ํšจ๊ณผ๊ฐ€ ์ข‹์œผ๋ฉฐ, ๋ณตํ†ต์ด ๊ณ„์†๋˜๋ฉด ๋ณ‘์›์„ ๋ฐฉ๋ฌธํ•˜์„ธ์š”.",
55
+ },
56
+ {
57
+ "keywords": ["๋น„ํƒ€๋ฏผ", "multivitamin", "vitamin"],
58
+ "category": "์˜์–‘์ œ",
59
+ "what_it_does": "๋ชธ์— ํ•„์š”ํ•œ ๋น„ํƒ€๋ฏผ์„ ์ฑ„์›Œ ํ”ผ๊ณคํ•จ์„ ์ค„์ด๊ณ  ๋ฉด์—ญ๋ ฅ์„ ๋•์Šต๋‹ˆ๋‹ค.",
60
+ "example": "์˜ˆ: ์‹œํ—˜ ์ค€๋น„๋กœ ์ž ์„ ์ค„์˜€์„ ๋•Œ ๋ชธ์ด ์ง€์น˜์ง€ ์•Š๋„๋ก ๋„์™€์ค๋‹ˆ๋‹ค.",
61
+ "tip": "ํ•˜๋ฃจ ๊ถŒ์žฅ๋Ÿ‰์„ ์ง€์ผœ ๊พธ์ค€ํžˆ ๋ณต์šฉํ•˜๋ฉด ๋” ํšจ๊ณผ์ ์ด๋ฉฐ, ๋ฌผ๊ณผ ํ•จ๊ป˜ ์‚ผํ‚ค์„ธ์š”.",
62
+ },
63
+ ]
64
+
65
 
66
  def _extract_time_slots(text: str) -> List[str]:
67
  slots = []
 
78
  return slots
79
 
80
 
81
+ STOPWORDS = {"์šฉ๋ฒ•", "์šฉ๋Ÿ‰", "๋ณต์šฉ", "๋ฐฉ๋ฒ•", "์•ฝ", "์ •"}
82
+
83
+
84
+ def _extract_medications(text: str) -> List[Dict[str, Optional[str]]]:
85
+ meds: List[Dict[str, Optional[str]]] = []
86
+ pattern = re.compile(
87
+ r"([๊ฐ€-ํžฃA-Za-z]{2,})[\sยท]*(\d+[\./]?\d*\s*(?:mg|mL|ML|ml|์ •|์บก์А))?"
88
+ )
89
+ seen: set[str] = set()
90
+ for match in pattern.finditer(text):
91
+ name = match.group(1)
92
+ if name in STOPWORDS or len(name) <= 1:
93
+ continue
94
+ if any(sw in name for sw in STOPWORDS):
95
+ continue
96
+ name_norm = name.strip()
97
+ if name_norm in seen:
98
+ continue
99
+ seen.add(name_norm)
100
+ dose = match.group(2).strip() if match.group(2) else None
101
+ meds.append({"name": name_norm, "dose": dose})
102
+ return meds
103
+
104
+
105
def parse_fields(raw: str) -> Dict[str, Any]:
    """Extract drug name and dosage information from OCR text.

    Args:
        raw: OCR output, possibly spanning several lines.

    Returns:
        A dict with the keys ``drug_name`` and ``dose_per_intake`` (taken
        from the first detected medication, ``None`` when there is none),
        ``times_per_day`` (``int`` or ``None``), ``time_slots`` (``None``
        when no slot was found) and ``medications`` (every detected
        medication).
    """
    # ``\s`` already covers newlines, so a single substitution flattens the
    # text onto one line and squeezes repeated whitespace.
    collapsed = re.sub(r"\s+", " ", raw)

    medications = _extract_medications(collapsed)
    first = medications[0] if medications else {"name": None, "dose": None}

    # Daily frequency, written as "1일 N회" or "하루 N회" (N times a day).
    times_match = re.search(r"(?:1일|하루)\s*(\d+)\s*회", collapsed)
    times_per_day: Optional[int] = (
        int(times_match.group(1)) if times_match else None
    )

    time_slots = _extract_time_slots(collapsed)

    return {
        "drug_name": first.get("name"),
        "dose_per_intake": first.get("dose"),
        "times_per_day": times_per_day,
        "time_slots": time_slots or None,
        "medications": medications,
    }
130
 
131
 
 
183
  return ",".join(row)
184
 
185
 
186
+ def _match_knowledge(name: str) -> Optional[Dict[str, Any]]:
187
+ lowered = name.lower()
188
+ for info in MED_KNOWLEDGE:
189
+ for kw in info["keywords"]:
190
+ if kw.lower() in lowered or lowered in kw.lower():
191
+ return info
192
+ return None
193
+
194
+
195
+ def build_explanations(output: Dict[str, Any]) -> str:
196
+ meds = output["fields"].get("medications") or []
197
+ if not meds:
198
+ return (
199
+ "### ์•ฝ ์„ค๋ช…\n"
200
+ "- ์•ฝ ์ด๋ฆ„์„ ์ •ํ™•ํžˆ ์ธ์‹ํ•˜์ง€ ๋ชปํ–ˆ์–ด์š”. ์‚ฌ์ง„์„ ๋‹ค์‹œ ์ฐ๊ฑฐ๋‚˜ ์•ฝ์‚ฌ์—๊ฒŒ ์ง์ ‘ ํ™•์ธํ•ด ์ฃผ์„ธ์š”.\n"
201
+ "\n> โš ๏ธ ์˜๋ฃŒ์ง„ ์ฒ˜๋ฐฉ๊ณผ ๋ณต์•ฝ ์ง€์‹œ๊ฐ€ ๊ฐ€์žฅ ์šฐ์„ ์ž…๋‹ˆ๋‹ค."
202
+ )
203
+
204
+ lines = ["### ์‰ฝ๊ฒŒ ์•Œ์•„๋ณด๋Š” ์•ฝ ์„ค๋ช…"]
205
+ for med in meds:
206
+ name = med.get("name") or "์ด๋ฆ„ ๋ฏธํ™•์ธ"
207
+ info = _match_knowledge(name) if name else None
208
+ dose = med.get("dose")
209
+ if info:
210
+ lines.append(
211
+ f"- **{name}** ({info['category']})"
212
+ )
213
+ if dose:
214
+ lines.append(f" - ์•ฝ ๋ด‰ํˆฌ์— ์ ํžŒ ์šฉ๋Ÿ‰: `{dose}`")
215
+ lines.append(f" - ํ•˜๋Š” ์ผ: {info['what_it_does']}")
216
+ lines.append(f" - ์ค‘ํ•™์ƒ ์˜ˆ์‹œ: {info['example']}")
217
+ lines.append(f" - ๋ณต์šฉ ํŒ: {info['tip']}")
218
+ else:
219
+ lines.append(f"- **{name}**")
220
+ if dose:
221
+ lines.append(f" - ์•ฝ ๋ด‰ํˆฌ ์šฉ๋Ÿ‰: `{dose}`")
222
+ lines.append(
223
+ " - ์•„์ง ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์–ด์š”. ์•ฝ ์ด๋ฆ„์„ ๋‹ค์‹œ ํ™•์ธํ•˜๊ฑฐ๋‚˜ ์•ฝ์‚ฌ์—๊ฒŒ ๋ฌผ์–ด๋ณด์„ธ์š”."
224
+ )
225
+
226
+ lines.append("\n> โš ๏ธ ์‹ค์ œ ๋ณต์•ฝ์€ ์˜์‚ฌยท์•ฝ์‚ฌ์˜ ์ง€์‹œ์— ๋ฐ˜๋“œ์‹œ ๋”ฐ๋ฅด์„ธ์š”.")
227
+ return "\n".join(lines)
228
+
229
+
230
  def run_pipeline(image: Optional[Image.Image]):
231
  if image is None:
232
+ return "์ด๋ฏธ์ง€๋ฅผ ์—…๋กœ๋“œํ•˜์„ธ์š”.", None, None, "์ด๋ฏธ์ง€๋ฅผ ๋จผ์ € ์—…๋กœ๋“œํ•ด ์ฃผ์„ธ์š”."
233
 
234
  output = ocr_and_parse(image)
235
  card = render_card(output["fields"])
236
  csv_row = to_csv_row(output)
237
  json_text = json.dumps(output, ensure_ascii=False, indent=2)
238
+ explanations = build_explanations(output)
239
+ return json_text, card, csv_row, explanations
240
 
241
 
242
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
 
249
  with gr.Column():
250
  json_out = gr.Code(label="์ธ์‹ ๊ฒฐ๊ณผ(JSON)")
251
  card_out = gr.Image(type="pil", label="์ผ์ • ์นด๋“œ(๋ฏธ๋ฆฌ๋ณด๊ธฐ)")
252
+ explain_md = gr.Markdown(label="์‰ฝ๊ฒŒ ์•Œ์•„๋ณด๋Š” ์•ฝ ์„ค๋ช…")
253
+ btn.click(run_pipeline, inputs=img_in, outputs=[json_out, card_out, csv_box, explain_md])
254
 
255
 
256
  if __name__ == "__main__":