Avinashnalla7 commited on
Commit
7fd3f6f
·
1 Parent(s): 4478465

fix: restore worker code + entrypoint

Browse files
Dockerfile CHANGED
@@ -1,25 +1,8 @@
1
- # ---- Build stage ----
2
- FROM node:20-alpine AS build
3
 
4
  WORKDIR /app
 
 
5
 
6
- COPY package.json package-lock.json ./
7
- RUN npm ci
8
-
9
- COPY . .
10
- RUN npm run build
11
-
12
- # ---- Runtime stage ----
13
- FROM nginx:alpine
14
-
15
- # Remove default nginx config
16
- RUN rm /etc/nginx/conf.d/default.conf
17
-
18
- # Custom nginx config
19
- COPY nginx.conf /etc/nginx/conf.d/default.conf
20
-
21
- # Copy built assets
22
- COPY --from=build /app/dist /usr/share/nginx/html
23
-
24
- EXPOSE 7860
25
- CMD ["nginx", "-g", "daemon off;"]
 
1
+ FROM python:3.11-slim
 
2
 
3
  WORKDIR /app
4
+ COPY requirements.txt /app/requirements.txt
5
+ RUN pip install --no-cache-dir -r /app/requirements.txt
6
 
7
+ COPY . /app
8
+ CMD ["python", "-u", "app.py"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from backend.worker.worker import main
2
+
3
+ if __name__ == "__main__":
4
+ main()
backend/__init__.py ADDED
File without changes
backend/worker/__init__.py ADDED
File without changes
backend/worker/config.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+
7
+
8
+ @dataclass(frozen=True)
9
+ class Settings:
10
+ # Repo paths
11
+ repo_root: Path
12
+ backend_dir: Path
13
+ worker_dir: Path
14
+
15
+ # Gmail
16
+ credentials_path: Path
17
+ token_path: Path
18
+
19
+ label_incoming: str
20
+ label_known: str
21
+ label_unknown: str
22
+ label_train: str
23
+
24
+ # Notification
25
+ notify_to_email: str
26
+ notify_from_email: str
27
+
28
+ # Trainer
29
+ trainer_base_url: str
30
+
31
+ # OpenAI
32
+ openai_api_key: str
33
+ openai_model: str
34
+
35
+ # Worker behavior
36
+ poll_seconds: int
37
+ max_messages_per_poll: int
38
+ render_pages: int
39
+ render_dpi: int
40
+
41
+
42
+ def load_settings(repo_root: Path) -> Settings:
43
+ backend_dir = repo_root / "backend"
44
+ worker_dir = backend_dir / "worker"
45
+
46
+ # IMPORTANT: use the SAME env var you actually store in backend/.env
47
+ # Your file shows OPENAI_API_KEY_TEST=...
48
+ openai_api_key = os.environ.get("OPENAI_API_KEY_TEST", "").strip()
49
+ if not openai_api_key:
50
+ raise RuntimeError("Missing OPENAI_API_KEY_TEST env var in backend/.env")
51
+
52
+ notify_to = os.environ.get("PDF_PIPELINE_NOTIFY_TO", "").strip()
53
+ if not notify_to:
54
+ raise RuntimeError("Missing PDF_PIPELINE_NOTIFY_TO env var")
55
+
56
+ notify_from = os.environ.get("PDF_PIPELINE_NOTIFY_FROM", "").strip()
57
+ if not notify_from:
58
+ raise RuntimeError("Missing PDF_PIPELINE_NOTIFY_FROM env var")
59
+
60
+ trainer_base_url = os.environ.get("PDF_TRAINER_BASE_URL", "http://localhost:5173").strip()
61
+ if not trainer_base_url:
62
+ raise RuntimeError("Missing PDF_TRAINER_BASE_URL env var")
63
+
64
+ return Settings(
65
+ repo_root=repo_root,
66
+ backend_dir=backend_dir,
67
+ worker_dir=worker_dir,
68
+
69
+ credentials_path=Path(os.environ.get("GMAIL_CREDENTIALS_JSON", str(backend_dir / "credentials.json"))),
70
+ token_path=Path(os.environ.get("GMAIL_TOKEN_JSON", str(backend_dir / "token.json"))),
71
+
72
+ label_incoming=os.environ.get("PDF_PIPELINE_LABEL_INCOMING", "PDF_PIPELINE/INCOMING"),
73
+ label_known=os.environ.get("PDF_PIPELINE_LABEL_KNOWN", "PDF_PIPELINE/KNOWN"),
74
+ label_unknown=os.environ.get("PDF_PIPELINE_LABEL_UNKNOWN", "PDF_PIPELINE/UNKNOWN"),
75
+ label_train=os.environ.get("PDF_PIPELINE_LABEL_TRAIN", "PDF_PIPELINE/TRAIN"),
76
+
77
+ notify_to_email=notify_to,
78
+ notify_from_email=notify_from,
79
+
80
+ trainer_base_url=trainer_base_url,
81
+
82
+ openai_api_key=openai_api_key,
83
+ openai_model=os.environ.get("OPENAI_MODEL", "gpt-4.1-mini"),
84
+
85
+ poll_seconds=int(os.environ.get("PDF_PIPELINE_POLL_SECONDS", "20")),
86
+ max_messages_per_poll=int(os.environ.get("PDF_PIPELINE_MAX_PER_POLL", "5")),
87
+ render_pages=int(os.environ.get("PDF_PIPELINE_RENDER_PAGES", "2")),
88
+ render_dpi=int(os.environ.get("PDF_PIPELINE_RENDER_DPI", "200")),
89
+ )
backend/worker/gmail_client.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import base64
3
+ import os
4
+ from dataclasses import dataclass
5
+ from email.message import EmailMessage
6
+ from pathlib import Path
7
+ from typing import List, Optional, Tuple
8
+
9
+ from google.oauth2.credentials import Credentials
10
+ from googleapiclient.discovery import build
11
+
12
+
13
+ SCOPES = [
14
+ "https://www.googleapis.com/auth/gmail.modify",
15
+ "https://www.googleapis.com/auth/gmail.send",
16
+ ]
17
+
18
+
19
+ @dataclass
20
+ class GmailMessage:
21
+ msg_id: str
22
+ thread_id: str
23
+
24
+
25
+ class GmailClient:
26
+ def __init__(self, credentials_path: Path, token_path: Path):
27
+ if not credentials_path.exists():
28
+ raise FileNotFoundError(f"Missing OAuth client json: {credentials_path}")
29
+ if not token_path.exists():
30
+ raise FileNotFoundError(f"Missing token json: {token_path}")
31
+
32
+ creds = Credentials.from_authorized_user_file(str(token_path), SCOPES)
33
+ self.service = build("gmail", "v1", credentials=creds, cache_discovery=False)
34
+
35
+ def list_labels(self) -> List[dict]:
36
+ resp = self.service.users().labels().list(userId="me").execute()
37
+ return resp.get("labels", [])
38
+
39
+ def get_label_id(self, name: str) -> Optional[str]:
40
+ for lbl in self.list_labels():
41
+ if lbl.get("name") == name:
42
+ return lbl.get("id")
43
+ return None
44
+
45
+ def ensure_label(self, name: str) -> str:
46
+ existing = self.get_label_id(name)
47
+ if existing:
48
+ return existing
49
+
50
+ body = {
51
+ "name": name,
52
+ "labelListVisibility": "labelShow",
53
+ "messageListVisibility": "show",
54
+ }
55
+ created = self.service.users().labels().create(userId="me", body=body).execute()
56
+ return created["id"]
57
+
58
+ def search_unread_pdf_messages(self, label_name: str, max_results: int = 10) -> List[GmailMessage]:
59
+ # Gmail search query: label + unread + pdf attachments
60
+ query = f'label:"{label_name}" is:unread has:attachment filename:pdf'
61
+ resp = self.service.users().messages().list(userId="me", q=query, maxResults=max_results).execute()
62
+ msgs = resp.get("messages", []) or []
63
+ out: List[GmailMessage] = []
64
+ for m in msgs:
65
+ out.append(GmailMessage(msg_id=m["id"], thread_id=m.get("threadId", "")))
66
+ return out
67
+
68
+ def get_message_full(self, msg_id: str) -> dict:
69
+ return self.service.users().messages().get(userId="me", id=msg_id, format="full").execute()
70
+
71
+ def _walk_parts(self, payload: dict) -> List[dict]:
72
+ parts = []
73
+ stack = [payload]
74
+ while stack:
75
+ node = stack.pop()
76
+ if not isinstance(node, dict):
77
+ continue
78
+ if node.get("parts"):
79
+ stack.extend(node["parts"])
80
+ parts.append(node)
81
+ return parts
82
+
83
+ def list_pdf_attachments(self, msg_full: dict) -> List[Tuple[str, str]]:
84
+ """
85
+ Returns [(filename, attachmentId), ...] for application/pdf parts.
86
+ """
87
+ payload = msg_full.get("payload", {}) or {}
88
+ parts = self._walk_parts(payload)
89
+
90
+ out: List[Tuple[str, str]] = []
91
+ for p in parts:
92
+ filename = (p.get("filename") or "").strip()
93
+ body = p.get("body") or {}
94
+ att_id = body.get("attachmentId")
95
+ mime = (p.get("mimeType") or "").lower()
96
+
97
+ if filename.lower().endswith(".pdf") or mime == "application/pdf":
98
+ if filename and att_id:
99
+ out.append((filename, att_id))
100
+ return out
101
+
102
+ def download_attachment(self, msg_id: str, attachment_id: str) -> bytes:
103
+ att = (
104
+ self.service.users()
105
+ .messages()
106
+ .attachments()
107
+ .get(userId="me", messageId=msg_id, id=attachment_id)
108
+ .execute()
109
+ )
110
+ data = att.get("data", "")
111
+ return base64.urlsafe_b64decode(data.encode("utf-8"))
112
+
113
+ def move_message(
114
+ self,
115
+ msg_id: str,
116
+ add_labels: List[str],
117
+ remove_labels: List[str],
118
+ mark_read: bool = True,
119
+ ) -> None:
120
+ add_ids = [self.ensure_label(n) for n in add_labels]
121
+ remove_ids = [self.ensure_label(n) for n in remove_labels]
122
+
123
+ if mark_read:
124
+ remove_ids.append("UNREAD")
125
+
126
+ body = {"addLabelIds": add_ids, "removeLabelIds": remove_ids}
127
+ self.service.users().messages().modify(userId="me", id=msg_id, body=body).execute()
128
+
129
+ def send_email(self, to_email: str, subject: str, body_text: str, from_email: Optional[str] = None, attachments: Optional[List[Tuple[str, bytes]]] = None) -> None:
130
+ msg = EmailMessage()
131
+ msg["To"] = to_email
132
+ msg["Subject"] = subject
133
+ if from_email:
134
+ msg["From"] = from_email
135
+ msg.set_content(body_text)
136
+
137
+ attachments = attachments or []
138
+ for filename, data in attachments:
139
+ # basic content type guess for pdf/json
140
+ if filename.lower().endswith(".pdf"):
141
+ maintype, subtype = "application", "pdf"
142
+ elif filename.lower().endswith(".json"):
143
+ maintype, subtype = "application", "json"
144
+ else:
145
+ maintype, subtype = "application", "octet-stream"
146
+ msg.add_attachment(data, maintype=maintype, subtype=subtype, filename=filename)
147
+
148
+ raw = base64.urlsafe_b64encode(msg.as_bytes()).decode("utf-8")
149
+ self.service.users().messages().send(userId="me", body={"raw": raw}).execute()
backend/worker/openai_classifier.py ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import base64
4
+ import json
5
+ import re
6
+ from pathlib import Path
7
+ from typing import Any, Dict, List, Optional, Tuple
8
+
9
+ from openai import OpenAI
10
+
11
+
12
+ # ----------------------------
13
+ # Known templates (mirror your main system)
14
+ # ----------------------------
15
+ KNOWN_TEMPLATES: List[Dict[str, Any]] = [
16
+ {
17
+ "template_id": "T1_IFACTOR_DELIVERED_ORDER",
18
+ "name": "I-FACTOR Delivered Order Form",
19
+ "keywords_all": ["delivered order form"],
20
+ "keywords_any": ["i-factor", "cerapedics", "product information", "stickers", "bill to", "delivered to"],
21
+ },
22
+ {
23
+ "template_id": "T2_SEASPINE_DELIVERED_GOODS_FORM",
24
+ "name": "SeaSpine Delivered Goods Form",
25
+ "keywords_all": ["delivered goods form"],
26
+ "keywords_any": ["seaspine", "isotis", "handling fee", "sales order", "invoice"],
27
+ },
28
+ {
29
+ "template_id": "T3_ASTURA_SALES_ORDER_FORM",
30
+ "name": "Astura Sales Order Form",
31
+ "keywords_all": [],
32
+ "keywords_any": ["astura", "dc141", "ca200", "cbba", "sales order"],
33
+ },
34
+ {
35
+ "template_id": "T4_MEDICAL_ESTIMATION_OF_CHARGES",
36
+ "name": "Medical Estimation of Charges",
37
+ "keywords_all": [],
38
+ "keywords_any": ["estimation of charges", "good faith estimate", "patient responsibility", "insurance"],
39
+ },
40
+ {
41
+ "template_id": "T5_CLINICAL_PROGRESS_NOTE_POSTOP",
42
+ "name": "Clinical Progress Note Postop",
43
+ "keywords_all": [],
44
+ "keywords_any": ["clinical progress note", "progress note", "post-op", "assessment", "plan"],
45
+ },
46
+ {
47
+ "template_id": "T6_CUSTOMER_CHARGE_SHEET_SPINE",
48
+ "name": "Customer Charge Sheet Spine",
49
+ "keywords_all": [],
50
+ "keywords_any": ["customer charge sheet", "charge sheet", "spine", "qty", "unit price", "total"],
51
+ },
52
+ {
53
+ "template_id": "T7_SALES_ORDER_ZIMMER",
54
+ "name": "Zimmer Sales Order",
55
+ "keywords_all": [],
56
+ "keywords_any": ["zimmer", "zimmer biomet", "biomet", "sales order", "purchase order", "po number"],
57
+ },
58
+ ]
59
+
60
+
61
+ # ----------------------------
62
+ # Public API (EXPLICIT key/model)
63
+ # ----------------------------
64
+ def classify_with_openai(
65
+ image_paths: List[str],
66
+ *,
67
+ api_key: str,
68
+ model: str,
69
+ max_pages: int = 2,
70
+ ) -> Dict[str, Any]:
71
+ """
72
+ Input: list of PNG file paths (page renders).
73
+ Output:
74
+ {
75
+ "template_id": "T1_..." OR "UNKNOWN",
76
+ "confidence": 0..1,
77
+ "reason": "short string",
78
+ "trainer_schema": {} # reserved for later
79
+ }
80
+
81
+ Hard guarantees:
82
+ - does NOT read environment variables
83
+ - does NOT guess api keys
84
+ - strict normalization to known template_ids
85
+ """
86
+ api_key = (api_key or "").strip()
87
+ model = (model or "").strip()
88
+
89
+ if not api_key:
90
+ raise RuntimeError("classify_with_openai: api_key is empty")
91
+ if not model:
92
+ raise RuntimeError("classify_with_openai: model is empty")
93
+
94
+ if not image_paths:
95
+ return {
96
+ "template_id": "UNKNOWN",
97
+ "confidence": 0.0,
98
+ "reason": "No rendered images provided.",
99
+ "trainer_schema": {},
100
+ }
101
+
102
+ # Encode first N pages (keep small + deterministic)
103
+ pages_b64: List[str] = []
104
+ for p in image_paths[: max_pages if max_pages > 0 else 1]:
105
+ pages_b64.append(_png_file_to_b64(Path(p)))
106
+
107
+ client = OpenAI(api_key=api_key)
108
+
109
+ system = (
110
+ "You are a strict document template classifier.\n"
111
+ "You will be shown PNG images of PDF pages (scanned forms).\n"
112
+ "Your job is to decide which known template matches.\n\n"
113
+ "Hard rules:\n"
114
+ "1) Output VALID JSON only. No markdown. No extra text.\n"
115
+ "2) Choose ONE template_id from the provided list OR return template_id='UNKNOWN'.\n"
116
+ "3) If uncertain, return UNKNOWN.\n"
117
+ "4) Use printed headers, vendor branding, and distinctive layout cues.\n"
118
+ "5) confidence must be 0..1.\n"
119
+ )
120
+
121
+ prompt_payload = {
122
+ "known_templates": KNOWN_TEMPLATES,
123
+ "output_schema": {
124
+ "template_id": "string (one of known template_ids) OR 'UNKNOWN'",
125
+ "confidence": "number 0..1",
126
+ "reason": "short string",
127
+ },
128
+ }
129
+
130
+ user_text = (
131
+ "Classify the attached document images against known_templates.\n"
132
+ "Return JSON matching output_schema.\n\n"
133
+ f"{json.dumps(prompt_payload, indent=2)}"
134
+ )
135
+
136
+ # Multi-modal message: text + images
137
+ content: List[Dict[str, Any]] = [{"type": "text", "text": user_text}]
138
+ for b64png in pages_b64:
139
+ content.append(
140
+ {
141
+ "type": "image_url",
142
+ "image_url": {"url": f"data:image/png;base64,{b64png}"},
143
+ }
144
+ )
145
+
146
+ resp = client.chat.completions.create(
147
+ model=model,
148
+ temperature=0.0,
149
+ messages=[
150
+ {"role": "system", "content": system},
151
+ {"role": "user", "content": content},
152
+ ],
153
+ )
154
+
155
+ raw = (resp.choices[0].message.content or "").strip()
156
+ parsed = _parse_json_object(raw)
157
+
158
+ template_id = str(parsed.get("template_id") or "").strip()
159
+ confidence = _to_float(parsed.get("confidence"), default=0.0)
160
+ confidence = max(0.0, min(1.0, confidence))
161
+ reason = str(parsed.get("reason") or "").strip()
162
+
163
+ # Normalize: only allow known template ids or UNKNOWN
164
+ template_id = _normalize_template_id(template_id)
165
+
166
+ # If model returns UNKNOWN but gives high confidence, clamp confidence.
167
+ if template_id == "UNKNOWN" and confidence > 0.6:
168
+ confidence = 0.6
169
+
170
+ return {
171
+ "template_id": template_id,
172
+ "confidence": confidence,
173
+ "reason": reason[:500],
174
+ "trainer_schema": {},
175
+ }
176
+
177
+
178
+ # ----------------------------
179
+ # Legacy wrapper (ENV-based) - keep only if you want
180
+ # ----------------------------
181
+ def classify_with_openai_from_env(image_paths: List[str]) -> Dict[str, Any]:
182
+ """
183
+ Backwards compatible wrapper.
184
+ Reads env vars, then calls classify_with_openai(api_key=..., model=...).
185
+
186
+ Use this only if you have old code you haven't updated yet.
187
+ """
188
+ import os
189
+
190
+ api_key = (os.getenv("OPENAI_API_KEY_TEST") or os.getenv("OPENAI_API_KEY") or "").strip()
191
+ if not api_key:
192
+ raise RuntimeError("Missing OPENAI_API_KEY_TEST (or OPENAI_API_KEY)")
193
+
194
+ model = (os.getenv("OPENAI_MODEL") or "gpt-4o-mini").strip()
195
+
196
+ # IMPORTANT: call the explicit version (one implementation only)
197
+ return classify_with_openai(
198
+ image_paths,
199
+ api_key=api_key,
200
+ model=model,
201
+ )
202
+
203
+
204
+ # ----------------------------
205
+ # Helpers
206
+ # ----------------------------
207
+ def _normalize_template_id(template_id: str) -> str:
208
+ tid = (template_id or "").strip()
209
+ if not tid:
210
+ return "UNKNOWN"
211
+
212
+ known_ids = {t["template_id"] for t in KNOWN_TEMPLATES}
213
+ if tid in known_ids:
214
+ return tid
215
+
216
+ # common garbage patterns (model returns name instead of id, etc.)
217
+ low = tid.lower()
218
+ for t in KNOWN_TEMPLATES:
219
+ if t["name"].lower() == low:
220
+ return t["template_id"]
221
+
222
+ return "UNKNOWN"
223
+
224
+
225
+ def _png_file_to_b64(path: Path) -> str:
226
+ data = path.read_bytes()
227
+ return base64.b64encode(data).decode("utf-8")
228
+
229
+
230
+ _JSON_BLOCK_RE = re.compile(r"\{.*\}", re.DOTALL)
231
+
232
+
233
+ def _parse_json_object(text: str) -> Dict[str, Any]:
234
+ """
235
+ Extract and parse the first {...} JSON object from model output.
236
+ Handles:
237
+ - pure JSON
238
+ - JSON embedded in text
239
+ - fenced code blocks (we strip fences)
240
+ """
241
+ if not text:
242
+ return {}
243
+
244
+ s = text.strip()
245
+
246
+ # Strip ```json fences if present
247
+ s = _strip_code_fences(s)
248
+
249
+ # Fast path: starts with "{"
250
+ if s.startswith("{"):
251
+ try:
252
+ return json.loads(s)
253
+ except Exception:
254
+ pass
255
+
256
+ # Try to find a JSON-looking block
257
+ m = _JSON_BLOCK_RE.search(s)
258
+ if not m:
259
+ return {}
260
+
261
+ chunk = m.group(0)
262
+ try:
263
+ return json.loads(chunk)
264
+ except Exception:
265
+ # last attempt: remove trailing commas (common model mistake)
266
+ cleaned = _remove_trailing_commas(chunk)
267
+ try:
268
+ return json.loads(cleaned)
269
+ except Exception:
270
+ return {}
271
+
272
+
273
+ def _strip_code_fences(s: str) -> str:
274
+ # remove leading ```json / ``` and trailing ```
275
+ if s.startswith("```"):
276
+ s = re.sub(r"^```[a-zA-Z0-9]*\s*", "", s)
277
+ s = re.sub(r"\s*```$", "", s)
278
+ return s.strip()
279
+
280
+
281
+ def _remove_trailing_commas(s: str) -> str:
282
+ # naive but effective: remove ",}" and ",]" patterns repeatedly
283
+ prev = None
284
+ cur = s
285
+ while prev != cur:
286
+ prev = cur
287
+ cur = re.sub(r",\s*}", "}", cur)
288
+ cur = re.sub(r",\s*]", "]", cur)
289
+ return cur
290
+
291
+
292
+ def _to_float(x: Any, default: float = 0.0) -> float:
293
+ try:
294
+ return float(x)
295
+ except Exception:
296
+ return default
297
+
298
+
299
+ # ----------------------------
300
+ # Optional: quick self-check (manual)
301
+ # ----------------------------
302
+ def _debug_summarize_result(res: Dict[str, Any]) -> str:
303
+ return f"template_id={res.get('template_id')} conf={res.get('confidence')} reason={str(res.get('reason') or '')[:80]}"
304
+
305
+
306
+ def _validate_known_templates() -> Tuple[bool, str]:
307
+ ids = [t.get("template_id") for t in KNOWN_TEMPLATES]
308
+ if any(not i for i in ids):
309
+ return False, "One or more templates missing template_id"
310
+ if len(set(ids)) != len(ids):
311
+ return False, "Duplicate template_id in KNOWN_TEMPLATES"
312
+ return True, "ok"
backend/worker/out/.keep ADDED
File without changes
backend/worker/pdf_render.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from pathlib import Path
5
+ from typing import List
6
+
7
+ import fitz # PyMuPDF
8
+ from PIL import Image
9
+
10
+
11
+ @dataclass
12
+ class RenderedImage:
13
+ path: Path
14
+ page_index: int
15
+
16
+
17
+ def render_pdf_to_pngs(pdf_path: Path, out_dir: Path, pages: int = 2, dpi: int = 200) -> List[RenderedImage]:
18
+ out_dir.mkdir(parents=True, exist_ok=True)
19
+
20
+ doc = fitz.open(pdf_path)
21
+ n = min(pages, doc.page_count)
22
+
23
+ zoom = dpi / 72.0
24
+ mat = fitz.Matrix(zoom, zoom)
25
+
26
+ rendered: List[RenderedImage] = []
27
+ for i in range(n):
28
+ page = doc.load_page(i)
29
+ pix = page.get_pixmap(matrix=mat, alpha=False)
30
+
31
+ img_path = out_dir / f"{pdf_path.stem}_p{i+1}.png"
32
+ pix.save(str(img_path))
33
+
34
+ # normalize to RGB with PIL (avoids weird modes)
35
+ im = Image.open(img_path).convert("RGB")
36
+ im.save(img_path)
37
+
38
+ rendered.append(RenderedImage(path=img_path, page_index=i))
39
+
40
+ doc.close()
41
+ return rendered
backend/worker/prompts.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ TEMPLATE_IDS = [
2
+ "T1_IFACTOR_DELIVERED_ORDER",
3
+ "T2_SEASPINE_DELIVERED_GOODS_FORM",
4
+ "T3_ASTURA_SALES_ORDER_FORM",
5
+ "T4_MEDICAL_ESTIMATION_OF_CHARGES",
6
+ "T5_CLINICAL_PROGRESS_NOTE_POSTOP",
7
+ "T6_CUSTOMER_CHARGE_SHEET_SPINE",
8
+ "T7_SALES_ORDER_ZIMMER",
9
+ ]
10
+
11
+ SYSTEM_PROMPT = f"""
12
+ You are classifying a medical/healthcare sales/order PDF form into one of the known templates,
13
+ and extracting a "trainer schema" for onboarding.
14
+
15
+ Known template_ids:
16
+ {TEMPLATE_IDS}
17
+
18
+ Rules:
19
+ - You MUST return JSON only (no markdown, no extra text).
20
+ - If none match confidently, return template_id "UNKNOWN".
21
+ - Always produce a schema object (even for UNKNOWN) so onboarding can proceed.
22
+
23
+ Output JSON shape (strict):
24
+ {{
25
+ "template_id": "<one of known template_ids or UNKNOWN>",
26
+ "confidence": 0.0,
27
+ "reason": "<short reason>",
28
+ "trainer_schema": {{
29
+ "form_id": "<suggested id>",
30
+ "version": 1,
31
+ "page": 1,
32
+ "scalar_value_region_mode": "offset_from_anchor_v1",
33
+ "fields": [
34
+ {{
35
+ "field_id": "facility_organization",
36
+ "label": "Facility / Organization",
37
+ "type": "entity",
38
+ "anchor_hint": "<printed label text or None>",
39
+ "value_hint": "<what to extract>"
40
+ }},
41
+ {{
42
+ "field_id": "case_location_address",
43
+ "label": "Case Location / Address",
44
+ "type": "entity",
45
+ "anchor_hint": "<printed label text or None>",
46
+ "value_hint": "<what to extract>"
47
+ }},
48
+ {{
49
+ "field_id": "vendor",
50
+ "label": "Vendor",
51
+ "type": "entity",
52
+ "anchor_hint": "<printed label text or None>",
53
+ "value_hint": "<what to extract>"
54
+ }},
55
+ {{
56
+ "field_id": "physician_name",
57
+ "label": "Physician Name",
58
+ "type": "person",
59
+ "anchor_hint": "<printed label text or None>",
60
+ "value_hint": "<what to extract>"
61
+ }},
62
+ {{
63
+ "field_id": "date_of_surgery",
64
+ "label": "Date of Surgery",
65
+ "type": "date",
66
+ "anchor_hint": "<printed label text or None>",
67
+ "value_hint": "<what to extract>"
68
+ }},
69
+ {{
70
+ "field_id": "items",
71
+ "label": "Items / Line Items",
72
+ "type": "table",
73
+ "table_hint": {{
74
+ "expected_columns": ["item_number","description","qty","lot_number","price","extended_price"],
75
+ "where_on_page": "<short description>",
76
+ "header_text_examples": ["Item Number","Description","Qty"]
77
+ }}
78
+ }}
79
+ ]
80
+ }}
81
+ }}
82
+ """
83
+
84
+ USER_PROMPT = """
85
+ Classify the form template and generate trainer_schema based on the provided page images.
86
+ Focus on printed structure, titles, logos, and table headers.
87
+ """
backend/worker/template_registry_snapshot.py ADDED
File without changes
backend/worker/template_store.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+ import json
3
+ from pathlib import Path
4
+ from typing import Any, Dict, List
5
+
6
+ TEMPLATE_DIR = Path(__file__).resolve().parent / "trainer_templates"
7
+
8
+ def list_trainer_templates() -> List[Dict[str, Any]]:
9
+ TEMPLATE_DIR.mkdir(parents=True, exist_ok=True)
10
+ out: List[Dict[str, Any]] = []
11
+
12
+ for p in sorted(TEMPLATE_DIR.glob("*.json")):
13
+ try:
14
+ cfg = json.loads(p.read_text(encoding="utf-8"))
15
+ except Exception:
16
+ continue
17
+
18
+ template_id = cfg.get("template_id") or cfg.get("form_id") or p.stem
19
+ name = cfg.get("name") or cfg.get("form_id") or template_id
20
+
21
+ out.append({
22
+ "template_id": template_id,
23
+ "name": name,
24
+ # optional: trainer config itself (don’t spam prompt if huge)
25
+ "has_config": True,
26
+ })
27
+
28
+ return out
29
+
30
+ def save_trainer_template(template_id: str, cfg: Dict[str, Any]) -> Path:
31
+ TEMPLATE_DIR.mkdir(parents=True, exist_ok=True)
32
+ cfg = dict(cfg)
33
+ cfg["template_id"] = template_id # enforce
34
+ path = TEMPLATE_DIR / f"{template_id}.json"
35
+ path.write_text(json.dumps(cfg, indent=2), encoding="utf-8")
36
+ return path
backend/worker/tmp/.keep ADDED
File without changes
backend/worker/uploads/.keep ADDED
File without changes
backend/worker/worker.py ADDED
@@ -0,0 +1,286 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import time
5
+ import uuid
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import List, Tuple
9
+
10
+ from dotenv import load_dotenv
11
+
12
+ from .gmail_client import GmailClient
13
+ from .openai_classifier import classify_with_openai
14
+ from .pdf_render import render_pdf_to_pngs
15
+
16
+ # Force load repo_root/backend/.env (single source of truth)
17
+ REPO_ROOT = Path(__file__).resolve().parents[2]
18
+ load_dotenv(REPO_ROOT / "backend" / ".env", override=True)
19
+
20
+
21
+ @dataclass
22
+ class Settings:
23
+ creds_path: Path
24
+ token_path: Path
25
+
26
+ label_incoming: str
27
+ label_known: str
28
+ label_unknown: str
29
+ label_train: str
30
+
31
+ # Rep email for UNKNOWN detection
32
+ rep_notify_to: str
33
+ notify_from: str
34
+
35
+ # OpenAI
36
+ openai_api_key: str
37
+ openai_model: str
38
+
39
+ poll_seconds: int
40
+ max_messages_per_poll: int
41
+
42
+ render_pages: int
43
+ render_dpi: int
44
+
45
+ trainer_base_url: str
46
+
47
+
48
+ def load_settings() -> Settings:
49
+ base = Path(__file__).resolve().parents[1] # backend/
50
+ creds = Path(os.environ.get("GMAIL_CREDENTIALS_JSON", str(base / "credentials.json")))
51
+ token = Path(os.environ.get("GMAIL_TOKEN_JSON", str(base / "token.json")))
52
+
53
+ openai_api_key = (os.environ.get("OPENAI_API_KEY_TEST") or os.environ.get("OPENAI_API_KEY") or "").strip()
54
+ openai_model = (os.environ.get("OPENAI_MODEL") or "gpt-4o-mini").strip()
55
+
56
+ return Settings(
57
+ creds_path=creds,
58
+ token_path=token,
59
+
60
+ label_incoming=os.environ.get("PDF_PIPELINE_LABEL_INCOMING", "PDF_PIPELINE/INCOMING"),
61
+ label_known=os.environ.get("PDF_PIPELINE_LABEL_KNOWN", "PDF_PIPELINE/KNOWN"),
62
+ label_unknown=os.environ.get("PDF_PIPELINE_LABEL_UNKNOWN", "PDF_PIPELINE/UNKNOWN"),
63
+ label_train=os.environ.get("PDF_PIPELINE_LABEL_TRAIN", "PDF_PIPELINE/TRAIN"),
64
+
65
+ notify_from=(os.environ.get("PDF_PIPELINE_NOTIFY_FROM") or "").strip(),
66
+ rep_notify_to=(os.environ.get("PDF_PIPELINE_NOTIFY_TO") or "").strip(),
67
+
68
+ openai_api_key=openai_api_key,
69
+ openai_model=openai_model,
70
+
71
+ poll_seconds=int(os.environ.get("PDF_PIPELINE_POLL_SECONDS", "20")),
72
+ max_messages_per_poll=int(os.environ.get("PDF_PIPELINE_MAX_PER_POLL", "5")),
73
+
74
+ render_pages=int(os.environ.get("PDF_PIPELINE_RENDER_PAGES", "2")),
75
+ render_dpi=int(os.environ.get("PDF_PIPELINE_RENDER_DPI", "200")),
76
+
77
+ trainer_base_url=(os.environ.get("PDF_TRAINER_BASE_URL") or "http://localhost:5173").strip(),
78
+ )
79
+
80
+
81
+ def _safe_name(s: str) -> str:
82
+ return "".join(c if c.isalnum() or c in ("-", "_", ".", " ") else "_" for c in s).strip()
83
+
84
+
85
+ def _write_pipeline_pdf(root_worker_dir: Path, filename: str, pdf_bytes: bytes) -> Tuple[str, Path]:
86
+ """
87
+ Persist PDF for the trainer to fetch later.
88
+ Returns (pdf_id, pdf_path_on_disk).
89
+ """
90
+ uploads_dir = root_worker_dir / "uploads"
91
+ uploads_dir.mkdir(parents=True, exist_ok=True)
92
+
93
+ pdf_id = uuid.uuid4().hex
94
+ pdf_path = uploads_dir / f"{pdf_id}.pdf"
95
+ name_path = uploads_dir / f"{pdf_id}.name.txt"
96
+
97
+ pdf_path.write_bytes(pdf_bytes)
98
+ name_path.write_text(filename, encoding="utf-8")
99
+
100
+ return pdf_id, pdf_path
101
+
102
+
103
+ def _process_train_label(gmail: GmailClient, s: Settings, root: Path) -> None:
104
+ """
105
+ TRAIN behavior:
106
+ - Pull unread PDFs from TRAIN label
107
+ - Store into uploads/ and print trainer link
108
+ - Mark read
109
+ - Do NOT classify
110
+ - Do NOT move labels
111
+ """
112
+ msgs = gmail.search_unread_pdf_messages(s.label_train, max_results=s.max_messages_per_poll)
113
+ if not msgs:
114
+ return
115
+
116
+ for m in msgs:
117
+ msg_full = gmail.get_message_full(m.msg_id)
118
+ pdf_atts = gmail.list_pdf_attachments(msg_full)
119
+
120
+ if not pdf_atts:
121
+ gmail.move_message(m.msg_id, add_labels=[], remove_labels=[], mark_read=True)
122
+ continue
123
+
124
+ for (filename, att_id) in pdf_atts:
125
+ filename = _safe_name(filename or "attachment.pdf")
126
+ pdf_bytes = gmail.download_attachment(m.msg_id, att_id)
127
+
128
+ pdf_id, stored_pdf_path = _write_pipeline_pdf(root, filename, pdf_bytes)
129
+ trainer_link = f"{s.trainer_base_url.rstrip('/')}/?pdf_id={pdf_id}"
130
+
131
+ gmail.move_message(m.msg_id, add_labels=[], remove_labels=[], mark_read=True)
132
+
133
+ print(
134
+ f"[worker][TRAIN] stored PDF msg={m.msg_id} file={filename} "
135
+ f"pdf_id={pdf_id} stored={stored_pdf_path}"
136
+ )
137
+ print(f"[worker][TRAIN] open: {trainer_link}")
138
+
139
+
140
+ def main():
141
+ s = load_settings()
142
+
143
+ # Validate settings
144
+ if not s.rep_notify_to:
145
+ raise RuntimeError("Missing PDF_PIPELINE_NOTIFY_TO (rep email for UNKNOWN detection)")
146
+ if not s.notify_from:
147
+ raise RuntimeError("Missing PDF_PIPELINE_NOTIFY_FROM (OAuth Gmail account email)")
148
+ if not s.trainer_base_url:
149
+ raise RuntimeError("Missing PDF_TRAINER_BASE_URL (base URL for trainer link)")
150
+ if not s.openai_api_key:
151
+ raise RuntimeError("Missing OPENAI_API_KEY_TEST (or OPENAI_API_KEY) in backend/.env")
152
+
153
+ gmail = GmailClient(s.creds_path, s.token_path)
154
+
155
+ # Ensure labels exist
156
+ gmail.ensure_label(s.label_incoming)
157
+ gmail.ensure_label(s.label_known)
158
+ gmail.ensure_label(s.label_unknown)
159
+ gmail.ensure_label(s.label_train)
160
+
161
+ root = Path(__file__).resolve().parents[0] # backend/worker
162
+ tmp_dir = root / "tmp"
163
+ tmp_dir.mkdir(parents=True, exist_ok=True)
164
+
165
+ print(f"[worker] Watching label: {s.label_incoming}")
166
+ print(f"[worker] Known label: {s.label_known}")
167
+ print(f"[worker] Unknown label: {s.label_unknown}")
168
+ print(f"[worker] Train label: {s.label_train}")
169
+ print(f"[worker] Rep notify to: {s.rep_notify_to}")
170
+ print(f"[worker] OpenAI model: {s.openai_model}")
171
+
172
+ while True:
173
+ try:
174
+ # 1) TRAIN lane
175
+ _process_train_label(gmail, s, root)
176
+
177
+ # 2) Main pipeline (INCOMING -> KNOWN/UNKNOWN)
178
+ msgs = gmail.search_unread_pdf_messages(s.label_incoming, max_results=s.max_messages_per_poll)
179
+ if not msgs:
180
+ time.sleep(s.poll_seconds)
181
+ continue
182
+
183
+ for m in msgs:
184
+ msg_full = gmail.get_message_full(m.msg_id)
185
+ pdf_atts = gmail.list_pdf_attachments(msg_full)
186
+
187
+ if not pdf_atts:
188
+ # Remove INCOMING + mark read so it doesn't loop forever
189
+ gmail.move_message(m.msg_id, add_labels=[], remove_labels=[s.label_incoming], mark_read=True)
190
+ continue
191
+
192
+ any_unknown = False
193
+ unknown_payloads: List[Tuple[str, bytes]] = []
194
+
195
+ # Classify all PDF attachments for this message
196
+ for (filename, att_id) in pdf_atts:
197
+ filename = _safe_name(filename or "attachment.pdf")
198
+ pdf_bytes = gmail.download_attachment(m.msg_id, att_id)
199
+
200
+ stamp = str(int(time.time()))
201
+ pdf_path = tmp_dir / f"{stamp}_{m.msg_id}_{filename}"
202
+ pdf_path.write_bytes(pdf_bytes)
203
+
204
+ img_dir = tmp_dir / f"{stamp}_{m.msg_id}_{pdf_path.stem}"
205
+ rendered = render_pdf_to_pngs(pdf_path, img_dir, pages=s.render_pages, dpi=s.render_dpi)
206
+ image_paths = [str(r.path) for r in rendered]
207
+
208
+ result = classify_with_openai(
209
+ image_paths,
210
+ api_key=s.openai_api_key,
211
+ model=s.openai_model,
212
+ )
213
+
214
+ template_id = (result.get("template_id") or "UNKNOWN").strip()
215
+ conf = float(result.get("confidence") or 0.0)
216
+
217
+ if template_id == "UNKNOWN":
218
+ any_unknown = True
219
+ unknown_payloads.append((filename, pdf_bytes))
220
+ print(f"[worker] UNKNOWN attachment conf={conf:.3f} msg={m.msg_id} file={filename}")
221
+ else:
222
+ print(
223
+ f"[worker] KNOWN attachment template={template_id} conf={conf:.3f} "
224
+ f"msg={m.msg_id} file={filename}"
225
+ )
226
+
227
+ # Apply Gmail label ONCE per message
228
+ if any_unknown:
229
+ gmail.move_message(
230
+ m.msg_id,
231
+ add_labels=[s.label_unknown],
232
+ remove_labels=[s.label_incoming],
233
+ mark_read=True,
234
+ )
235
+ else:
236
+ gmail.move_message(
237
+ m.msg_id,
238
+ add_labels=[s.label_known],
239
+ remove_labels=[s.label_incoming],
240
+ mark_read=True,
241
+ )
242
+
243
+ # Notify rep for each unknown PDF attachment
244
+ if any_unknown:
245
+ for (filename, pdf_bytes) in unknown_payloads:
246
+ pdf_id, stored_pdf_path = _write_pipeline_pdf(root, filename, pdf_bytes)
247
+ trainer_link = f"{s.trainer_base_url.rstrip('/')}/?pdf_id={pdf_id}"
248
+
249
+ subject = "Action required: Unknown PDF format (template not found)"
250
+ body = (
251
+ "Hi,\n\n"
252
+ "We received a PDF that does not match any existing templates in the system.\n\n"
253
+ "Please open the PDF Trainer using the link below and create or update the template configuration:\n"
254
+ f"{trainer_link}\n\n"
255
+ "The original PDF is attached for reference.\n\n"
256
+ "Thank you,\n"
257
+ "Inserio Automation\n"
258
+ )
259
+
260
+ attachments: List[Tuple[str, bytes]] = []
261
+ if len(pdf_bytes) < 20 * 1024 * 1024:
262
+ attachments.append((filename, pdf_bytes))
263
+ else:
264
+ body += "\nNote: The PDF was too large to attach.\n"
265
+
266
+ gmail.send_email(
267
+ to_email=s.rep_notify_to,
268
+ from_email=s.notify_from,
269
+ subject=subject,
270
+ body_text=body,
271
+ attachments=attachments,
272
+ )
273
+
274
+ print(
275
+ f"[worker] UNKNOWN: emailed rep {s.rep_notify_to} msg={m.msg_id} file={filename} "
276
+ f"pdf_id={pdf_id} stored={stored_pdf_path}"
277
+ )
278
+
279
+ except Exception as e:
280
+ print(f"[worker] ERROR: {e}")
281
+
282
+ time.sleep(s.poll_seconds)
283
+
284
+
285
+ if __name__ == "__main__":
286
+ main()
requirements.txt CHANGED
@@ -1,18 +1,8 @@
1
- # Google / Gmail
2
- google-api-python-client==2.111.0
3
- google-auth==2.27.0
4
- google-auth-oauthlib==1.2.0
5
-
6
- # OpenAI
7
- openai==1.12.0
8
-
9
- # PDF -> image
10
- PyMuPDF==1.23.26
11
- Pillow==10.2.0
12
-
13
- # Utilities
14
- python-dotenv==1.0.1
15
- requests==2.31.0
16
-
17
- fastapi==0.115.6
18
- uvicorn==0.30.6
 
1
+ google-api-python-client
2
+ google-auth
3
+ google-auth-oauthlib
4
+ google-auth-httplib2
5
+ python-dotenv
6
+ pydantic
7
+ requests
8
+ Pillow