Spaces:
Runtime error
Runtime error
Commit ·
7fd3f6f
1
Parent(s): 4478465
fix: restore worker code + entrypoint
Browse files- Dockerfile +5 -22
- app.py +4 -0
- backend/__init__.py +0 -0
- backend/worker/__init__.py +0 -0
- backend/worker/config.py +89 -0
- backend/worker/gmail_client.py +149 -0
- backend/worker/openai_classifier.py +312 -0
- backend/worker/out/.keep +0 -0
- backend/worker/pdf_render.py +41 -0
- backend/worker/prompts.py +87 -0
- backend/worker/template_registry_snapshot.py +0 -0
- backend/worker/template_store.py +36 -0
- backend/worker/tmp/.keep +0 -0
- backend/worker/uploads/.keep +0 -0
- backend/worker/worker.py +286 -0
- requirements.txt +8 -18
Dockerfile
CHANGED
|
@@ -1,25 +1,8 @@
|
|
| 1 |
-
|
| 2 |
-
FROM node:20-alpine AS build
|
| 3 |
|
| 4 |
WORKDIR /app
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
COPY
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
COPY . .
|
| 10 |
-
RUN npm run build
|
| 11 |
-
|
| 12 |
-
# ---- Runtime stage ----
|
| 13 |
-
FROM nginx:alpine
|
| 14 |
-
|
| 15 |
-
# Remove default nginx config
|
| 16 |
-
RUN rm /etc/nginx/conf.d/default.conf
|
| 17 |
-
|
| 18 |
-
# Custom nginx config
|
| 19 |
-
COPY nginx.conf /etc/nginx/conf.d/default.conf
|
| 20 |
-
|
| 21 |
-
# Copy built assets
|
| 22 |
-
COPY --from=build /app/dist /usr/share/nginx/html
|
| 23 |
-
|
| 24 |
-
EXPOSE 7860
|
| 25 |
-
CMD ["nginx", "-g", "daemon off;"]
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
|
|
|
| 2 |
|
| 3 |
WORKDIR /app
|
| 4 |
+
COPY requirements.txt /app/requirements.txt
|
| 5 |
+
RUN pip install --no-cache-dir -r /app/requirements.txt
|
| 6 |
|
| 7 |
+
COPY . /app
|
| 8 |
+
CMD ["python", "-u", "app.py"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from backend.worker.worker import main
|
| 2 |
+
|
| 3 |
+
if __name__ == "__main__":
|
| 4 |
+
main()
|
backend/__init__.py
ADDED
|
File without changes
|
backend/worker/__init__.py
ADDED
|
File without changes
|
backend/worker/config.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
@dataclass(frozen=True)
|
| 9 |
+
class Settings:
|
| 10 |
+
# Repo paths
|
| 11 |
+
repo_root: Path
|
| 12 |
+
backend_dir: Path
|
| 13 |
+
worker_dir: Path
|
| 14 |
+
|
| 15 |
+
# Gmail
|
| 16 |
+
credentials_path: Path
|
| 17 |
+
token_path: Path
|
| 18 |
+
|
| 19 |
+
label_incoming: str
|
| 20 |
+
label_known: str
|
| 21 |
+
label_unknown: str
|
| 22 |
+
label_train: str
|
| 23 |
+
|
| 24 |
+
# Notification
|
| 25 |
+
notify_to_email: str
|
| 26 |
+
notify_from_email: str
|
| 27 |
+
|
| 28 |
+
# Trainer
|
| 29 |
+
trainer_base_url: str
|
| 30 |
+
|
| 31 |
+
# OpenAI
|
| 32 |
+
openai_api_key: str
|
| 33 |
+
openai_model: str
|
| 34 |
+
|
| 35 |
+
# Worker behavior
|
| 36 |
+
poll_seconds: int
|
| 37 |
+
max_messages_per_poll: int
|
| 38 |
+
render_pages: int
|
| 39 |
+
render_dpi: int
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def load_settings(repo_root: Path) -> Settings:
|
| 43 |
+
backend_dir = repo_root / "backend"
|
| 44 |
+
worker_dir = backend_dir / "worker"
|
| 45 |
+
|
| 46 |
+
# IMPORTANT: use the SAME env var you actually store in backend/.env
|
| 47 |
+
# Your file shows OPENAI_API_KEY_TEST=...
|
| 48 |
+
openai_api_key = os.environ.get("OPENAI_API_KEY_TEST", "").strip()
|
| 49 |
+
if not openai_api_key:
|
| 50 |
+
raise RuntimeError("Missing OPENAI_API_KEY_TEST env var in backend/.env")
|
| 51 |
+
|
| 52 |
+
notify_to = os.environ.get("PDF_PIPELINE_NOTIFY_TO", "").strip()
|
| 53 |
+
if not notify_to:
|
| 54 |
+
raise RuntimeError("Missing PDF_PIPELINE_NOTIFY_TO env var")
|
| 55 |
+
|
| 56 |
+
notify_from = os.environ.get("PDF_PIPELINE_NOTIFY_FROM", "").strip()
|
| 57 |
+
if not notify_from:
|
| 58 |
+
raise RuntimeError("Missing PDF_PIPELINE_NOTIFY_FROM env var")
|
| 59 |
+
|
| 60 |
+
trainer_base_url = os.environ.get("PDF_TRAINER_BASE_URL", "http://localhost:5173").strip()
|
| 61 |
+
if not trainer_base_url:
|
| 62 |
+
raise RuntimeError("Missing PDF_TRAINER_BASE_URL env var")
|
| 63 |
+
|
| 64 |
+
return Settings(
|
| 65 |
+
repo_root=repo_root,
|
| 66 |
+
backend_dir=backend_dir,
|
| 67 |
+
worker_dir=worker_dir,
|
| 68 |
+
|
| 69 |
+
credentials_path=Path(os.environ.get("GMAIL_CREDENTIALS_JSON", str(backend_dir / "credentials.json"))),
|
| 70 |
+
token_path=Path(os.environ.get("GMAIL_TOKEN_JSON", str(backend_dir / "token.json"))),
|
| 71 |
+
|
| 72 |
+
label_incoming=os.environ.get("PDF_PIPELINE_LABEL_INCOMING", "PDF_PIPELINE/INCOMING"),
|
| 73 |
+
label_known=os.environ.get("PDF_PIPELINE_LABEL_KNOWN", "PDF_PIPELINE/KNOWN"),
|
| 74 |
+
label_unknown=os.environ.get("PDF_PIPELINE_LABEL_UNKNOWN", "PDF_PIPELINE/UNKNOWN"),
|
| 75 |
+
label_train=os.environ.get("PDF_PIPELINE_LABEL_TRAIN", "PDF_PIPELINE/TRAIN"),
|
| 76 |
+
|
| 77 |
+
notify_to_email=notify_to,
|
| 78 |
+
notify_from_email=notify_from,
|
| 79 |
+
|
| 80 |
+
trainer_base_url=trainer_base_url,
|
| 81 |
+
|
| 82 |
+
openai_api_key=openai_api_key,
|
| 83 |
+
openai_model=os.environ.get("OPENAI_MODEL", "gpt-4.1-mini"),
|
| 84 |
+
|
| 85 |
+
poll_seconds=int(os.environ.get("PDF_PIPELINE_POLL_SECONDS", "20")),
|
| 86 |
+
max_messages_per_poll=int(os.environ.get("PDF_PIPELINE_MAX_PER_POLL", "5")),
|
| 87 |
+
render_pages=int(os.environ.get("PDF_PIPELINE_RENDER_PAGES", "2")),
|
| 88 |
+
render_dpi=int(os.environ.get("PDF_PIPELINE_RENDER_DPI", "200")),
|
| 89 |
+
)
|
backend/worker/gmail_client.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import base64
|
| 3 |
+
import os
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from email.message import EmailMessage
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import List, Optional, Tuple
|
| 8 |
+
|
| 9 |
+
from google.oauth2.credentials import Credentials
|
| 10 |
+
from googleapiclient.discovery import build
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
SCOPES = [
|
| 14 |
+
"https://www.googleapis.com/auth/gmail.modify",
|
| 15 |
+
"https://www.googleapis.com/auth/gmail.send",
|
| 16 |
+
]
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@dataclass
|
| 20 |
+
class GmailMessage:
|
| 21 |
+
msg_id: str
|
| 22 |
+
thread_id: str
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class GmailClient:
|
| 26 |
+
def __init__(self, credentials_path: Path, token_path: Path):
|
| 27 |
+
if not credentials_path.exists():
|
| 28 |
+
raise FileNotFoundError(f"Missing OAuth client json: {credentials_path}")
|
| 29 |
+
if not token_path.exists():
|
| 30 |
+
raise FileNotFoundError(f"Missing token json: {token_path}")
|
| 31 |
+
|
| 32 |
+
creds = Credentials.from_authorized_user_file(str(token_path), SCOPES)
|
| 33 |
+
self.service = build("gmail", "v1", credentials=creds, cache_discovery=False)
|
| 34 |
+
|
| 35 |
+
def list_labels(self) -> List[dict]:
|
| 36 |
+
resp = self.service.users().labels().list(userId="me").execute()
|
| 37 |
+
return resp.get("labels", [])
|
| 38 |
+
|
| 39 |
+
def get_label_id(self, name: str) -> Optional[str]:
|
| 40 |
+
for lbl in self.list_labels():
|
| 41 |
+
if lbl.get("name") == name:
|
| 42 |
+
return lbl.get("id")
|
| 43 |
+
return None
|
| 44 |
+
|
| 45 |
+
def ensure_label(self, name: str) -> str:
|
| 46 |
+
existing = self.get_label_id(name)
|
| 47 |
+
if existing:
|
| 48 |
+
return existing
|
| 49 |
+
|
| 50 |
+
body = {
|
| 51 |
+
"name": name,
|
| 52 |
+
"labelListVisibility": "labelShow",
|
| 53 |
+
"messageListVisibility": "show",
|
| 54 |
+
}
|
| 55 |
+
created = self.service.users().labels().create(userId="me", body=body).execute()
|
| 56 |
+
return created["id"]
|
| 57 |
+
|
| 58 |
+
def search_unread_pdf_messages(self, label_name: str, max_results: int = 10) -> List[GmailMessage]:
|
| 59 |
+
# Gmail search query: label + unread + pdf attachments
|
| 60 |
+
query = f'label:"{label_name}" is:unread has:attachment filename:pdf'
|
| 61 |
+
resp = self.service.users().messages().list(userId="me", q=query, maxResults=max_results).execute()
|
| 62 |
+
msgs = resp.get("messages", []) or []
|
| 63 |
+
out: List[GmailMessage] = []
|
| 64 |
+
for m in msgs:
|
| 65 |
+
out.append(GmailMessage(msg_id=m["id"], thread_id=m.get("threadId", "")))
|
| 66 |
+
return out
|
| 67 |
+
|
| 68 |
+
def get_message_full(self, msg_id: str) -> dict:
|
| 69 |
+
return self.service.users().messages().get(userId="me", id=msg_id, format="full").execute()
|
| 70 |
+
|
| 71 |
+
def _walk_parts(self, payload: dict) -> List[dict]:
|
| 72 |
+
parts = []
|
| 73 |
+
stack = [payload]
|
| 74 |
+
while stack:
|
| 75 |
+
node = stack.pop()
|
| 76 |
+
if not isinstance(node, dict):
|
| 77 |
+
continue
|
| 78 |
+
if node.get("parts"):
|
| 79 |
+
stack.extend(node["parts"])
|
| 80 |
+
parts.append(node)
|
| 81 |
+
return parts
|
| 82 |
+
|
| 83 |
+
def list_pdf_attachments(self, msg_full: dict) -> List[Tuple[str, str]]:
|
| 84 |
+
"""
|
| 85 |
+
Returns [(filename, attachmentId), ...] for application/pdf parts.
|
| 86 |
+
"""
|
| 87 |
+
payload = msg_full.get("payload", {}) or {}
|
| 88 |
+
parts = self._walk_parts(payload)
|
| 89 |
+
|
| 90 |
+
out: List[Tuple[str, str]] = []
|
| 91 |
+
for p in parts:
|
| 92 |
+
filename = (p.get("filename") or "").strip()
|
| 93 |
+
body = p.get("body") or {}
|
| 94 |
+
att_id = body.get("attachmentId")
|
| 95 |
+
mime = (p.get("mimeType") or "").lower()
|
| 96 |
+
|
| 97 |
+
if filename.lower().endswith(".pdf") or mime == "application/pdf":
|
| 98 |
+
if filename and att_id:
|
| 99 |
+
out.append((filename, att_id))
|
| 100 |
+
return out
|
| 101 |
+
|
| 102 |
+
def download_attachment(self, msg_id: str, attachment_id: str) -> bytes:
|
| 103 |
+
att = (
|
| 104 |
+
self.service.users()
|
| 105 |
+
.messages()
|
| 106 |
+
.attachments()
|
| 107 |
+
.get(userId="me", messageId=msg_id, id=attachment_id)
|
| 108 |
+
.execute()
|
| 109 |
+
)
|
| 110 |
+
data = att.get("data", "")
|
| 111 |
+
return base64.urlsafe_b64decode(data.encode("utf-8"))
|
| 112 |
+
|
| 113 |
+
def move_message(
|
| 114 |
+
self,
|
| 115 |
+
msg_id: str,
|
| 116 |
+
add_labels: List[str],
|
| 117 |
+
remove_labels: List[str],
|
| 118 |
+
mark_read: bool = True,
|
| 119 |
+
) -> None:
|
| 120 |
+
add_ids = [self.ensure_label(n) for n in add_labels]
|
| 121 |
+
remove_ids = [self.ensure_label(n) for n in remove_labels]
|
| 122 |
+
|
| 123 |
+
if mark_read:
|
| 124 |
+
remove_ids.append("UNREAD")
|
| 125 |
+
|
| 126 |
+
body = {"addLabelIds": add_ids, "removeLabelIds": remove_ids}
|
| 127 |
+
self.service.users().messages().modify(userId="me", id=msg_id, body=body).execute()
|
| 128 |
+
|
| 129 |
+
def send_email(self, to_email: str, subject: str, body_text: str, from_email: Optional[str] = None, attachments: Optional[List[Tuple[str, bytes]]] = None) -> None:
|
| 130 |
+
msg = EmailMessage()
|
| 131 |
+
msg["To"] = to_email
|
| 132 |
+
msg["Subject"] = subject
|
| 133 |
+
if from_email:
|
| 134 |
+
msg["From"] = from_email
|
| 135 |
+
msg.set_content(body_text)
|
| 136 |
+
|
| 137 |
+
attachments = attachments or []
|
| 138 |
+
for filename, data in attachments:
|
| 139 |
+
# basic content type guess for pdf/json
|
| 140 |
+
if filename.lower().endswith(".pdf"):
|
| 141 |
+
maintype, subtype = "application", "pdf"
|
| 142 |
+
elif filename.lower().endswith(".json"):
|
| 143 |
+
maintype, subtype = "application", "json"
|
| 144 |
+
else:
|
| 145 |
+
maintype, subtype = "application", "octet-stream"
|
| 146 |
+
msg.add_attachment(data, maintype=maintype, subtype=subtype, filename=filename)
|
| 147 |
+
|
| 148 |
+
raw = base64.urlsafe_b64encode(msg.as_bytes()).decode("utf-8")
|
| 149 |
+
self.service.users().messages().send(userId="me", body={"raw": raw}).execute()
|
backend/worker/openai_classifier.py
ADDED
|
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import base64
|
| 4 |
+
import json
|
| 5 |
+
import re
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 8 |
+
|
| 9 |
+
from openai import OpenAI
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# ----------------------------
|
| 13 |
+
# Known templates (mirror your main system)
|
| 14 |
+
# ----------------------------
|
| 15 |
+
KNOWN_TEMPLATES: List[Dict[str, Any]] = [
|
| 16 |
+
{
|
| 17 |
+
"template_id": "T1_IFACTOR_DELIVERED_ORDER",
|
| 18 |
+
"name": "I-FACTOR Delivered Order Form",
|
| 19 |
+
"keywords_all": ["delivered order form"],
|
| 20 |
+
"keywords_any": ["i-factor", "cerapedics", "product information", "stickers", "bill to", "delivered to"],
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"template_id": "T2_SEASPINE_DELIVERED_GOODS_FORM",
|
| 24 |
+
"name": "SeaSpine Delivered Goods Form",
|
| 25 |
+
"keywords_all": ["delivered goods form"],
|
| 26 |
+
"keywords_any": ["seaspine", "isotis", "handling fee", "sales order", "invoice"],
|
| 27 |
+
},
|
| 28 |
+
{
|
| 29 |
+
"template_id": "T3_ASTURA_SALES_ORDER_FORM",
|
| 30 |
+
"name": "Astura Sales Order Form",
|
| 31 |
+
"keywords_all": [],
|
| 32 |
+
"keywords_any": ["astura", "dc141", "ca200", "cbba", "sales order"],
|
| 33 |
+
},
|
| 34 |
+
{
|
| 35 |
+
"template_id": "T4_MEDICAL_ESTIMATION_OF_CHARGES",
|
| 36 |
+
"name": "Medical Estimation of Charges",
|
| 37 |
+
"keywords_all": [],
|
| 38 |
+
"keywords_any": ["estimation of charges", "good faith estimate", "patient responsibility", "insurance"],
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"template_id": "T5_CLINICAL_PROGRESS_NOTE_POSTOP",
|
| 42 |
+
"name": "Clinical Progress Note Postop",
|
| 43 |
+
"keywords_all": [],
|
| 44 |
+
"keywords_any": ["clinical progress note", "progress note", "post-op", "assessment", "plan"],
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"template_id": "T6_CUSTOMER_CHARGE_SHEET_SPINE",
|
| 48 |
+
"name": "Customer Charge Sheet Spine",
|
| 49 |
+
"keywords_all": [],
|
| 50 |
+
"keywords_any": ["customer charge sheet", "charge sheet", "spine", "qty", "unit price", "total"],
|
| 51 |
+
},
|
| 52 |
+
{
|
| 53 |
+
"template_id": "T7_SALES_ORDER_ZIMMER",
|
| 54 |
+
"name": "Zimmer Sales Order",
|
| 55 |
+
"keywords_all": [],
|
| 56 |
+
"keywords_any": ["zimmer", "zimmer biomet", "biomet", "sales order", "purchase order", "po number"],
|
| 57 |
+
},
|
| 58 |
+
]
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# ----------------------------
|
| 62 |
+
# Public API (EXPLICIT key/model)
|
| 63 |
+
# ----------------------------
|
| 64 |
+
def classify_with_openai(
|
| 65 |
+
image_paths: List[str],
|
| 66 |
+
*,
|
| 67 |
+
api_key: str,
|
| 68 |
+
model: str,
|
| 69 |
+
max_pages: int = 2,
|
| 70 |
+
) -> Dict[str, Any]:
|
| 71 |
+
"""
|
| 72 |
+
Input: list of PNG file paths (page renders).
|
| 73 |
+
Output:
|
| 74 |
+
{
|
| 75 |
+
"template_id": "T1_..." OR "UNKNOWN",
|
| 76 |
+
"confidence": 0..1,
|
| 77 |
+
"reason": "short string",
|
| 78 |
+
"trainer_schema": {} # reserved for later
|
| 79 |
+
}
|
| 80 |
+
|
| 81 |
+
Hard guarantees:
|
| 82 |
+
- does NOT read environment variables
|
| 83 |
+
- does NOT guess api keys
|
| 84 |
+
- strict normalization to known template_ids
|
| 85 |
+
"""
|
| 86 |
+
api_key = (api_key or "").strip()
|
| 87 |
+
model = (model or "").strip()
|
| 88 |
+
|
| 89 |
+
if not api_key:
|
| 90 |
+
raise RuntimeError("classify_with_openai: api_key is empty")
|
| 91 |
+
if not model:
|
| 92 |
+
raise RuntimeError("classify_with_openai: model is empty")
|
| 93 |
+
|
| 94 |
+
if not image_paths:
|
| 95 |
+
return {
|
| 96 |
+
"template_id": "UNKNOWN",
|
| 97 |
+
"confidence": 0.0,
|
| 98 |
+
"reason": "No rendered images provided.",
|
| 99 |
+
"trainer_schema": {},
|
| 100 |
+
}
|
| 101 |
+
|
| 102 |
+
# Encode first N pages (keep small + deterministic)
|
| 103 |
+
pages_b64: List[str] = []
|
| 104 |
+
for p in image_paths[: max_pages if max_pages > 0 else 1]:
|
| 105 |
+
pages_b64.append(_png_file_to_b64(Path(p)))
|
| 106 |
+
|
| 107 |
+
client = OpenAI(api_key=api_key)
|
| 108 |
+
|
| 109 |
+
system = (
|
| 110 |
+
"You are a strict document template classifier.\n"
|
| 111 |
+
"You will be shown PNG images of PDF pages (scanned forms).\n"
|
| 112 |
+
"Your job is to decide which known template matches.\n\n"
|
| 113 |
+
"Hard rules:\n"
|
| 114 |
+
"1) Output VALID JSON only. No markdown. No extra text.\n"
|
| 115 |
+
"2) Choose ONE template_id from the provided list OR return template_id='UNKNOWN'.\n"
|
| 116 |
+
"3) If uncertain, return UNKNOWN.\n"
|
| 117 |
+
"4) Use printed headers, vendor branding, and distinctive layout cues.\n"
|
| 118 |
+
"5) confidence must be 0..1.\n"
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
prompt_payload = {
|
| 122 |
+
"known_templates": KNOWN_TEMPLATES,
|
| 123 |
+
"output_schema": {
|
| 124 |
+
"template_id": "string (one of known template_ids) OR 'UNKNOWN'",
|
| 125 |
+
"confidence": "number 0..1",
|
| 126 |
+
"reason": "short string",
|
| 127 |
+
},
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
user_text = (
|
| 131 |
+
"Classify the attached document images against known_templates.\n"
|
| 132 |
+
"Return JSON matching output_schema.\n\n"
|
| 133 |
+
f"{json.dumps(prompt_payload, indent=2)}"
|
| 134 |
+
)
|
| 135 |
+
|
| 136 |
+
# Multi-modal message: text + images
|
| 137 |
+
content: List[Dict[str, Any]] = [{"type": "text", "text": user_text}]
|
| 138 |
+
for b64png in pages_b64:
|
| 139 |
+
content.append(
|
| 140 |
+
{
|
| 141 |
+
"type": "image_url",
|
| 142 |
+
"image_url": {"url": f"data:image/png;base64,{b64png}"},
|
| 143 |
+
}
|
| 144 |
+
)
|
| 145 |
+
|
| 146 |
+
resp = client.chat.completions.create(
|
| 147 |
+
model=model,
|
| 148 |
+
temperature=0.0,
|
| 149 |
+
messages=[
|
| 150 |
+
{"role": "system", "content": system},
|
| 151 |
+
{"role": "user", "content": content},
|
| 152 |
+
],
|
| 153 |
+
)
|
| 154 |
+
|
| 155 |
+
raw = (resp.choices[0].message.content or "").strip()
|
| 156 |
+
parsed = _parse_json_object(raw)
|
| 157 |
+
|
| 158 |
+
template_id = str(parsed.get("template_id") or "").strip()
|
| 159 |
+
confidence = _to_float(parsed.get("confidence"), default=0.0)
|
| 160 |
+
confidence = max(0.0, min(1.0, confidence))
|
| 161 |
+
reason = str(parsed.get("reason") or "").strip()
|
| 162 |
+
|
| 163 |
+
# Normalize: only allow known template ids or UNKNOWN
|
| 164 |
+
template_id = _normalize_template_id(template_id)
|
| 165 |
+
|
| 166 |
+
# If model returns UNKNOWN but gives high confidence, clamp confidence.
|
| 167 |
+
if template_id == "UNKNOWN" and confidence > 0.6:
|
| 168 |
+
confidence = 0.6
|
| 169 |
+
|
| 170 |
+
return {
|
| 171 |
+
"template_id": template_id,
|
| 172 |
+
"confidence": confidence,
|
| 173 |
+
"reason": reason[:500],
|
| 174 |
+
"trainer_schema": {},
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
# ----------------------------
|
| 179 |
+
# Legacy wrapper (ENV-based) - keep only if you want
|
| 180 |
+
# ----------------------------
|
| 181 |
+
def classify_with_openai_from_env(image_paths: List[str]) -> Dict[str, Any]:
|
| 182 |
+
"""
|
| 183 |
+
Backwards compatible wrapper.
|
| 184 |
+
Reads env vars, then calls classify_with_openai(api_key=..., model=...).
|
| 185 |
+
|
| 186 |
+
Use this only if you have old code you haven't updated yet.
|
| 187 |
+
"""
|
| 188 |
+
import os
|
| 189 |
+
|
| 190 |
+
api_key = (os.getenv("OPENAI_API_KEY_TEST") or os.getenv("OPENAI_API_KEY") or "").strip()
|
| 191 |
+
if not api_key:
|
| 192 |
+
raise RuntimeError("Missing OPENAI_API_KEY_TEST (or OPENAI_API_KEY)")
|
| 193 |
+
|
| 194 |
+
model = (os.getenv("OPENAI_MODEL") or "gpt-4o-mini").strip()
|
| 195 |
+
|
| 196 |
+
# IMPORTANT: call the explicit version (one implementation only)
|
| 197 |
+
return classify_with_openai(
|
| 198 |
+
image_paths,
|
| 199 |
+
api_key=api_key,
|
| 200 |
+
model=model,
|
| 201 |
+
)
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
# ----------------------------
|
| 205 |
+
# Helpers
|
| 206 |
+
# ----------------------------
|
| 207 |
+
def _normalize_template_id(template_id: str) -> str:
|
| 208 |
+
tid = (template_id or "").strip()
|
| 209 |
+
if not tid:
|
| 210 |
+
return "UNKNOWN"
|
| 211 |
+
|
| 212 |
+
known_ids = {t["template_id"] for t in KNOWN_TEMPLATES}
|
| 213 |
+
if tid in known_ids:
|
| 214 |
+
return tid
|
| 215 |
+
|
| 216 |
+
# common garbage patterns (model returns name instead of id, etc.)
|
| 217 |
+
low = tid.lower()
|
| 218 |
+
for t in KNOWN_TEMPLATES:
|
| 219 |
+
if t["name"].lower() == low:
|
| 220 |
+
return t["template_id"]
|
| 221 |
+
|
| 222 |
+
return "UNKNOWN"
|
| 223 |
+
|
| 224 |
+
|
| 225 |
+
def _png_file_to_b64(path: Path) -> str:
|
| 226 |
+
data = path.read_bytes()
|
| 227 |
+
return base64.b64encode(data).decode("utf-8")
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
_JSON_BLOCK_RE = re.compile(r"\{.*\}", re.DOTALL)
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
def _parse_json_object(text: str) -> Dict[str, Any]:
|
| 234 |
+
"""
|
| 235 |
+
Extract and parse the first {...} JSON object from model output.
|
| 236 |
+
Handles:
|
| 237 |
+
- pure JSON
|
| 238 |
+
- JSON embedded in text
|
| 239 |
+
- fenced code blocks (we strip fences)
|
| 240 |
+
"""
|
| 241 |
+
if not text:
|
| 242 |
+
return {}
|
| 243 |
+
|
| 244 |
+
s = text.strip()
|
| 245 |
+
|
| 246 |
+
# Strip ```json fences if present
|
| 247 |
+
s = _strip_code_fences(s)
|
| 248 |
+
|
| 249 |
+
# Fast path: starts with "{"
|
| 250 |
+
if s.startswith("{"):
|
| 251 |
+
try:
|
| 252 |
+
return json.loads(s)
|
| 253 |
+
except Exception:
|
| 254 |
+
pass
|
| 255 |
+
|
| 256 |
+
# Try to find a JSON-looking block
|
| 257 |
+
m = _JSON_BLOCK_RE.search(s)
|
| 258 |
+
if not m:
|
| 259 |
+
return {}
|
| 260 |
+
|
| 261 |
+
chunk = m.group(0)
|
| 262 |
+
try:
|
| 263 |
+
return json.loads(chunk)
|
| 264 |
+
except Exception:
|
| 265 |
+
# last attempt: remove trailing commas (common model mistake)
|
| 266 |
+
cleaned = _remove_trailing_commas(chunk)
|
| 267 |
+
try:
|
| 268 |
+
return json.loads(cleaned)
|
| 269 |
+
except Exception:
|
| 270 |
+
return {}
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def _strip_code_fences(s: str) -> str:
|
| 274 |
+
# remove leading ```json / ``` and trailing ```
|
| 275 |
+
if s.startswith("```"):
|
| 276 |
+
s = re.sub(r"^```[a-zA-Z0-9]*\s*", "", s)
|
| 277 |
+
s = re.sub(r"\s*```$", "", s)
|
| 278 |
+
return s.strip()
|
| 279 |
+
|
| 280 |
+
|
| 281 |
+
def _remove_trailing_commas(s: str) -> str:
|
| 282 |
+
# naive but effective: remove ",}" and ",]" patterns repeatedly
|
| 283 |
+
prev = None
|
| 284 |
+
cur = s
|
| 285 |
+
while prev != cur:
|
| 286 |
+
prev = cur
|
| 287 |
+
cur = re.sub(r",\s*}", "}", cur)
|
| 288 |
+
cur = re.sub(r",\s*]", "]", cur)
|
| 289 |
+
return cur
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
def _to_float(x: Any, default: float = 0.0) -> float:
|
| 293 |
+
try:
|
| 294 |
+
return float(x)
|
| 295 |
+
except Exception:
|
| 296 |
+
return default
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
# ----------------------------
|
| 300 |
+
# Optional: quick self-check (manual)
|
| 301 |
+
# ----------------------------
|
| 302 |
+
def _debug_summarize_result(res: Dict[str, Any]) -> str:
|
| 303 |
+
return f"template_id={res.get('template_id')} conf={res.get('confidence')} reason={str(res.get('reason') or '')[:80]}"
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def _validate_known_templates() -> Tuple[bool, str]:
|
| 307 |
+
ids = [t.get("template_id") for t in KNOWN_TEMPLATES]
|
| 308 |
+
if any(not i for i in ids):
|
| 309 |
+
return False, "One or more templates missing template_id"
|
| 310 |
+
if len(set(ids)) != len(ids):
|
| 311 |
+
return False, "Duplicate template_id in KNOWN_TEMPLATES"
|
| 312 |
+
return True, "ok"
|
backend/worker/out/.keep
ADDED
|
File without changes
|
backend/worker/pdf_render.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
from dataclasses import dataclass
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from typing import List
|
| 6 |
+
|
| 7 |
+
import fitz # PyMuPDF
|
| 8 |
+
from PIL import Image
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@dataclass
|
| 12 |
+
class RenderedImage:
|
| 13 |
+
path: Path
|
| 14 |
+
page_index: int
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def render_pdf_to_pngs(pdf_path: Path, out_dir: Path, pages: int = 2, dpi: int = 200) -> List[RenderedImage]:
|
| 18 |
+
out_dir.mkdir(parents=True, exist_ok=True)
|
| 19 |
+
|
| 20 |
+
doc = fitz.open(pdf_path)
|
| 21 |
+
n = min(pages, doc.page_count)
|
| 22 |
+
|
| 23 |
+
zoom = dpi / 72.0
|
| 24 |
+
mat = fitz.Matrix(zoom, zoom)
|
| 25 |
+
|
| 26 |
+
rendered: List[RenderedImage] = []
|
| 27 |
+
for i in range(n):
|
| 28 |
+
page = doc.load_page(i)
|
| 29 |
+
pix = page.get_pixmap(matrix=mat, alpha=False)
|
| 30 |
+
|
| 31 |
+
img_path = out_dir / f"{pdf_path.stem}_p{i+1}.png"
|
| 32 |
+
pix.save(str(img_path))
|
| 33 |
+
|
| 34 |
+
# normalize to RGB with PIL (avoids weird modes)
|
| 35 |
+
im = Image.open(img_path).convert("RGB")
|
| 36 |
+
im.save(img_path)
|
| 37 |
+
|
| 38 |
+
rendered.append(RenderedImage(path=img_path, page_index=i))
|
| 39 |
+
|
| 40 |
+
doc.close()
|
| 41 |
+
return rendered
|
backend/worker/prompts.py
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
TEMPLATE_IDS = [
|
| 2 |
+
"T1_IFACTOR_DELIVERED_ORDER",
|
| 3 |
+
"T2_SEASPINE_DELIVERED_GOODS_FORM",
|
| 4 |
+
"T3_ASTURA_SALES_ORDER_FORM",
|
| 5 |
+
"T4_MEDICAL_ESTIMATION_OF_CHARGES",
|
| 6 |
+
"T5_CLINICAL_PROGRESS_NOTE_POSTOP",
|
| 7 |
+
"T6_CUSTOMER_CHARGE_SHEET_SPINE",
|
| 8 |
+
"T7_SALES_ORDER_ZIMMER",
|
| 9 |
+
]
|
| 10 |
+
|
| 11 |
+
SYSTEM_PROMPT = f"""
|
| 12 |
+
You are classifying a medical/healthcare sales/order PDF form into one of the known templates,
|
| 13 |
+
and extracting a "trainer schema" for onboarding.
|
| 14 |
+
|
| 15 |
+
Known template_ids:
|
| 16 |
+
{TEMPLATE_IDS}
|
| 17 |
+
|
| 18 |
+
Rules:
|
| 19 |
+
- You MUST return JSON only (no markdown, no extra text).
|
| 20 |
+
- If none match confidently, return template_id "UNKNOWN".
|
| 21 |
+
- Always produce a schema object (even for UNKNOWN) so onboarding can proceed.
|
| 22 |
+
|
| 23 |
+
Output JSON shape (strict):
|
| 24 |
+
{{
|
| 25 |
+
"template_id": "<one of known template_ids or UNKNOWN>",
|
| 26 |
+
"confidence": 0.0,
|
| 27 |
+
"reason": "<short reason>",
|
| 28 |
+
"trainer_schema": {{
|
| 29 |
+
"form_id": "<suggested id>",
|
| 30 |
+
"version": 1,
|
| 31 |
+
"page": 1,
|
| 32 |
+
"scalar_value_region_mode": "offset_from_anchor_v1",
|
| 33 |
+
"fields": [
|
| 34 |
+
{{
|
| 35 |
+
"field_id": "facility_organization",
|
| 36 |
+
"label": "Facility / Organization",
|
| 37 |
+
"type": "entity",
|
| 38 |
+
"anchor_hint": "<printed label text or None>",
|
| 39 |
+
"value_hint": "<what to extract>"
|
| 40 |
+
}},
|
| 41 |
+
{{
|
| 42 |
+
"field_id": "case_location_address",
|
| 43 |
+
"label": "Case Location / Address",
|
| 44 |
+
"type": "entity",
|
| 45 |
+
"anchor_hint": "<printed label text or None>",
|
| 46 |
+
"value_hint": "<what to extract>"
|
| 47 |
+
}},
|
| 48 |
+
{{
|
| 49 |
+
"field_id": "vendor",
|
| 50 |
+
"label": "Vendor",
|
| 51 |
+
"type": "entity",
|
| 52 |
+
"anchor_hint": "<printed label text or None>",
|
| 53 |
+
"value_hint": "<what to extract>"
|
| 54 |
+
}},
|
| 55 |
+
{{
|
| 56 |
+
"field_id": "physician_name",
|
| 57 |
+
"label": "Physician Name",
|
| 58 |
+
"type": "person",
|
| 59 |
+
"anchor_hint": "<printed label text or None>",
|
| 60 |
+
"value_hint": "<what to extract>"
|
| 61 |
+
}},
|
| 62 |
+
{{
|
| 63 |
+
"field_id": "date_of_surgery",
|
| 64 |
+
"label": "Date of Surgery",
|
| 65 |
+
"type": "date",
|
| 66 |
+
"anchor_hint": "<printed label text or None>",
|
| 67 |
+
"value_hint": "<what to extract>"
|
| 68 |
+
}},
|
| 69 |
+
{{
|
| 70 |
+
"field_id": "items",
|
| 71 |
+
"label": "Items / Line Items",
|
| 72 |
+
"type": "table",
|
| 73 |
+
"table_hint": {{
|
| 74 |
+
"expected_columns": ["item_number","description","qty","lot_number","price","extended_price"],
|
| 75 |
+
"where_on_page": "<short description>",
|
| 76 |
+
"header_text_examples": ["Item Number","Description","Qty"]
|
| 77 |
+
}}
|
| 78 |
+
}}
|
| 79 |
+
]
|
| 80 |
+
}}
|
| 81 |
+
}}
|
| 82 |
+
"""
|
| 83 |
+
|
| 84 |
+
USER_PROMPT = """
|
| 85 |
+
Classify the form template and generate trainer_schema based on the provided page images.
|
| 86 |
+
Focus on printed structure, titles, logos, and table headers.
|
| 87 |
+
"""
|
backend/worker/template_registry_snapshot.py
ADDED
|
File without changes
|
backend/worker/template_store.py
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import json
import re
from pathlib import Path
from typing import Any, Dict, List

# Directory holding one JSON config file per trainer template.
TEMPLATE_DIR = Path(__file__).resolve().parent / "trainer_templates"

# Template ids become filenames. Restrict them so a caller-supplied id
# cannot escape TEMPLATE_DIR (e.g. "../../etc/passwd" or "a/b").
_TEMPLATE_ID_RE = re.compile(r"^[A-Za-z0-9._ -]+$")


def list_trainer_templates() -> List[Dict[str, Any]]:
    """Return a summary entry for every stored trainer template.

    Each entry carries ``template_id``, ``name`` and ``has_config``.
    Files that are missing/corrupt JSON are skipped silently so one bad
    template cannot break listing.
    """
    TEMPLATE_DIR.mkdir(parents=True, exist_ok=True)
    out: List[Dict[str, Any]] = []

    for p in sorted(TEMPLATE_DIR.glob("*.json")):
        try:
            cfg = json.loads(p.read_text(encoding="utf-8"))
        except Exception:
            # Best-effort: ignore unreadable or non-JSON template files.
            continue

        template_id = cfg.get("template_id") or cfg.get("form_id") or p.stem
        name = cfg.get("name") or cfg.get("form_id") or template_id

        out.append({
            "template_id": template_id,
            "name": name,
            # optional: trainer config itself (don't spam prompt if huge)
            "has_config": True,
        })

    return out


def save_trainer_template(template_id: str, cfg: Dict[str, Any]) -> Path:
    """Persist *cfg* as ``TEMPLATE_DIR/<template_id>.json`` and return the path.

    The stored config always has its ``template_id`` key forced to the
    filename's id so the two cannot drift apart.

    Raises:
        ValueError: if *template_id* is empty or contains characters that
            could alter the target path (path-traversal guard; the id is
            used verbatim as a filename).
    """
    if not template_id or not _TEMPLATE_ID_RE.match(template_id):
        raise ValueError(f"invalid template_id: {template_id!r}")

    TEMPLATE_DIR.mkdir(parents=True, exist_ok=True)
    cfg = dict(cfg)  # shallow copy: do not mutate the caller's dict
    cfg["template_id"] = template_id  # enforce
    path = TEMPLATE_DIR / f"{template_id}.json"
    path.write_text(json.dumps(cfg, indent=2), encoding="utf-8")
    return path
|
backend/worker/tmp/.keep
ADDED
|
File without changes
|
backend/worker/uploads/.keep
ADDED
|
File without changes
|
backend/worker/worker.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
import uuid
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import List, Tuple
|
| 9 |
+
|
| 10 |
+
from dotenv import load_dotenv
|
| 11 |
+
|
| 12 |
+
from .gmail_client import GmailClient
|
| 13 |
+
from .openai_classifier import classify_with_openai
|
| 14 |
+
from .pdf_render import render_pdf_to_pngs
|
| 15 |
+
|
| 16 |
+
# Force load repo_root/backend/.env (single source of truth)
|
| 17 |
+
REPO_ROOT = Path(__file__).resolve().parents[2]
|
| 18 |
+
load_dotenv(REPO_ROOT / "backend" / ".env", override=True)
|
| 19 |
+
|
| 20 |
+
|
@dataclass
class Settings:
    """Runtime configuration for the Gmail PDF-classification worker."""

    # Gmail OAuth client-secrets file and cached user token file.
    creds_path: Path
    token_path: Path

    # Gmail labels driving the pipeline state machine.
    label_incoming: str
    label_known: str
    label_unknown: str
    label_train: str

    # Rep email for UNKNOWN detection
    rep_notify_to: str
    notify_from: str

    # OpenAI
    openai_api_key: str
    openai_model: str

    # Poll cadence (seconds) and max messages handled per poll cycle.
    poll_seconds: int
    max_messages_per_poll: int

    # How many PDF pages to render to PNG, and at what DPI, before classifying.
    render_pages: int
    render_dpi: int

    # Base URL used to build trainer deep links sent to the rep.
    trainer_base_url: str
|
| 46 |
+
|
| 47 |
+
|
def load_settings() -> Settings:
    """Assemble worker Settings from environment variables.

    Credential paths default to files next to the ``backend/`` package;
    every other value has a fallback so the worker can start from a
    minimal .env file.
    """
    backend_dir = Path(__file__).resolve().parents[1]  # backend/
    env = os.environ.get

    # The test key takes precedence over the production key when both are set.
    api_key = (env("OPENAI_API_KEY_TEST") or env("OPENAI_API_KEY") or "").strip()
    model = (env("OPENAI_MODEL") or "gpt-4o-mini").strip()

    return Settings(
        creds_path=Path(env("GMAIL_CREDENTIALS_JSON", str(backend_dir / "credentials.json"))),
        token_path=Path(env("GMAIL_TOKEN_JSON", str(backend_dir / "token.json"))),

        label_incoming=env("PDF_PIPELINE_LABEL_INCOMING", "PDF_PIPELINE/INCOMING"),
        label_known=env("PDF_PIPELINE_LABEL_KNOWN", "PDF_PIPELINE/KNOWN"),
        label_unknown=env("PDF_PIPELINE_LABEL_UNKNOWN", "PDF_PIPELINE/UNKNOWN"),
        label_train=env("PDF_PIPELINE_LABEL_TRAIN", "PDF_PIPELINE/TRAIN"),

        notify_from=(env("PDF_PIPELINE_NOTIFY_FROM") or "").strip(),
        rep_notify_to=(env("PDF_PIPELINE_NOTIFY_TO") or "").strip(),

        openai_api_key=api_key,
        openai_model=model,

        poll_seconds=int(env("PDF_PIPELINE_POLL_SECONDS", "20")),
        max_messages_per_poll=int(env("PDF_PIPELINE_MAX_PER_POLL", "5")),

        render_pages=int(env("PDF_PIPELINE_RENDER_PAGES", "2")),
        render_dpi=int(env("PDF_PIPELINE_RENDER_DPI", "200")),

        trainer_base_url=(env("PDF_TRAINER_BASE_URL") or "http://localhost:5173").strip(),
    )
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _safe_name(s: str) -> str:
|
| 82 |
+
return "".join(c if c.isalnum() or c in ("-", "_", ".", " ") else "_" for c in s).strip()
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _write_pipeline_pdf(root_worker_dir: Path, filename: str, pdf_bytes: bytes) -> Tuple[str, Path]:
|
| 86 |
+
"""
|
| 87 |
+
Persist PDF for the trainer to fetch later.
|
| 88 |
+
Returns (pdf_id, pdf_path_on_disk).
|
| 89 |
+
"""
|
| 90 |
+
uploads_dir = root_worker_dir / "uploads"
|
| 91 |
+
uploads_dir.mkdir(parents=True, exist_ok=True)
|
| 92 |
+
|
| 93 |
+
pdf_id = uuid.uuid4().hex
|
| 94 |
+
pdf_path = uploads_dir / f"{pdf_id}.pdf"
|
| 95 |
+
name_path = uploads_dir / f"{pdf_id}.name.txt"
|
| 96 |
+
|
| 97 |
+
pdf_path.write_bytes(pdf_bytes)
|
| 98 |
+
name_path.write_text(filename, encoding="utf-8")
|
| 99 |
+
|
| 100 |
+
return pdf_id, pdf_path
|
| 101 |
+
|
| 102 |
+
|
def _process_train_label(gmail: GmailClient, s: Settings, root: Path) -> None:
    """Drain unread PDF messages from the TRAIN label into uploads/.

    TRAIN behavior:
    - Pull unread PDFs from TRAIN label
    - Store into uploads/ and print trainer link
    - Mark read
    - Do NOT classify
    - Do NOT move labels
    """
    msgs = gmail.search_unread_pdf_messages(s.label_train, max_results=s.max_messages_per_poll)
    if not msgs:
        return

    for m in msgs:
        msg_full = gmail.get_message_full(m.msg_id)
        pdf_atts = gmail.list_pdf_attachments(msg_full)

        if not pdf_atts:
            # No PDFs: just mark read so the message is not re-polled forever.
            gmail.move_message(m.msg_id, add_labels=[], remove_labels=[], mark_read=True)
            continue

        for (filename, att_id) in pdf_atts:
            filename = _safe_name(filename or "attachment.pdf")
            pdf_bytes = gmail.download_attachment(m.msg_id, att_id)

            pdf_id, stored_pdf_path = _write_pipeline_pdf(root, filename, pdf_bytes)
            trainer_link = f"{s.trainer_base_url.rstrip('/')}/?pdf_id={pdf_id}"

            gmail.move_message(m.msg_id, add_labels=[], remove_labels=[], mark_read=True)

            # BUG FIX: the log line printed the literal "(unknown)" instead of
            # the sanitized attachment filename.
            print(
                f"[worker][TRAIN] stored PDF msg={m.msg_id} file={filename} "
                f"pdf_id={pdf_id} stored={stored_pdf_path}"
            )
            print(f"[worker][TRAIN] open: {trainer_link}")
|
| 138 |
+
|
| 139 |
+
|
def main():
    """Entry point: poll Gmail forever and route PDF attachments.

    Two lanes per poll cycle:
      1. TRAIN label  -> store PDFs for the trainer (no classification).
      2. INCOMING     -> classify each PDF attachment with OpenAI, relabel
         the message KNOWN/UNKNOWN, and email the rep a trainer deep link
         for each unknown PDF.

    Raises:
        RuntimeError: at startup when a required setting is missing.
    """
    s = load_settings()

    # Validate settings (fail fast on missing required configuration).
    if not s.rep_notify_to:
        raise RuntimeError("Missing PDF_PIPELINE_NOTIFY_TO (rep email for UNKNOWN detection)")
    if not s.notify_from:
        raise RuntimeError("Missing PDF_PIPELINE_NOTIFY_FROM (OAuth Gmail account email)")
    if not s.trainer_base_url:
        raise RuntimeError("Missing PDF_TRAINER_BASE_URL (base URL for trainer link)")
    if not s.openai_api_key:
        raise RuntimeError("Missing OPENAI_API_KEY_TEST (or OPENAI_API_KEY) in backend/.env")

    gmail = GmailClient(s.creds_path, s.token_path)

    # Ensure labels exist
    gmail.ensure_label(s.label_incoming)
    gmail.ensure_label(s.label_known)
    gmail.ensure_label(s.label_unknown)
    gmail.ensure_label(s.label_train)

    root = Path(__file__).resolve().parents[0]  # backend/worker
    tmp_dir = root / "tmp"
    tmp_dir.mkdir(parents=True, exist_ok=True)

    print(f"[worker] Watching label: {s.label_incoming}")
    print(f"[worker] Known label: {s.label_known}")
    print(f"[worker] Unknown label: {s.label_unknown}")
    print(f"[worker] Train label: {s.label_train}")
    print(f"[worker] Rep notify to: {s.rep_notify_to}")
    print(f"[worker] OpenAI model: {s.openai_model}")

    while True:
        try:
            # 1) TRAIN lane
            _process_train_label(gmail, s, root)

            # 2) Main pipeline (INCOMING -> KNOWN/UNKNOWN)
            msgs = gmail.search_unread_pdf_messages(s.label_incoming, max_results=s.max_messages_per_poll)
            if not msgs:
                time.sleep(s.poll_seconds)
                continue

            for m in msgs:
                msg_full = gmail.get_message_full(m.msg_id)
                pdf_atts = gmail.list_pdf_attachments(msg_full)

                if not pdf_atts:
                    # Remove INCOMING + mark read so it doesn't loop forever
                    gmail.move_message(m.msg_id, add_labels=[], remove_labels=[s.label_incoming], mark_read=True)
                    continue

                any_unknown = False
                unknown_payloads: List[Tuple[str, bytes]] = []

                # Classify all PDF attachments for this message
                for (filename, att_id) in pdf_atts:
                    filename = _safe_name(filename or "attachment.pdf")
                    pdf_bytes = gmail.download_attachment(m.msg_id, att_id)

                    # BUG FIX: include the sanitized filename in the temp
                    # path. It previously ended in the literal "(unknown)",
                    # so two attachments on one message processed within the
                    # same second wrote to the same file and clobbered each
                    # other before rendering.
                    stamp = str(int(time.time()))
                    pdf_path = tmp_dir / f"{stamp}_{m.msg_id}_{filename}"
                    pdf_path.write_bytes(pdf_bytes)

                    img_dir = tmp_dir / f"{stamp}_{m.msg_id}_{pdf_path.stem}"
                    rendered = render_pdf_to_pngs(pdf_path, img_dir, pages=s.render_pages, dpi=s.render_dpi)
                    image_paths = [str(r.path) for r in rendered]

                    result = classify_with_openai(
                        image_paths,
                        api_key=s.openai_api_key,
                        model=s.openai_model,
                    )

                    template_id = (result.get("template_id") or "UNKNOWN").strip()
                    conf = float(result.get("confidence") or 0.0)

                    if template_id == "UNKNOWN":
                        any_unknown = True
                        unknown_payloads.append((filename, pdf_bytes))
                        print(f"[worker] UNKNOWN attachment conf={conf:.3f} msg={m.msg_id} file={filename}")
                    else:
                        print(
                            f"[worker] KNOWN attachment template={template_id} conf={conf:.3f} "
                            f"msg={m.msg_id} file={filename}"
                        )

                # Apply Gmail label ONCE per message; a single unclassified
                # attachment makes the whole message UNKNOWN.
                gmail.move_message(
                    m.msg_id,
                    add_labels=[s.label_unknown if any_unknown else s.label_known],
                    remove_labels=[s.label_incoming],
                    mark_read=True,
                )

                # Notify rep for each unknown PDF attachment
                for (filename, pdf_bytes) in unknown_payloads:
                    pdf_id, stored_pdf_path = _write_pipeline_pdf(root, filename, pdf_bytes)
                    trainer_link = f"{s.trainer_base_url.rstrip('/')}/?pdf_id={pdf_id}"

                    subject = "Action required: Unknown PDF format (template not found)"
                    body = (
                        "Hi,\n\n"
                        "We received a PDF that does not match any existing templates in the system.\n\n"
                        "Please open the PDF Trainer using the link below and create or update the template configuration:\n"
                        f"{trainer_link}\n\n"
                        "The original PDF is attached for reference.\n\n"
                        "Thank you,\n"
                        "Inserio Automation\n"
                    )

                    attachments: List[Tuple[str, bytes]] = []
                    # Keep well under Gmail's attachment ceiling.
                    if len(pdf_bytes) < 20 * 1024 * 1024:
                        attachments.append((filename, pdf_bytes))
                    else:
                        body += "\nNote: The PDF was too large to attach.\n"

                    gmail.send_email(
                        to_email=s.rep_notify_to,
                        from_email=s.notify_from,
                        subject=subject,
                        body_text=body,
                        attachments=attachments,
                    )

                    print(
                        f"[worker] UNKNOWN: emailed rep {s.rep_notify_to} msg={m.msg_id} file={filename} "
                        f"pdf_id={pdf_id} stored={stored_pdf_path}"
                    )

        except Exception as e:
            # Keep the daemon alive across transient Gmail/OpenAI failures.
            print(f"[worker] ERROR: {e}")

        time.sleep(s.poll_seconds)


if __name__ == "__main__":
    main()
|
requirements.txt
CHANGED
|
@@ -1,18 +1,8 @@
|
|
| 1 |
-
|
| 2 |
-
google-
|
| 3 |
-
google-auth
|
| 4 |
-
google-auth-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
# PDF -> image
|
| 10 |
-
PyMuPDF==1.23.26
|
| 11 |
-
Pillow==10.2.0
|
| 12 |
-
|
| 13 |
-
# Utilities
|
| 14 |
-
python-dotenv==1.0.1
|
| 15 |
-
requests==2.31.0
|
| 16 |
-
|
| 17 |
-
fastapi==0.115.6
|
| 18 |
-
uvicorn==0.30.6
|
|
|
|
google-api-python-client
google-auth
google-auth-oauthlib
google-auth-httplib2
python-dotenv
pydantic
requests
Pillow
# NOTE(review): this commit restores backend/worker/pdf_render.py
# (PDF -> PNG rendering), and the previous requirements pinned PyMuPDF
# for it. Without it the worker fails at import time — restore it.
PyMuPDF
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|