Spaces:

SAadmin
/

pdf-trainer-worker

Runtime error

App Files Files Community

Avinashnalla7 committed on Feb 1

Commit

7fd3f6f

1 Parent(s): 4478465

fix: restore worker code + entrypoint

Browse files

Files changed (16) hide show

Dockerfile +5 -22
app.py +4 -0
backend/__init__.py +0 -0
backend/worker/__init__.py +0 -0
backend/worker/config.py +89 -0
backend/worker/gmail_client.py +149 -0
backend/worker/openai_classifier.py +312 -0
backend/worker/out/.keep +0 -0
backend/worker/pdf_render.py +41 -0
backend/worker/prompts.py +87 -0
backend/worker/template_registry_snapshot.py +0 -0
backend/worker/template_store.py +36 -0
backend/worker/tmp/.keep +0 -0
backend/worker/uploads/.keep +0 -0
backend/worker/worker.py +286 -0
requirements.txt +8 -18

Dockerfile CHANGED Viewed

@@ -1,25 +1,8 @@
-# ---- Build stage ----
-FROM node:20-alpine AS build
 WORKDIR /app
-COPY package.json package-lock.json ./
-RUN npm ci
-COPY . .
-RUN npm run build
-# ---- Runtime stage ----
-FROM nginx:alpine
-# Remove default nginx config
-RUN rm /etc/nginx/conf.d/default.conf
-# Custom nginx config
-COPY nginx.conf /etc/nginx/conf.d/default.conf
-# Copy built assets
-COPY --from=build /app/dist /usr/share/nginx/html
-EXPOSE 7860
-CMD ["nginx", "-g", "daemon off;"]

+FROM python:3.11-slim
 WORKDIR /app
+COPY requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r /app/requirements.txt
+COPY . /app
+CMD ["python", "-u", "app.py"]

app.py ADDED Viewed

	@@ -0,0 +1,4 @@

+# Container entrypoint: the Dockerfile CMD runs `python -u app.py`, which hands off to the worker's main().
+from backend.worker.worker import main
+if __name__ == "__main__":
+    main()

backend/__init__.py ADDED Viewed

File without changes

backend/worker/__init__.py ADDED Viewed

File without changes

backend/worker/config.py ADDED Viewed

	@@ -0,0 +1,89 @@

+from __future__ import annotations
+import os
+from dataclasses import dataclass
+from pathlib import Path
+@dataclass(frozen=True)
+class Settings:
+    """Immutable worker configuration; instances are built by load_settings()."""
+    # Repo paths
+    repo_root: Path
+    backend_dir: Path  # repo_root / "backend"
+    worker_dir: Path  # backend_dir / "worker"
+    # Gmail
+    credentials_path: Path  # GMAIL_CREDENTIALS_JSON, default backend_dir / "credentials.json"
+    token_path: Path  # GMAIL_TOKEN_JSON, default backend_dir / "token.json"
+    label_incoming: str
+    label_known: str
+    label_unknown: str
+    label_train: str
+    # Notification
+    notify_to_email: str  # PDF_PIPELINE_NOTIFY_TO (required)
+    notify_from_email: str  # PDF_PIPELINE_NOTIFY_FROM (required)
+    # Trainer
+    trainer_base_url: str  # PDF_TRAINER_BASE_URL, default "http://localhost:5173"
+    # OpenAI
+    openai_api_key: str
+    openai_model: str
+    # Worker behavior
+    poll_seconds: int  # PDF_PIPELINE_POLL_SECONDS, default 20
+    max_messages_per_poll: int  # PDF_PIPELINE_MAX_PER_POLL, default 5
+    render_pages: int  # PDF_PIPELINE_RENDER_PAGES, default 2
+    render_dpi: int  # PDF_PIPELINE_RENDER_DPI, default 200
+def load_settings(repo_root: Path) -> Settings:
+    backend_dir = repo_root / "backend"
+    worker_dir = backend_dir / "worker"
+    # IMPORTANT: use the SAME env var you actually store in backend/.env
+    # Your file shows OPENAI_API_KEY_TEST=...
+    openai_api_key = os.environ.get("OPENAI_API_KEY_TEST", "").strip()
+    if not openai_api_key:
+        raise RuntimeError("Missing OPENAI_API_KEY_TEST env var in backend/.env")
+    notify_to = os.environ.get("PDF_PIPELINE_NOTIFY_TO", "").strip()
+    if not notify_to:
+        raise RuntimeError("Missing PDF_PIPELINE_NOTIFY_TO env var")
+    notify_from = os.environ.get("PDF_PIPELINE_NOTIFY_FROM", "").strip()
+    if not notify_from:
+        raise RuntimeError("Missing PDF_PIPELINE_NOTIFY_FROM env var")
+    trainer_base_url = os.environ.get("PDF_TRAINER_BASE_URL", "http://localhost:5173").strip()
+    if not trainer_base_url:
+        raise RuntimeError("Missing PDF_TRAINER_BASE_URL env var")
+    return Settings(
+        repo_root=repo_root,
+        backend_dir=backend_dir,
+        worker_dir=worker_dir,
+        credentials_path=Path(os.environ.get("GMAIL_CREDENTIALS_JSON", str(backend_dir / "credentials.json"))),
+        token_path=Path(os.environ.get("GMAIL_TOKEN_JSON", str(backend_dir / "token.json"))),
+        label_incoming=os.environ.get("PDF_PIPELINE_LABEL_INCOMING", "PDF_PIPELINE/INCOMING"),
+        label_known=os.environ.get("PDF_PIPELINE_LABEL_KNOWN", "PDF_PIPELINE/KNOWN"),
+        label_unknown=os.environ.get("PDF_PIPELINE_LABEL_UNKNOWN", "PDF_PIPELINE/UNKNOWN"),
+        label_train=os.environ.get("PDF_PIPELINE_LABEL_TRAIN", "PDF_PIPELINE/TRAIN"),
+        notify_to_email=notify_to,
+        notify_from_email=notify_from,
+        trainer_base_url=trainer_base_url,
+        openai_api_key=openai_api_key,
+        openai_model=os.environ.get("OPENAI_MODEL", "gpt-4.1-mini"),
+        poll_seconds=int(os.environ.get("PDF_PIPELINE_POLL_SECONDS", "20")),
+        max_messages_per_poll=int(os.environ.get("PDF_PIPELINE_MAX_PER_POLL", "5")),
+        render_pages=int(os.environ.get("PDF_PIPELINE_RENDER_PAGES", "2")),
+        render_dpi=int(os.environ.get("PDF_PIPELINE_RENDER_DPI", "200")),
+    )

backend/worker/gmail_client.py ADDED Viewed

	@@ -0,0 +1,149 @@

+from __future__ import annotations
+import base64
+import os
+from dataclasses import dataclass
+from email.message import EmailMessage
+from pathlib import Path
+from typing import List, Optional, Tuple
+from google.oauth2.credentials import Credentials
+from googleapiclient.discovery import build
+# OAuth scopes passed to Credentials.from_authorized_user_file() when GmailClient loads the token.
+SCOPES = [
+    "https://www.googleapis.com/auth/gmail.modify",
+    "https://www.googleapis.com/auth/gmail.send",
+]
+@dataclass
+class GmailMessage:
+    """Message id / thread id pair produced by GmailClient.search_unread_pdf_messages()."""
+    msg_id: str
+    thread_id: str  # "" when the list response entry carries no threadId
+class GmailClient:
+    def __init__(self, credentials_path: Path, token_path: Path):
+        if not credentials_path.exists():
+            raise FileNotFoundError(f"Missing OAuth client json: {credentials_path}")
+        if not token_path.exists():
+            raise FileNotFoundError(f"Missing token json: {token_path}")
+        creds = Credentials.from_authorized_user_file(str(token_path), SCOPES)
+        self.service = build("gmail", "v1", credentials=creds, cache_discovery=False)
+    def list_labels(self) -> List[dict]:
+        resp = self.service.users().labels().list(userId="me").execute()
+        return resp.get("labels", [])
+    def get_label_id(self, name: str) -> Optional[str]:
+        for lbl in self.list_labels():
+            if lbl.get("name") == name:
+                return lbl.get("id")
+        return None
+    def ensure_label(self, name: str) -> str:
+        existing = self.get_label_id(name)
+        if existing:
+            return existing
+        body = {
+            "name": name,
+            "labelListVisibility": "labelShow",
+            "messageListVisibility": "show",
+        }
+        created = self.service.users().labels().create(userId="me", body=body).execute()
+        return created["id"]
+    def search_unread_pdf_messages(self, label_name: str, max_results: int = 10) -> List[GmailMessage]:
+        # Gmail search query: label + unread + pdf attachments
+        query = f'label:"{label_name}" is:unread has:attachment filename:pdf'
+        resp = self.service.users().messages().list(userId="me", q=query, maxResults=max_results).execute()
+        msgs = resp.get("messages", []) or []
+        out: List[GmailMessage] = []
+        for m in msgs:
+            out.append(GmailMessage(msg_id=m["id"], thread_id=m.get("threadId", "")))
+        return out
+    def get_message_full(self, msg_id: str) -> dict:
+        return self.service.users().messages().get(userId="me", id=msg_id, format="full").execute()
+    def _walk_parts(self, payload: dict) -> List[dict]:
+        parts = []
+        stack = [payload]
+        while stack:
+            node = stack.pop()
+            if not isinstance(node, dict):
+                continue
+            if node.get("parts"):
+                stack.extend(node["parts"])
+            parts.append(node)
+        return parts
+    def list_pdf_attachments(self, msg_full: dict) -> List[Tuple[str, str]]:
+        """
+        Returns [(filename, attachmentId), ...] for application/pdf parts.
+        """
+        payload = msg_full.get("payload", {}) or {}
+        parts = self._walk_parts(payload)
+        out: List[Tuple[str, str]] = []
+        for p in parts:
+            filename = (p.get("filename") or "").strip()
+            body = p.get("body") or {}
+            att_id = body.get("attachmentId")
+            mime = (p.get("mimeType") or "").lower()
+            if filename.lower().endswith(".pdf") or mime == "application/pdf":
+                if filename and att_id:
+                    out.append((filename, att_id))
+        return out
+    def download_attachment(self, msg_id: str, attachment_id: str) -> bytes:
+        att = (
+            self.service.users()
+            .messages()
+            .attachments()
+            .get(userId="me", messageId=msg_id, id=attachment_id)
+            .execute()
+        )
+        data = att.get("data", "")
+        return base64.urlsafe_b64decode(data.encode("utf-8"))
+    def move_message(
+        self,
+        msg_id: str,
+        add_labels: List[str],
+        remove_labels: List[str],
+        mark_read: bool = True,
+    ) -> None:
+        add_ids = [self.ensure_label(n) for n in add_labels]
+        remove_ids = [self.ensure_label(n) for n in remove_labels]
+        if mark_read:
+            remove_ids.append("UNREAD")
+        body = {"addLabelIds": add_ids, "removeLabelIds": remove_ids}
+        self.service.users().messages().modify(userId="me", id=msg_id, body=body).execute()
+    def send_email(self, to_email: str, subject: str, body_text: str, from_email: Optional[str] = None, attachments: Optional[List[Tuple[str, bytes]]] = None) -> None:
+        msg = EmailMessage()
+        msg["To"] = to_email
+        msg["Subject"] = subject
+        if from_email:
+            msg["From"] = from_email
+        msg.set_content(body_text)
+        attachments = attachments or []
+        for filename, data in attachments:
+            # basic content type guess for pdf/json
+            if filename.lower().endswith(".pdf"):
+                maintype, subtype = "application", "pdf"
+            elif filename.lower().endswith(".json"):
+                maintype, subtype = "application", "json"
+            else:
+                maintype, subtype = "application", "octet-stream"
+            msg.add_attachment(data, maintype=maintype, subtype=subtype, filename=filename)
+        raw = base64.urlsafe_b64encode(msg.as_bytes()).decode("utf-8")
+        self.service.users().messages().send(userId="me", body={"raw": raw}).execute()

backend/worker/openai_classifier.py ADDED Viewed

	@@ -0,0 +1,312 @@

+from __future__ import annotations
+import base64
+import json
+import re
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple
+from openai import OpenAI
+# ----------------------------
+# Known templates (mirror your main system)
+# ----------------------------
+KNOWN_TEMPLATES: List[Dict[str, Any]] = [
+    {
+        "template_id": "T1_IFACTOR_DELIVERED_ORDER",
+        "name": "I-FACTOR Delivered Order Form",
+        "keywords_all": ["delivered order form"],
+        "keywords_any": ["i-factor", "cerapedics", "product information", "stickers", "bill to", "delivered to"],
+    },
+    {
+        "template_id": "T2_SEASPINE_DELIVERED_GOODS_FORM",
+        "name": "SeaSpine Delivered Goods Form",
+        "keywords_all": ["delivered goods form"],
+        "keywords_any": ["seaspine", "isotis", "handling fee", "sales order", "invoice"],
+    },
+    {
+        "template_id": "T3_ASTURA_SALES_ORDER_FORM",
+        "name": "Astura Sales Order Form",
+        "keywords_all": [],
+        "keywords_any": ["astura", "dc141", "ca200", "cbba", "sales order"],
+    },
+    {
+        "template_id": "T4_MEDICAL_ESTIMATION_OF_CHARGES",
+        "name": "Medical Estimation of Charges",
+        "keywords_all": [],
+        "keywords_any": ["estimation of charges", "good faith estimate", "patient responsibility", "insurance"],
+    },
+    {
+        "template_id": "T5_CLINICAL_PROGRESS_NOTE_POSTOP",
+        "name": "Clinical Progress Note Postop",
+        "keywords_all": [],
+        "keywords_any": ["clinical progress note", "progress note", "post-op", "assessment", "plan"],
+    },
+    {
+        "template_id": "T6_CUSTOMER_CHARGE_SHEET_SPINE",
+        "name": "Customer Charge Sheet Spine",
+        "keywords_all": [],
+        "keywords_any": ["customer charge sheet", "charge sheet", "spine", "qty", "unit price", "total"],
+    },
+    {
+        "template_id": "T7_SALES_ORDER_ZIMMER",
+        "name": "Zimmer Sales Order",
+        "keywords_all": [],
+        "keywords_any": ["zimmer", "zimmer biomet", "biomet", "sales order", "purchase order", "po number"],
+    },
+]
+# ----------------------------
+# Public API (EXPLICIT key/model)
+# ----------------------------
+def classify_with_openai(
+    image_paths: List[str],
+    *,
+    api_key: str,
+    model: str,
+    max_pages: int = 2,
+) -> Dict[str, Any]:
+    """
+    Input: list of PNG file paths (page renders).
+    Output:
+      {
+        "template_id": "T1_..." OR "UNKNOWN",
+        "confidence": 0..1,
+        "reason": "short string",
+        "trainer_schema": {}   # reserved for later
+      }
+    Hard guarantees:
+      - does NOT read environment variables
+      - does NOT guess api keys
+      - strict normalization to known template_ids
+    """
+    api_key = (api_key or "").strip()
+    model = (model or "").strip()
+    if not api_key:
+        raise RuntimeError("classify_with_openai: api_key is empty")
+    if not model:
+        raise RuntimeError("classify_with_openai: model is empty")
+    if not image_paths:
+        return {
+            "template_id": "UNKNOWN",
+            "confidence": 0.0,
+            "reason": "No rendered images provided.",
+            "trainer_schema": {},
+        }
+    # Encode first N pages (keep small + deterministic)
+    pages_b64: List[str] = []
+    for p in image_paths[: max_pages if max_pages > 0 else 1]:
+        pages_b64.append(_png_file_to_b64(Path(p)))
+    client = OpenAI(api_key=api_key)
+    system = (
+        "You are a strict document template classifier.\n"
+        "You will be shown PNG images of PDF pages (scanned forms).\n"
+        "Your job is to decide which known template matches.\n\n"
+        "Hard rules:\n"
+        "1) Output VALID JSON only. No markdown. No extra text.\n"
+        "2) Choose ONE template_id from the provided list OR return template_id='UNKNOWN'.\n"
+        "3) If uncertain, return UNKNOWN.\n"
+        "4) Use printed headers, vendor branding, and distinctive layout cues.\n"
+        "5) confidence must be 0..1.\n"
+    )
+    prompt_payload = {
+        "known_templates": KNOWN_TEMPLATES,
+        "output_schema": {
+            "template_id": "string (one of known template_ids) OR 'UNKNOWN'",
+            "confidence": "number 0..1",
+            "reason": "short string",
+        },
+    }
+    user_text = (
+        "Classify the attached document images against known_templates.\n"
+        "Return JSON matching output_schema.\n\n"
+        f"{json.dumps(prompt_payload, indent=2)}"
+    )
+    # Multi-modal message: text + images
+    content: List[Dict[str, Any]] = [{"type": "text", "text": user_text}]
+    for b64png in pages_b64:
+        content.append(
+            {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/png;base64,{b64png}"},
+            }
+        )
+    resp = client.chat.completions.create(
+        model=model,
+        temperature=0.0,
+        messages=[
+            {"role": "system", "content": system},
+            {"role": "user", "content": content},
+        ],
+    )
+    raw = (resp.choices[0].message.content or "").strip()
+    parsed = _parse_json_object(raw)
+    template_id = str(parsed.get("template_id") or "").strip()
+    confidence = _to_float(parsed.get("confidence"), default=0.0)
+    confidence = max(0.0, min(1.0, confidence))
+    reason = str(parsed.get("reason") or "").strip()
+    # Normalize: only allow known template ids or UNKNOWN
+    template_id = _normalize_template_id(template_id)
+    # If model returns UNKNOWN but gives high confidence, clamp confidence.
+    if template_id == "UNKNOWN" and confidence > 0.6:
+        confidence = 0.6
+    return {
+        "template_id": template_id,
+        "confidence": confidence,
+        "reason": reason[:500],
+        "trainer_schema": {},
+    }
+# ----------------------------
+# Legacy wrapper (ENV-based) - keep only if you want
+# ----------------------------
+def classify_with_openai_from_env(image_paths: List[str]) -> Dict[str, Any]:
+    """
+    Backwards compatible wrapper.
+    Reads env vars, then calls classify_with_openai(api_key=..., model=...).
+    Use this only if you have old code you haven't updated yet.
+    """
+    import os
+    api_key = (os.getenv("OPENAI_API_KEY_TEST") or os.getenv("OPENAI_API_KEY") or "").strip()
+    if not api_key:
+        raise RuntimeError("Missing OPENAI_API_KEY_TEST (or OPENAI_API_KEY)")
+    model = (os.getenv("OPENAI_MODEL") or "gpt-4o-mini").strip()
+    # IMPORTANT: call the explicit version (one implementation only)
+    return classify_with_openai(
+        image_paths,
+        api_key=api_key,
+        model=model,
+    )
+# ----------------------------
+# Helpers
+# ----------------------------
+def _normalize_template_id(template_id: str) -> str:
+    tid = (template_id or "").strip()
+    if not tid:
+        return "UNKNOWN"
+    known_ids = {t["template_id"] for t in KNOWN_TEMPLATES}
+    if tid in known_ids:
+        return tid
+    # common garbage patterns (model returns name instead of id, etc.)
+    low = tid.lower()
+    for t in KNOWN_TEMPLATES:
+        if t["name"].lower() == low:
+            return t["template_id"]
+    return "UNKNOWN"
+def _png_file_to_b64(path: Path) -> str:
+    data = path.read_bytes()
+    return base64.b64encode(data).decode("utf-8")
+_JSON_BLOCK_RE = re.compile(r"\{.*\}", re.DOTALL)
+def _parse_json_object(text: str) -> Dict[str, Any]:
+    """
+    Extract and parse the first {...} JSON object from model output.
+    Handles:
+      - pure JSON
+      - JSON embedded in text
+      - fenced code blocks (we strip fences)
+    """
+    if not text:
+        return {}
+    s = text.strip()
+    # Strip ```json fences if present
+    s = _strip_code_fences(s)
+    # Fast path: starts with "{"
+    if s.startswith("{"):
+        try:
+            return json.loads(s)
+        except Exception:
+            pass
+    # Try to find a JSON-looking block
+    m = _JSON_BLOCK_RE.search(s)
+    if not m:
+        return {}
+    chunk = m.group(0)
+    try:
+        return json.loads(chunk)
+    except Exception:
+        # last attempt: remove trailing commas (common model mistake)
+        cleaned = _remove_trailing_commas(chunk)
+        try:
+            return json.loads(cleaned)
+        except Exception:
+            return {}
+def _strip_code_fences(s: str) -> str:
+    # remove leading ```json / ``` and trailing ```
+    if s.startswith("```"):
+        s = re.sub(r"^```[a-zA-Z0-9]*\s*", "", s)
+        s = re.sub(r"\s*```$", "", s)
+    return s.strip()
+def _remove_trailing_commas(s: str) -> str:
+    # naive but effective: remove ",}" and ",]" patterns repeatedly
+    prev = None
+    cur = s
+    while prev != cur:
+        prev = cur
+        cur = re.sub(r",\s*}", "}", cur)
+        cur = re.sub(r",\s*]", "]", cur)
+    return cur
+def _to_float(x: Any, default: float = 0.0) -> float:
+    try:
+        return float(x)
+    except Exception:
+        return default
+# ----------------------------
+# Optional: quick self-check (manual)
+# ----------------------------
+def _debug_summarize_result(res: Dict[str, Any]) -> str:
+    return f"template_id={res.get('template_id')} conf={res.get('confidence')} reason={str(res.get('reason') or '')[:80]}"
+def _validate_known_templates() -> Tuple[bool, str]:
+    ids = [t.get("template_id") for t in KNOWN_TEMPLATES]
+    if any(not i for i in ids):
+        return False, "One or more templates missing template_id"
+    if len(set(ids)) != len(ids):
+        return False, "Duplicate template_id in KNOWN_TEMPLATES"
+    return True, "ok"

backend/worker/out/.keep ADDED Viewed

File without changes

backend/worker/pdf_render.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from __future__ import annotations
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List
+import fitz  # PyMuPDF
+from PIL import Image
+@dataclass
+class RenderedImage:
+    """One PNG written by render_pdf_to_pngs()."""
+    path: Path  # location of the PNG file
+    page_index: int  # 0-based index of the source page
+def render_pdf_to_pngs(pdf_path: Path, out_dir: Path, pages: int = 2, dpi: int = 200) -> List[RenderedImage]:
+    out_dir.mkdir(parents=True, exist_ok=True)
+    doc = fitz.open(pdf_path)
+    n = min(pages, doc.page_count)
+    zoom = dpi / 72.0
+    mat = fitz.Matrix(zoom, zoom)
+    rendered: List[RenderedImage] = []
+    for i in range(n):
+        page = doc.load_page(i)
+        pix = page.get_pixmap(matrix=mat, alpha=False)
+        img_path = out_dir / f"{pdf_path.stem}_p{i+1}.png"
+        pix.save(str(img_path))
+        # normalize to RGB with PIL (avoids weird modes)
+        im = Image.open(img_path).convert("RGB")
+        im.save(img_path)
+        rendered.append(RenderedImage(path=img_path, page_index=i))
+    doc.close()
+    return rendered

backend/worker/prompts.py ADDED Viewed

	@@ -0,0 +1,87 @@

+TEMPLATE_IDS = [
+    "T1_IFACTOR_DELIVERED_ORDER",
+    "T2_SEASPINE_DELIVERED_GOODS_FORM",
+    "T3_ASTURA_SALES_ORDER_FORM",
+    "T4_MEDICAL_ESTIMATION_OF_CHARGES",
+    "T5_CLINICAL_PROGRESS_NOTE_POSTOP",
+    "T6_CUSTOMER_CHARGE_SHEET_SPINE",
+    "T7_SALES_ORDER_ZIMMER",
+]
+SYSTEM_PROMPT = f"""
+You are classifying a medical/healthcare sales/order PDF form into one of the known templates,
+and extracting a "trainer schema" for onboarding.
+Known template_ids:
+{TEMPLATE_IDS}
+Rules:
+- You MUST return JSON only (no markdown, no extra text).
+- If none match confidently, return template_id "UNKNOWN".
+- Always produce a schema object (even for UNKNOWN) so onboarding can proceed.
+Output JSON shape (strict):
+{{
+  "template_id": "<one of known template_ids or UNKNOWN>",
+  "confidence": 0.0,
+  "reason": "<short reason>",
+  "trainer_schema": {{
+    "form_id": "<suggested id>",
+    "version": 1,
+    "page": 1,
+    "scalar_value_region_mode": "offset_from_anchor_v1",
+    "fields": [
+      {{
+        "field_id": "facility_organization",
+        "label": "Facility / Organization",
+        "type": "entity",
+        "anchor_hint": "<printed label text or None>",
+        "value_hint": "<what to extract>"
+      }},
+      {{
+        "field_id": "case_location_address",
+        "label": "Case Location / Address",
+        "type": "entity",
+        "anchor_hint": "<printed label text or None>",
+        "value_hint": "<what to extract>"
+      }},
+      {{
+        "field_id": "vendor",
+        "label": "Vendor",
+        "type": "entity",
+        "anchor_hint": "<printed label text or None>",
+        "value_hint": "<what to extract>"
+      }},
+      {{
+        "field_id": "physician_name",
+        "label": "Physician Name",
+        "type": "person",
+        "anchor_hint": "<printed label text or None>",
+        "value_hint": "<what to extract>"
+      }},
+      {{
+        "field_id": "date_of_surgery",
+        "label": "Date of Surgery",
+        "type": "date",
+        "anchor_hint": "<printed label text or None>",
+        "value_hint": "<what to extract>"
+      }},
+      {{
+        "field_id": "items",
+        "label": "Items / Line Items",
+        "type": "table",
+        "table_hint": {{
+          "expected_columns": ["item_number","description","qty","lot_number","price","extended_price"],
+          "where_on_page": "<short description>",
+          "header_text_examples": ["Item Number","Description","Qty"]
+        }}
+      }}
+    ]
+  }}
+}}
+"""
+USER_PROMPT = """
+Classify the form template and generate trainer_schema based on the provided page images.
+Focus on printed structure, titles, logos, and table headers.
+"""

backend/worker/template_registry_snapshot.py ADDED Viewed

File without changes

backend/worker/template_store.py ADDED Viewed

	@@ -0,0 +1,36 @@

+from __future__ import annotations
+import json
+from pathlib import Path
+from typing import Any, Dict, List
+TEMPLATE_DIR = Path(__file__).resolve().parent / "trainer_templates"
+def list_trainer_templates() -> List[Dict[str, Any]]:
+    TEMPLATE_DIR.mkdir(parents=True, exist_ok=True)
+    out: List[Dict[str, Any]] = []
+    for p in sorted(TEMPLATE_DIR.glob("*.json")):
+        try:
+            cfg = json.loads(p.read_text(encoding="utf-8"))
+        except Exception:
+            continue
+        template_id = cfg.get("template_id") or cfg.get("form_id") or p.stem
+        name = cfg.get("name") or cfg.get("form_id") or template_id
+        out.append({
+            "template_id": template_id,
+            "name": name,
+            # optional: trainer config itself (don’t spam prompt if huge)
+            "has_config": True,
+        })
+    return out
+def save_trainer_template(template_id: str, cfg: Dict[str, Any]) -> Path:
+    TEMPLATE_DIR.mkdir(parents=True, exist_ok=True)
+    cfg = dict(cfg)
+    cfg["template_id"] = template_id  # enforce
+    path = TEMPLATE_DIR / f"{template_id}.json"
+    path.write_text(json.dumps(cfg, indent=2), encoding="utf-8")
+    return path

backend/worker/tmp/.keep ADDED Viewed

File without changes

backend/worker/uploads/.keep ADDED Viewed

File without changes

backend/worker/worker.py ADDED Viewed

	@@ -0,0 +1,286 @@

+from __future__ import annotations
+import os
+import time
+import uuid
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Tuple
+from dotenv import load_dotenv
+from .gmail_client import GmailClient
+from .openai_classifier import classify_with_openai
+from .pdf_render import render_pdf_to_pngs
+# Force load repo_root/backend/.env (single source of truth)
+REPO_ROOT = Path(__file__).resolve().parents[2]
+load_dotenv(REPO_ROOT / "backend" / ".env", override=True)
+@dataclass
+class Settings:
+    """Worker configuration read from environment variables by load_settings()."""
+    # Gmail OAuth files
+    creds_path: Path
+    token_path: Path
+    # Gmail label names
+    label_incoming: str
+    label_known: str
+    label_unknown: str
+    label_train: str
+    # Rep email for UNKNOWN detection
+    rep_notify_to: str
+    notify_from: str
+    # OpenAI
+    openai_api_key: str
+    openai_model: str
+    # Polling and rendering knobs
+    poll_seconds: int
+    max_messages_per_poll: int
+    render_pages: int
+    render_dpi: int
+    # Base URL used to build the trainer link
+    trainer_base_url: str
+def load_settings() -> Settings:
+    base = Path(__file__).resolve().parents[1]  # backend/
+    creds = Path(os.environ.get("GMAIL_CREDENTIALS_JSON", str(base / "credentials.json")))
+    token = Path(os.environ.get("GMAIL_TOKEN_JSON", str(base / "token.json")))
+    openai_api_key = (os.environ.get("OPENAI_API_KEY_TEST") or os.environ.get("OPENAI_API_KEY") or "").strip()
+    openai_model = (os.environ.get("OPENAI_MODEL") or "gpt-4o-mini").strip()
+    return Settings(
+        creds_path=creds,
+        token_path=token,
+        label_incoming=os.environ.get("PDF_PIPELINE_LABEL_INCOMING", "PDF_PIPELINE/INCOMING"),
+        label_known=os.environ.get("PDF_PIPELINE_LABEL_KNOWN", "PDF_PIPELINE/KNOWN"),
+        label_unknown=os.environ.get("PDF_PIPELINE_LABEL_UNKNOWN", "PDF_PIPELINE/UNKNOWN"),
+        label_train=os.environ.get("PDF_PIPELINE_LABEL_TRAIN", "PDF_PIPELINE/TRAIN"),
+        notify_from=(os.environ.get("PDF_PIPELINE_NOTIFY_FROM") or "").strip(),
+        rep_notify_to=(os.environ.get("PDF_PIPELINE_NOTIFY_TO") or "").strip(),
+        openai_api_key=openai_api_key,
+        openai_model=openai_model,
+        poll_seconds=int(os.environ.get("PDF_PIPELINE_POLL_SECONDS", "20")),
+        max_messages_per_poll=int(os.environ.get("PDF_PIPELINE_MAX_PER_POLL", "5")),
+        render_pages=int(os.environ.get("PDF_PIPELINE_RENDER_PAGES", "2")),
+        render_dpi=int(os.environ.get("PDF_PIPELINE_RENDER_DPI", "200")),
+        trainer_base_url=(os.environ.get("PDF_TRAINER_BASE_URL") or "http://localhost:5173").strip(),
+    )
+def _safe_name(s: str) -> str:
+    return "".join(c if c.isalnum() or c in ("-", "_", ".", " ") else "_" for c in s).strip()
+def _write_pipeline_pdf(root_worker_dir: Path, filename: str, pdf_bytes: bytes) -> Tuple[str, Path]:
+    """
+    Persist PDF for the trainer to fetch later.
+    Returns (pdf_id, pdf_path_on_disk).
+    """
+    uploads_dir = root_worker_dir / "uploads"
+    uploads_dir.mkdir(parents=True, exist_ok=True)
+    pdf_id = uuid.uuid4().hex
+    pdf_path = uploads_dir / f"{pdf_id}.pdf"
+    name_path = uploads_dir / f"{pdf_id}.name.txt"
+    pdf_path.write_bytes(pdf_bytes)
+    name_path.write_text(filename, encoding="utf-8")
+    return pdf_id, pdf_path
+def _process_train_label(gmail: GmailClient, s: Settings, root: Path) -> None:
+    """
+    TRAIN behavior:
+      - Pull unread PDFs from TRAIN label
+      - Store into uploads/ and print trainer link
+      - Mark read
+      - Do NOT classify
+      - Do NOT move labels
+    """
+    msgs = gmail.search_unread_pdf_messages(s.label_train, max_results=s.max_messages_per_poll)
+    if not msgs:
+        return
+    for m in msgs:
+        msg_full = gmail.get_message_full(m.msg_id)
+        pdf_atts = gmail.list_pdf_attachments(msg_full)
+        if not pdf_atts:
+            gmail.move_message(m.msg_id, add_labels=[], remove_labels=[], mark_read=True)
+            continue
+        for (filename, att_id) in pdf_atts:
+            filename = _safe_name(filename or "attachment.pdf")
+            pdf_bytes = gmail.download_attachment(m.msg_id, att_id)
+            pdf_id, stored_pdf_path = _write_pipeline_pdf(root, filename, pdf_bytes)
+            trainer_link = f"{s.trainer_base_url.rstrip('/')}/?pdf_id={pdf_id}"
+            gmail.move_message(m.msg_id, add_labels=[], remove_labels=[], mark_read=True)
+            print(
+                f"[worker][TRAIN] stored PDF msg={m.msg_id} file={filename} "
+                f"pdf_id={pdf_id} stored={stored_pdf_path}"
+            )
+            print(f"[worker][TRAIN] open: {trainer_link}")
+def main():
+    s = load_settings()
+    # Validate settings
+    if not s.rep_notify_to:
+        raise RuntimeError("Missing PDF_PIPELINE_NOTIFY_TO (rep email for UNKNOWN detection)")
+    if not s.notify_from:
+        raise RuntimeError("Missing PDF_PIPELINE_NOTIFY_FROM (OAuth Gmail account email)")
+    if not s.trainer_base_url:
+        raise RuntimeError("Missing PDF_TRAINER_BASE_URL (base URL for trainer link)")
+    if not s.openai_api_key:
+        raise RuntimeError("Missing OPENAI_API_KEY_TEST (or OPENAI_API_KEY) in backend/.env")
+    gmail = GmailClient(s.creds_path, s.token_path)
+    # Ensure labels exist
+    gmail.ensure_label(s.label_incoming)
+    gmail.ensure_label(s.label_known)
+    gmail.ensure_label(s.label_unknown)
+    gmail.ensure_label(s.label_train)
+    root = Path(__file__).resolve().parents[0]  # backend/worker
+    tmp_dir = root / "tmp"
+    tmp_dir.mkdir(parents=True, exist_ok=True)
+    print(f"[worker] Watching label: {s.label_incoming}")
+    print(f"[worker] Known label:   {s.label_known}")
+    print(f"[worker] Unknown label: {s.label_unknown}")
+    print(f"[worker] Train label:   {s.label_train}")
+    print(f"[worker] Rep notify to: {s.rep_notify_to}")
+    print(f"[worker] OpenAI model:  {s.openai_model}")
+    while True:
+        try:
+            # 1) TRAIN lane
+            _process_train_label(gmail, s, root)
+            # 2) Main pipeline (INCOMING -> KNOWN/UNKNOWN)
+            msgs = gmail.search_unread_pdf_messages(s.label_incoming, max_results=s.max_messages_per_poll)
+            if not msgs:
+                time.sleep(s.poll_seconds)
+                continue
+            for m in msgs:
+                msg_full = gmail.get_message_full(m.msg_id)
+                pdf_atts = gmail.list_pdf_attachments(msg_full)
+                if not pdf_atts:
+                    # Remove INCOMING + mark read so it doesn't loop forever
+                    gmail.move_message(m.msg_id, add_labels=[], remove_labels=[s.label_incoming], mark_read=True)
+                    continue
+                any_unknown = False
+                unknown_payloads: List[Tuple[str, bytes]] = []
+                # Classify all PDF attachments for this message
+                for (filename, att_id) in pdf_atts:
+                    filename = _safe_name(filename or "attachment.pdf")
+                    pdf_bytes = gmail.download_attachment(m.msg_id, att_id)
+                    stamp = str(int(time.time()))
+                    pdf_path = tmp_dir / f"{stamp}_{m.msg_id}_{filename}"
+                    pdf_path.write_bytes(pdf_bytes)
+                    img_dir = tmp_dir / f"{stamp}_{m.msg_id}_{pdf_path.stem}"
+                    rendered = render_pdf_to_pngs(pdf_path, img_dir, pages=s.render_pages, dpi=s.render_dpi)
+                    image_paths = [str(r.path) for r in rendered]
+                    result = classify_with_openai(
+                        image_paths,
+                        api_key=s.openai_api_key,
+                        model=s.openai_model,
+                    )
+                    template_id = (result.get("template_id") or "UNKNOWN").strip()
+                    conf = float(result.get("confidence") or 0.0)
+                    if template_id == "UNKNOWN":
+                        any_unknown = True
+                        unknown_payloads.append((filename, pdf_bytes))
+                        print(f"[worker] UNKNOWN attachment conf={conf:.3f} msg={m.msg_id} file={filename}")
+                    else:
+                        print(
+                            f"[worker] KNOWN attachment template={template_id} conf={conf:.3f} "
+                            f"msg={m.msg_id} file={filename}"
+                        )
+                # Apply Gmail label ONCE per message
+                if any_unknown:
+                    gmail.move_message(
+                        m.msg_id,
+                        add_labels=[s.label_unknown],
+                        remove_labels=[s.label_incoming],
+                        mark_read=True,
+                    )
+                else:
+                    gmail.move_message(
+                        m.msg_id,
+                        add_labels=[s.label_known],
+                        remove_labels=[s.label_incoming],
+                        mark_read=True,
+                    )
+                # Notify rep for each unknown PDF attachment
+                if any_unknown:
+                    for (filename, pdf_bytes) in unknown_payloads:
+                        pdf_id, stored_pdf_path = _write_pipeline_pdf(root, filename, pdf_bytes)
+                        trainer_link = f"{s.trainer_base_url.rstrip('/')}/?pdf_id={pdf_id}"
+                        subject = "Action required: Unknown PDF format (template not found)"
+                        body = (
+                            "Hi,\n\n"
+                            "We received a PDF that does not match any existing templates in the system.\n\n"
+                            "Please open the PDF Trainer using the link below and create or update the template configuration:\n"
+                            f"{trainer_link}\n\n"
+                            "The original PDF is attached for reference.\n\n"
+                            "Thank you,\n"
+                            "Inserio Automation\n"
+                        )
+                        attachments: List[Tuple[str, bytes]] = []
+                        if len(pdf_bytes) < 20 * 1024 * 1024:
+                            attachments.append((filename, pdf_bytes))
+                        else:
+                            body += "\nNote: The PDF was too large to attach.\n"
+                        gmail.send_email(
+                            to_email=s.rep_notify_to,
+                            from_email=s.notify_from,
+                            subject=subject,
+                            body_text=body,
+                            attachments=attachments,
+                        )
+                        print(
+                            f"[worker] UNKNOWN: emailed rep {s.rep_notify_to} msg={m.msg_id} file={filename} "
+                            f"pdf_id={pdf_id} stored={stored_pdf_path}"
+                        )
+        except Exception as e:
+            print(f"[worker] ERROR: {e}")
+        time.sleep(s.poll_seconds)
+if __name__ == "__main__":  # script entry point: only start the worker when this module is executed directly
+    main()  # runs the validation/setup steps, then the `while True` polling loop (does not return normally)

requirements.txt CHANGED Viewed

@@ -1,18 +1,8 @@
-# Google / Gmail
-google-api-python-client==2.111.0
-google-auth==2.27.0
-google-auth-oauthlib==1.2.0
-# OpenAI
-openai==1.12.0
-# PDF -> image
-PyMuPDF==1.23.26
-Pillow==10.2.0
-# Utilities
-python-dotenv==1.0.1
-requests==2.31.0
-fastapi==0.115.6
-uvicorn==0.30.6

+google-api-python-client
+google-auth
+google-auth-oauthlib
+google-auth-httplib2
+python-dotenv
+pydantic
+requests
+Pillow