Avinashnalla7 committed on
Commit
2fce05d
·
1 Parent(s): 8974c0c

deploy: real pdf-trainer-demo API

Browse files
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore DELETED
@@ -1,8 +0,0 @@
1
- __pycache__/
2
- *.pyc
3
- .venv/
4
- .env
5
- .env.*
6
- node_modules/
7
- dist/
8
- .DS_Store
 
 
 
 
 
 
 
 
 
Dockerfile CHANGED
@@ -1,13 +1,25 @@
1
- FROM python:3.11-slim
 
2
 
3
  WORKDIR /app
4
- ENV PYTHONUNBUFFERED=1
5
- ENV PIP_DISABLE_PIP_VERSION_CHECK=1
6
 
7
- COPY requirements.txt .
8
- RUN pip install --no-cache-dir -r requirements.txt
9
 
10
  COPY . .
 
11
 
12
- ENV PORT=7860
13
- CMD ["bash", "-lc", "uvicorn backend.api:app --host 0.0.0.0 --port ${PORT}"]
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ---- Build stage ----
2
+ FROM node:20-alpine AS build
3
 
4
  WORKDIR /app
 
 
5
 
6
+ COPY package.json package-lock.json ./
7
+ RUN npm ci
8
 
9
  COPY . .
10
+ RUN npm run build
11
 
12
+ # ---- Runtime stage ----
13
+ FROM nginx:alpine
14
+
15
+ # Remove default nginx config
16
+ RUN rm /etc/nginx/conf.d/default.conf
17
+
18
+ # Custom nginx config
19
+ COPY nginx.conf /etc/nginx/conf.d/default.conf
20
+
21
+ # Copy built assets
22
+ COPY --from=build /app/dist /usr/share/nginx/html
23
+
24
+ EXPOSE 7860
25
+ CMD ["nginx", "-g", "daemon off;"]
README.md DELETED
@@ -1,10 +0,0 @@
1
- ---
2
- title: Pdf Trainer Api
3
- emoji: 🌍
4
- colorFrom: indigo
5
- colorTo: pink
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
api.py DELETED
@@ -1 +0,0 @@
1
- from backend.api import app
 
 
backend/.DS_Store ADDED
Binary file (8.2 kB). View file
 
backend/{__init__.py → .env} RENAMED
File without changes
backend/__pycache__/api.cpython-311.pyc ADDED
Binary file (6.45 kB). View file
 
backend/api.py CHANGED
@@ -1,20 +1,20 @@
1
  from __future__ import annotations
2
 
 
3
  import os
4
- import time
5
  from pathlib import Path
6
- from typing import Dict, Optional
7
- from uuid import uuid4
8
- from collections import deque
9
- from dataclasses import dataclass, asdict
10
 
11
- from fastapi import FastAPI, HTTPException, Header, Request
 
12
  from fastapi.middleware.cors import CORSMiddleware
13
- from fastapi.responses import FileResponse, JSONResponse
14
- from pydantic import BaseModel
 
15
 
16
  app = FastAPI(title="PDF Trainer API", version="1.0")
17
 
 
18
  app.add_middleware(
19
  CORSMiddleware,
20
  allow_origins=[
@@ -26,114 +26,188 @@ app.add_middleware(
26
  allow_headers=["*"],
27
  )
28
 
29
- WORKER_TOKEN = (os.getenv("WORKER_TOKEN") or "").strip()
30
- DATA_DIR = Path(os.getenv("DATA_DIR", "/app/data")).resolve()
31
- PDF_DIR = DATA_DIR / "pdfs"
32
- CFG_DIR = DATA_DIR / "configs"
 
 
 
 
 
 
33
 
34
- PDF_DIR.mkdir(parents=True, exist_ok=True)
35
- CFG_DIR.mkdir(parents=True, exist_ok=True)
 
 
 
 
 
 
 
36
 
37
- def _require_worker_token(x_worker_token: Optional[str]):
38
- if not WORKER_TOKEN:
39
- raise HTTPException(status_code=500, detail="Server missing WORKER_TOKEN")
40
- if not x_worker_token or x_worker_token != WORKER_TOKEN:
41
- raise HTTPException(status_code=401, detail="Unauthorized worker")
42
 
43
  @app.get("/health")
44
- def health() -> Dict[str, bool]:
45
  return {"ok": True}
46
 
47
- @app.get("/")
48
- def root() -> Dict[str, str]:
49
- return {"service": "pdf-trainer-api", "status": "running"}
50
-
51
- def _pdf_path(pdf_id: str) -> Path:
52
- safe = pdf_id.strip().replace("/", "_")
53
- if not safe:
54
- raise HTTPException(status_code=400, detail="Missing pdf_id")
55
- if not safe.lower().endswith(".pdf"):
56
- safe = safe + ".pdf"
57
- return PDF_DIR / safe
58
-
59
- @app.put("/api/pdf/{pdf_id}")
60
- async def put_pdf(
61
- pdf_id: str,
62
- request: Request,
63
- x_worker_token: Optional[str] = Header(default=None),
64
- ):
65
- _require_worker_token(x_worker_token)
66
- body = await request.body()
67
- if not body:
68
- raise HTTPException(status_code=400, detail="Empty body")
69
- p = _pdf_path(pdf_id)
70
- p.write_bytes(body)
71
- return {"ok": True, "pdf_id": p.stem, "bytes": len(body)}
72
 
73
  @app.get("/api/pdf/{pdf_id}")
74
  def get_pdf(pdf_id: str):
75
- p = _pdf_path(pdf_id)
76
- if not p.exists():
77
  raise HTTPException(status_code=404, detail="PDF not found")
78
- return FileResponse(str(p), media_type="application/pdf", filename=p.name)
 
 
 
 
 
 
 
79
 
80
  @app.post("/api/send-config")
81
- async def send_config(payload: dict):
82
- pdf_id = str(payload.get("pdf_id", "")).strip()
83
- template_id = str(payload.get("template_id", "")).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  config = payload.get("config")
85
- if not pdf_id or not template_id or config is None:
86
- raise HTTPException(status_code=400, detail="Missing pdf_id/template_id/config")
87
- cfg_name = f"{pdf_id}_{template_id}_{uuid4().hex}.json"
88
- out = CFG_DIR / cfg_name
89
- import json
90
- out.write_text(json.dumps(payload, indent=2), encoding="utf-8")
91
- return {"ok": True, "saved": cfg_name}
92
-
93
- @dataclass
94
- class Job:
95
- id: str
96
- payload: dict
97
- created_at: float
98
- status: str = "queued"
99
- message: str = ""
100
-
101
- JOBQ: deque[Job] = deque()
102
- JOBS: dict[str, Job] = {}
103
-
104
- class EnqueueReq(BaseModel):
105
- payload: dict
106
-
107
- @app.post("/api/jobs/enqueue")
108
- def enqueue_job(req: EnqueueReq):
109
- job_id = uuid4().hex
110
- job = Job(id=job_id, payload=req.payload, created_at=time.time())
111
- JOBQ.append(job)
112
- JOBS[job_id] = job
113
- return {"job_id": job_id, "status": job.status}
114
-
115
- @app.get("/api/jobs/next")
116
- def jobs_next(x_worker_token: Optional[str] = Header(default=None)):
117
- _require_worker_token(x_worker_token)
118
- while JOBQ:
119
- job = JOBQ.popleft()
120
- if job.status == "queued":
121
- job.status = "running"
122
- return asdict(job)
123
- return JSONResponse(status_code=204, content=None)
124
-
125
- class StatusReq(BaseModel):
126
- status: str
127
- message: str = ""
128
-
129
- @app.post("/api/jobs/{job_id}/status")
130
- def jobs_status(job_id: str, req: StatusReq, x_worker_token: Optional[str] = Header(default=None)):
131
- _require_worker_token(x_worker_token)
132
- job = JOBS.get(job_id)
133
- if not job:
134
- raise HTTPException(status_code=404, detail="Unknown job_id")
135
- if req.status not in ("running", "done", "failed"):
136
- raise HTTPException(status_code=400, detail="Invalid status")
137
- job.status = req.status
138
- job.message = req.message
139
- return {"ok": True, "job_id": job_id, "status": job.status}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
+ import json
4
  import os
 
5
  from pathlib import Path
6
+ from typing import Any, Dict
 
 
 
7
 
8
+ from dotenv import load_dotenv
9
+ from fastapi import FastAPI, HTTPException
10
  from fastapi.middleware.cors import CORSMiddleware
11
+ from fastapi.responses import FileResponse, PlainTextResponse
12
+
13
+ from backend.worker.gmail_client import GmailClient
14
 
15
  app = FastAPI(title="PDF Trainer API", version="1.0")
16
 
17
+ # Allow Vite dev server
18
  app.add_middleware(
19
  CORSMiddleware,
20
  allow_origins=[
 
26
  allow_headers=["*"],
27
  )
28
 
29
# Directory layout, anchored on this file's resolved location.
REPO_ROOT = Path(__file__).resolve().parents[1]
BACKEND_DIR = REPO_ROOT.joinpath("backend")
UPLOADS_DIR = BACKEND_DIR.joinpath("worker", "uploads")

# Read backend/.env once at import time. override=True means values from the
# file replace variables that are already present in the process environment.
load_dotenv(BACKEND_DIR.joinpath(".env"), override=True)

# Gmail credential/token file locations; each can be redirected through its
# environment variable, otherwise it defaults to a file inside backend/.
_default_credentials = str(BACKEND_DIR.joinpath("credentials.json"))
_default_token = str(BACKEND_DIR.joinpath("token.json"))
CREDENTIALS_JSON = Path(os.getenv("GMAIL_CREDENTIALS_JSON", _default_credentials))
TOKEN_JSON = Path(os.getenv("GMAIL_TOKEN_JSON", _default_token))
38
+
39
 
40
def _gmail() -> GmailClient:
    """Build a GmailClient from the module-level CREDENTIALS_JSON and TOKEN_JSON paths."""
    client = GmailClient(CREDENTIALS_JSON, TOKEN_JSON)
    return client
42
+
43
+
44
def _get_env_required(key: str) -> str:
    """
    Return the whitespace-stripped value of the environment variable ``key``.

    Raises:
      HTTPException (500) when the variable is unset or blank after stripping.
    """
    value = os.environ.get(key, "").strip()
    if value:
        return value
    raise HTTPException(status_code=500, detail=f"Server missing {key} env var")
49
 
 
 
 
 
 
50
 
51
@app.get("/health")
def health():
    """Liveness probe: always answers with ``{"ok": True}``."""
    return dict(ok=True)
54
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
 
56
@app.get("/api/pdf/{pdf_id}")
def get_pdf(pdf_id: str):
    """
    Serve the uploaded PDF stored as ``<pdf_id>.pdf`` under UPLOADS_DIR.

    The download filename is read from the ``<pdf_id>.name.txt`` sidecar when
    that file exists and is non-empty; otherwise it falls back to
    ``<pdf_id>.pdf``. The same name is echoed in the ``X-PDF-Name`` header
    (percent-encoded when it cannot be sent verbatim as a header value).

    Raises:
      HTTPException (404) when pdf_id contains a path separator or NUL byte,
      or when no such PDF file exists.
    """
    from urllib.parse import quote

    # pdf_id is interpolated into a filesystem path: refuse anything that
    # could step outside UPLOADS_DIR instead of trusting the router alone.
    if any(ch in pdf_id for ch in ("/", "\\", "\x00")):
        raise HTTPException(status_code=404, detail="PDF not found")

    path = UPLOADS_DIR / f"{pdf_id}.pdf"
    # is_file() rather than exists(): a directory with this name is not a PDF.
    if not path.is_file():
        raise HTTPException(status_code=404, detail="PDF not found")

    name_path = UPLOADS_DIR / f"{pdf_id}.name.txt"
    pdf_name = name_path.read_text(encoding="utf-8").strip() if name_path.is_file() else ""
    if not pdf_name:
        # Missing or blank sidecar: fall back to the on-disk name.
        pdf_name = f"{pdf_id}.pdf"

    # Header values are encoded as latin-1 by the response layer, so a name
    # with other characters (or control characters) would raise while the
    # response is being built. Send such names percent-encoded instead.
    try:
        pdf_name.encode("latin-1")
        header_safe = pdf_name.isprintable()
    except UnicodeEncodeError:
        header_safe = False

    resp = FileResponse(path, media_type="application/pdf", filename=pdf_name)
    resp.headers["X-PDF-Name"] = pdf_name if header_safe else quote(pdf_name)
    return resp
68
+
69
 
70
  @app.post("/api/send-config")
71
+ async def send_config(payload: Dict[str, Any]):
72
+ """
73
+ PIPELINE SUBMISSION EMAIL (after rep saves config)
74
+
75
+ REQUIRED payload:
76
+ - pdf_id: str
77
+ - template_id: str
78
+ - config: dict
79
+
80
+ Sends to PIPELINE inbox:
81
+ - PDF_PIPELINE_PIPELINE_NOTIFY_TO
82
+
83
+ Requirements:
84
+ - Subject includes template_id
85
+ - Body includes pdf_id
86
+ - Attachments: JSON + PDF
87
+ """
88
+ pdf_id = (payload.get("pdf_id") or "").strip()
89
+ template_id = (payload.get("template_id") or "").strip()
90
  config = payload.get("config")
91
+
92
+ if not pdf_id:
93
+ raise HTTPException(status_code=400, detail="Missing pdf_id")
94
+ if not template_id:
95
+ raise HTTPException(status_code=400, detail="Missing template_id")
96
+ if not isinstance(config, dict):
97
+ raise HTTPException(status_code=400, detail="Missing config object")
98
+
99
+ pipeline_to = _get_env_required("PDF_PIPELINE_PIPELINE_NOTIFY_TO")
100
+ notify_from = _get_env_required("PDF_PIPELINE_NOTIFY_FROM")
101
+ trainer_base_url = (os.environ.get("PDF_TRAINER_BASE_URL") or "http://localhost:5173").strip()
102
+
103
+ pdf_path = UPLOADS_DIR / f"{pdf_id}.pdf"
104
+ if not pdf_path.exists():
105
+ raise HTTPException(status_code=404, detail="PDF not found for pdf_id")
106
+
107
+ name_path = UPLOADS_DIR / f"{pdf_id}.name.txt"
108
+ pdf_name = name_path.read_text(encoding="utf-8").strip() if name_path.exists() else f"{pdf_id}.pdf"
109
+
110
+ trainer_link = f"{trainer_base_url.rstrip('/')}/?pdf_id={pdf_id}"
111
+
112
+ subject = f"PDF_TRAINER_CONFIG_SUBMITTED | template_id={template_id}"
113
+ body = (
114
+ "Hi,\n\n"
115
+ "A PDF Trainer configuration was submitted.\n\n"
116
+ f"template_id: {template_id}\n"
117
+ f"pdf_id: {pdf_id}\n"
118
+ f"trainer_link: {trainer_link}\n\n"
119
+ "Attachments:\n"
120
+ f"- trainer_config_{pdf_id}_{template_id}.json\n"
121
+ f"- {pdf_name}\n\n"
122
+ "Thank you,\n"
123
+ "Inserio Automation\n"
124
+ )
125
+
126
+ cfg_bytes = json.dumps(
127
+ {"pdf_id": pdf_id, "template_id": template_id, "config": config},
128
+ indent=2,
129
+ ).encode("utf-8")
130
+
131
+ attachments = [
132
+ (f"trainer_config_{pdf_id}_{template_id}.json", cfg_bytes),
133
+ (pdf_name, pdf_path.read_bytes()),
134
+ ]
135
+
136
+ gmail = _gmail()
137
+ gmail.send_email(
138
+ to_email=pipeline_to,
139
+ from_email=notify_from,
140
+ subject=subject,
141
+ body_text=body,
142
+ attachments=attachments,
143
+ )
144
+
145
+ return {"ok": True}
146
+
147
+
148
@app.post("/api/notify-unknown")
async def notify_unknown(payload: Dict[str, Any]):
    """
    UNKNOWN TEMPLATE NOTIFICATION (rep email)

    REQUIRED payload:
      - pdf_id: str
    OPTIONAL:
      - reason: str (ignored when it is not a string)

    Sends to REP inbox:
      - PDF_PIPELINE_NOTIFY_TO

    Requirements:
      - Includes trainer link with PDF pre-loaded
      - Attaches PDF
      - No JSON

    Returns:
      {"ok": True} once the email has been handed to the Gmail client.

    Raises:
      HTTPException (400) when pdf_id is absent, blank, not a string, or
        contains a path separator or NUL byte.
      HTTPException (404) when no PDF exists for pdf_id.
      HTTPException (500) when a required env var is missing.
    """
    from urllib.parse import quote

    # Non-string values are treated as missing/absent so the caller never
    # triggers an AttributeError from calling .strip() on them.
    raw_pdf_id = payload.get("pdf_id")
    raw_reason = payload.get("reason")
    pdf_id = raw_pdf_id.strip() if isinstance(raw_pdf_id, str) else ""
    reason = raw_reason.strip() if isinstance(raw_reason, str) else ""

    if not pdf_id:
        raise HTTPException(status_code=400, detail="Missing pdf_id")

    # pdf_id comes from the request body and is joined into a filesystem
    # path below; without this check "../" would let a caller make the
    # server read and email any *.pdf it can access.
    if any(ch in pdf_id for ch in ("/", "\\", "\x00")):
        raise HTTPException(status_code=400, detail="Invalid pdf_id")

    rep_to = _get_env_required("PDF_PIPELINE_NOTIFY_TO")
    notify_from = _get_env_required("PDF_PIPELINE_NOTIFY_FROM")
    trainer_base_url = (os.environ.get("PDF_TRAINER_BASE_URL") or "http://localhost:5173").strip()

    pdf_path = UPLOADS_DIR / f"{pdf_id}.pdf"
    if not pdf_path.is_file():
        raise HTTPException(status_code=404, detail="PDF not found for pdf_id")

    name_path = UPLOADS_DIR / f"{pdf_id}.name.txt"
    pdf_name = name_path.read_text(encoding="utf-8").strip() if name_path.is_file() else ""
    if not pdf_name:
        # Missing or blank sidecar: fall back to the on-disk name.
        pdf_name = f"{pdf_id}.pdf"

    # Percent-encode the id so characters such as '&', '#' or spaces cannot
    # corrupt the query string of the link placed in the email.
    trainer_link = f"{trainer_base_url.rstrip('/')}/?pdf_id={quote(pdf_id, safe='')}"

    subject = "Action required: Unknown PDF format (template not found)"
    body = (
        "Hi,\n\n"
        "We received a PDF that does not match any existing templates in the system.\n\n"
        + (f"Reason: {reason}\n\n" if reason else "")
        + "Please open the PDF Trainer using the link below and create or update the template configuration:\n"
        f"{trainer_link}\n\n"
        "The original PDF is attached for reference.\n\n"
        "Thank you,\n"
        "Inserio Automation\n"
    )

    attachments = [(pdf_name, pdf_path.read_bytes())]

    # NOTE(review): the file reads and send_email are synchronous calls inside
    # an async handler; if send_email blocks on network I/O it will stall the
    # event loop — consider a plain `def` handler or a threadpool hand-off.
    gmail = _gmail()
    gmail.send_email(
        to_email=rep_to,
        from_email=notify_from,
        subject=subject,
        body_text=body,
        attachments=attachments,
    )

    return {"ok": True}
209
+
210
+
211
@app.get("/", response_class=PlainTextResponse)
def root():
    """Plain-text landing response that points callers at /health."""
    banner = "PDF Trainer API. Use /health"
    return banner
backend/scripts/__pycache__/apply_trainer_schemas.cpython-314.pyc ADDED
Binary file (2.8 kB). View file
 
backend/scripts/__pycache__/generate_template_schema_skeletons.cpython-314.pyc ADDED
Binary file (4.72 kB). View file
 
backend/scripts/__pycache__/migrate_hardcoded_templates.cpython-314.pyc ADDED
Binary file (3.15 kB). View file
 
backend/sftp_store.py DELETED
@@ -1,42 +0,0 @@
1
- import os
2
- import socket
3
- import posixpath
4
- import paramiko
5
-
6
-
7
- def _env(name: str) -> str:
8
- v = (os.getenv(name) or "").strip()
9
- if not v:
10
- raise RuntimeError(f"Missing env var: {name}")
11
- return v
12
-
13
-
14
- def download_bytes(remote_path: str) -> bytes:
15
- """
16
- Downloads a file from SFTP_ROOT + remote_path.
17
- remote_path should be relative like 'pdfs/<id>.pdf' (no leading slash).
18
- """
19
- host = _env("SFTP_HOST")
20
- port = int(_env("SFTP_PORT"))
21
- user = _env("SFTP_USER")
22
- pw = _env("SFTP_PASS")
23
- root = (_env("SFTP_ROOT").rstrip("/") or "/")
24
-
25
- rp = remote_path.lstrip("/")
26
-
27
- transport = paramiko.Transport(socket.create_connection((host, port), timeout=10))
28
- transport.banner_timeout = 10
29
- transport.auth_timeout = 10
30
- transport.connect(username=user, password=pw)
31
- sftp = paramiko.SFTPClient.from_transport(transport)
32
-
33
- try:
34
- sftp.chdir(root)
35
- full = posixpath.join(".", rp)
36
- with sftp.open(full, "rb") as f:
37
- return f.read()
38
- finally:
39
- try:
40
- sftp.close()
41
- finally:
42
- transport.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
backend/worker/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (221 Bytes). View file
 
backend/worker/__pycache__/__init__.cpython-314.pyc ADDED
Binary file (211 Bytes). View file
 
backend/worker/__pycache__/gmail_client.cpython-311.pyc ADDED
Binary file (11 kB). View file
 
backend/worker/__pycache__/gmail_client.cpython-314.pyc ADDED
Binary file (11.2 kB). View file
 
backend/worker/__pycache__/openai_classifier.cpython-311.pyc ADDED
Binary file (7.51 kB). View file
 
backend/worker/__pycache__/openai_classifier.cpython-314.pyc ADDED
Binary file (12.5 kB). View file
 
backend/worker/__pycache__/pdf_render.cpython-311.pyc ADDED
Binary file (2.38 kB). View file
 
backend/worker/__pycache__/pdf_render.cpython-314.pyc ADDED
Binary file (2.28 kB). View file
 
backend/worker/__pycache__/prompts.cpython-311.pyc ADDED
Binary file (2.85 kB). View file
 
backend/worker/__pycache__/worker.cpython-311.pyc ADDED
Binary file (9.98 kB). View file
 
backend/worker/__pycache__/worker.cpython-314.pyc ADDED
Binary file (14.5 kB). View file
 
requirements.txt CHANGED
@@ -1,4 +1,18 @@
1
- fastapi
2
- uvicorn[standard]
3
- python-dotenv
4
- pydantic
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Google / Gmail
2
+ google-api-python-client==2.111.0
3
+ google-auth==2.27.0
4
+ google-auth-oauthlib==1.2.0
5
+
6
+ # OpenAI
7
+ openai==1.12.0
8
+
9
+ # PDF -> image
10
+ PyMuPDF==1.23.26
11
+ Pillow==10.2.0
12
+
13
+ # Utilities
14
+ python-dotenv==1.0.1
15
+ requests==2.31.0
16
+
17
+ fastapi==0.115.6
18
+ uvicorn==0.30.6