Avinashnalla7 committed on
Commit
4478465
·
1 Parent(s): 1b42251

deploy: real pdf-trainer-demo worker

Browse files
Files changed (7) hide show
  1. .gitattributes +0 -35
  2. .gitignore +2 -2
  3. Dockerfile +20 -6
  4. README.md +0 -10
  5. requirements.txt +18 -3
  6. sftp_store.py +0 -89
  7. worker.py +0 -173
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore CHANGED
@@ -1,5 +1,5 @@
1
  __pycache__/
2
  *.pyc
3
- *.pyo
4
- *.pyd
5
  .DS_Store
 
 
 
1
  __pycache__/
2
  *.pyc
 
 
3
  .DS_Store
4
+ .env
5
+ _local_secrets/
Dockerfile CHANGED
@@ -1,11 +1,25 @@
1
- FROM python:3.11-slim
 
 
2
  WORKDIR /app
3
 
4
- COPY requirements.txt .
5
- RUN pip install --no-cache-dir -r requirements.txt
6
 
7
- # IMPORTANT: copy everything (worker.py + sftp_store.py + future files)
8
  COPY . .
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
- ENV PYTHONUNBUFFERED=1
11
- CMD ["python", "worker.py"]
 
1
+ # ---- Build stage ----
2
+ FROM node:20-alpine AS build
3
+
4
  WORKDIR /app
5
 
6
+ COPY package.json package-lock.json ./
7
+ RUN npm ci
8
 
 
9
  COPY . .
10
+ RUN npm run build
11
+
12
+ # ---- Runtime stage ----
13
+ FROM nginx:alpine
14
+
15
+ # Remove default nginx config
16
+ RUN rm /etc/nginx/conf.d/default.conf
17
+
18
+ # Custom nginx config
19
+ COPY nginx.conf /etc/nginx/conf.d/default.conf
20
+
21
+ # Copy built assets
22
+ COPY --from=build /app/dist /usr/share/nginx/html
23
 
24
+ EXPOSE 7860
25
+ CMD ["nginx", "-g", "daemon off;"]
README.md DELETED
@@ -1,10 +0,0 @@
1
- ---
2
- title: Pdf Trainer Worker
3
- emoji: 🏆
4
- colorFrom: indigo
5
- colorTo: red
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,3 +1,18 @@
1
- requests
2
- python-dotenv
3
- paramiko
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Google / Gmail
2
+ google-api-python-client==2.111.0
3
+ google-auth==2.27.0
4
+ google-auth-oauthlib==1.2.0
5
+
6
+ # OpenAI
7
+ openai==1.12.0
8
+
9
+ # PDF -> image
10
+ PyMuPDF==1.23.26
11
+ Pillow==10.2.0
12
+
13
+ # Utilities
14
+ python-dotenv==1.0.1
15
+ requests==2.31.0
16
+
17
+ fastapi==0.115.6
18
+ uvicorn==0.30.6
sftp_store.py DELETED
@@ -1,89 +0,0 @@
1
- import os
2
- import posixpath
3
- import paramiko
4
-
5
-
6
- def _env(name: str) -> str:
7
- v = (os.getenv(name) or "").strip()
8
- if not v:
9
- raise RuntimeError(f"Missing env var: {name}")
10
- return v
11
-
12
-
13
- def sftp_test() -> str:
14
- host = _env("SFTP_HOST")
15
- port = int(_env("SFTP_PORT"))
16
- user = _env("SFTP_USER")
17
- pw = _env("SFTP_PASS")
18
- root = _env("SFTP_ROOT").rstrip("/") or "/"
19
-
20
- transport = paramiko.Transport((host, port))
21
- transport.connect(username=user, password=pw)
22
- sftp = paramiko.SFTPClient.from_transport(transport)
23
-
24
- try:
25
- sftp.chdir(root)
26
- sftp.listdir(".")
27
- return "sftp ok"
28
- finally:
29
- try:
30
- sftp.close()
31
- finally:
32
- transport.close()
33
-
34
-
35
- def upload_bytes(remote_path: str, data: bytes) -> None:
36
- """
37
- Upload to SFTP_ROOT/<remote_path>.
38
- remote_path should be RELATIVE (e.g. "pdfs/abc.pdf").
39
- If someone passes "/inserio/pdfs/abc.pdf", we normalize it.
40
- """
41
- host = _env("SFTP_HOST")
42
- port = int(_env("SFTP_PORT"))
43
- user = _env("SFTP_USER")
44
- pw = _env("SFTP_PASS")
45
- root = _env("SFTP_ROOT").rstrip("/") or "/"
46
-
47
- transport = paramiko.Transport((host, port))
48
- transport.connect(username=user, password=pw)
49
- sftp = paramiko.SFTPClient.from_transport(transport)
50
-
51
- try:
52
- # go to the shared root first
53
- sftp.chdir(root)
54
-
55
- # normalize remote_path relative to root (avoid /inserio/inserio/... double prefix)
56
- root_rel = root.strip("/")
57
- rp = remote_path.lstrip("/")
58
- if root_rel and rp.startswith(root_rel + "/"):
59
- rp = rp[len(root_rel) + 1 :]
60
- remote_path = rp
61
-
62
- # ensure directories exist (relative to root)
63
- parts = remote_path.split("/")[:-1]
64
- cur = "."
65
- for part in parts:
66
- if not part:
67
- continue
68
- cur = posixpath.join(cur, part)
69
- try:
70
- sftp.mkdir(cur)
71
- except Exception:
72
- pass
73
-
74
- # write file relative to root
75
- full = posixpath.join(".", remote_path)
76
- with sftp.open(full, "wb") as f:
77
- f.write(data)
78
- finally:
79
- try:
80
- sftp.close()
81
- finally:
82
- transport.close()
83
-
84
-
85
- def upload_pdf(pdf_id: str, data: bytes) -> str:
86
- # Stores under: pdfs/{pdf_id}.pdf (relative to SFTP_ROOT)
87
- remote_path = f"pdfs/{pdf_id}.pdf"
88
- upload_bytes(remote_path, data)
89
- return remote_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
worker.py DELETED
@@ -1,173 +0,0 @@
1
- import os
2
- import time
3
- import base64
4
- import threading
5
- import requests
6
- from http.server import BaseHTTPRequestHandler, HTTPServer
7
-
8
- from sftp_store import upload_pdf
9
- from sftp_store import sftp_test
10
-
11
- import sftp_store
12
-
13
- API_BASE = os.getenv("API_BASE", "").rstrip("/")
14
- WORKER_TOKEN = os.getenv("WORKER_TOKEN", "")
15
- PORT = int(os.getenv("PORT", "7860"))
16
- POLL = int(os.getenv("POLL_INTERVAL_SECONDS", "10"))
17
-
18
- if not API_BASE or not WORKER_TOKEN:
19
- raise RuntimeError("Missing API_BASE or WORKER_TOKEN")
20
-
21
-
22
- class Handler(BaseHTTPRequestHandler):
23
-
24
- def do_HEAD(self):
25
- path = self.path.split("?", 1)[0]
26
-
27
- if path in ("/", "/health"):
28
- self.send_response(200)
29
- self.end_headers()
30
- return
31
-
32
- if path == "/sftp-test":
33
- # no body on HEAD
34
- self.send_response(200)
35
- self.end_headers()
36
- return
37
-
38
- # if you have /api/pdf routes in worker, add them here too
39
- self.send_response(404)
40
- self.end_headers()
41
-
42
- def do_GET(self):
43
- path = self.path.split('?', 1)[0]
44
-
45
- if path in ('/', '/health'):
46
- self.send_response(200)
47
- self.end_headers()
48
- self.wfile.write(b"ok\n")
49
- return
50
-
51
- if path == '/sftp-test':
52
- try:
53
- msg = sftp_store.sftp_test()
54
- self.send_response(200)
55
- self.end_headers()
56
- self.wfile.write((msg + "\n").encode('utf-8'))
57
- except Exception as e:
58
- self.send_response(500)
59
- self.end_headers()
60
- self.wfile.write((f"sftp fail: {e}\n").encode('utf-8'))
61
- return
62
-
63
- self.send_response(404)
64
- self.end_headers()
65
-
66
-
67
- def serve_http():
68
- HTTPServer(("0.0.0.0", PORT), Handler).serve_forever()
69
-
70
-
71
- def _api_headers() -> dict:
72
- # API expects X-Worker-Token (NOT Authorization Bearer)
73
- return {"X-Worker-Token": WORKER_TOKEN}
74
-
75
- def _job_status(job_id: str, status: str, message: str = "") -> None:
76
- try:
77
- requests.post(
78
- f"{API_BASE}/api/jobs/{job_id}/status",
79
- headers=_api_headers(),
80
- json={"status": status, "message": message},
81
- timeout=10,
82
- )
83
- except Exception as e:
84
- print(f"status update failed: {e}", flush=True)
85
-
86
- def process_job(job: dict) -> None:
87
- # API returns job dict (asdict(Job)) => id field is the canonical one
88
- jid = job.get("id") or job.get("job_id")
89
- if not jid:
90
- raise RuntimeError("job missing id/job_id")
91
-
92
- payload = job.get("payload") or {}
93
- action = payload.get("action")
94
-
95
- if action == "upload_pdf":
96
- pdf_id = payload.get("pdf_id")
97
- pdf_b64 = payload.get("pdf_b64")
98
- if not pdf_id or not pdf_b64:
99
- raise RuntimeError("upload_pdf requires payload.pdf_id and payload.pdf_b64")
100
-
101
- # decode
102
- try:
103
- pdf_bytes = base64.b64decode(pdf_b64)
104
- except Exception as e:
105
- raise RuntimeError(f"invalid base64 in pdf_b64: {e}")
106
-
107
- # upload
108
- print(f"uploading pdf {pdf_id} ({len(pdf_bytes)} bytes)", flush=True)
109
- remote_path = upload_pdf(pdf_id, pdf_bytes)
110
- print(f"uploaded to {remote_path}", flush=True)
111
- _job_status(jid, "done", f"uploaded:{remote_path}")
112
- print(f"job {jid} done -> {remote_path}", flush=True)
113
- return
114
-
115
- # unknown action -> fail but don't crash loop
116
- _job_status(jid, "failed", f"unknown action: {action}")
117
- print(f"job {jid} failed: unknown action {action}", flush=True)
118
-
119
-
120
- def poll_loop():
121
- print("Worker polling started", flush=True)
122
-
123
- while True:
124
- try:
125
- r = requests.get(
126
- f"{API_BASE}/api/jobs/next",
127
- headers={"X-Worker-Token": WORKER_TOKEN},
128
- timeout=10,
129
- )
130
-
131
- print(f"poll /api/jobs/next -> {r.status_code}", flush=True)
132
-
133
- if r.status_code == 200:
134
- job = r.json()
135
- job_id = job.get("job_id") or job.get("id")
136
- print(f"claimed job {job_id}", flush=True)
137
-
138
- payload = job.get("payload", {})
139
-
140
- try:
141
- action = payload.get("action")
142
-
143
- if action == "upload_pdf":
144
- pdf_id = payload["pdf_id"]
145
- pdf_b64 = payload["pdf_b64"]
146
-
147
- import base64
148
- from sftp_store import upload_bytes
149
-
150
- data = base64.b64decode(pdf_b64)
151
- remote_path = f"pdfs/{pdf_id}.pdf"
152
-
153
- print(f"uploading pdf {pdf_id} ({len(data)} bytes)", flush=True)
154
- upload_bytes(remote_path, data)
155
- print(f"uploaded to {remote_path}", flush=True)
156
-
157
- else:
158
- print(f"unknown action: {action}", flush=True)
159
-
160
- except Exception as e:
161
- print(f"job {job_id} failed: {e}", flush=True)
162
- # processing placeholder
163
-
164
- except Exception as e:
165
- print(f"poll error: {e}", flush=True)
166
-
167
- time.sleep(POLL)
168
-
169
-
170
- print("PDF Trainer Worker booted", flush=True)
171
-
172
- threading.Thread(target=serve_http, daemon=True).start()
173
- poll_loop()