Parsa2025AI committed on
Commit
1c241f3
·
verified ·
1 Parent(s): 15445a8

fast api app

Browse files
Files changed (3) hide show
  1. Dockerfile +21 -0
  2. app.py +53 -0
  3. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Slim CPU-only Python base image; switch to a CUDA-enabled base image if you plan to request a GPU.
2
+ FROM python:3.11-slim
3
+
4
+ # Create non-root user (required by Spaces Docker) and prepare workdir
5
+ RUN useradd -m -u 1000 user
6
+ USER user
7
+ ENV HOME=/home/user PATH=/home/user/.local/bin:$PATH
8
+ WORKDIR $HOME/app
9
+
10
+ # Copy files
11
+ COPY --chown=user requirements.txt ./
12
+ RUN pip install --no-cache-dir -r requirements.txt
13
+
14
+ COPY --chown=user app.py ./
15
+
16
+ # Spaces expects your app to listen on port 7860 unless overridden in README yaml
17
+ ENV PORT=7860
18
+ EXPOSE 7860
19
+
20
+ # Run FastAPI
21
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, io, time, contextlib, torch
2
+ from fastapi import FastAPI, UploadFile, File
3
+ from PIL import Image
4
+ from transformers import (VisionEncoderDecoderModel, AutoTokenizer, AutoImageProcessor,
5
+ BitsAndBytesConfig)
6
+
7
+ MODEL_ID = os.getenv("MODEL_ID", "Parsa2025AI/r2gen-swin-cerebras-ft")
8
+ GEN_MAX_LEN = int(os.getenv("GEN_MAX_LEN", "192"))
9
+ NUM_BEAMS = int(os.getenv("NUM_BEAMS", "1"))
10
+
11
+ app = FastAPI(title="R2Gen API (FastAPI on Spaces)")
12
+
13
+ # 4-bit NF4 quantization + auto device map. NOTE(review): bitsandbytes 4-bit has typically required a CUDA GPU -- confirm it loads on a CPU-only Space.
14
+ bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True,
15
+ bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16)
16
+
17
+ image_processor = AutoImageProcessor.from_pretrained(MODEL_ID)
18
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
19
+ model = VisionEncoderDecoderModel.from_pretrained(
20
+ MODEL_ID, quantization_config=bnb, device_map="auto", offload_folder="/data/offload"
21
+ )
22
+ model.eval()
23
+
24
+ # IDs for generation
25
+ if model.config.pad_token_id is None and tokenizer.pad_token_id is not None:
26
+ model.config.pad_token_id = tokenizer.pad_token_id
27
+ if model.config.eos_token_id is None and tokenizer.eos_token_id is not None:
28
+ model.config.eos_token_id = tokenizer.eos_token_id
29
+
30
@app.get("/health")
def health():
    """Liveness probe: report that the app is up and which model ID it is configured with."""
    status = {"ok": True, "model": MODEL_ID}
    return status
33
+
34
@app.post("/generate")
def generate(file: UploadFile = File(...)):
    """Generate text for a single uploaded image.

    The upload is decoded to RGB, preprocessed with ``image_processor``,
    passed through ``model.generate`` and decoded with ``tokenizer``.

    Args:
        file: Multipart upload containing the image bytes.

    Returns:
        A dict with ``"text"`` (the decoded, stripped generation) and ``"ms"``
        (elapsed time of generation plus decoding, in whole milliseconds).

    Raises:
        HTTPException: 400 if the upload is empty or cannot be decoded as an
            image (previously this surfaced as an unhandled 500).
    """
    # Local import keeps this block self-contained; fastapi is already a
    # dependency of this module.
    from fastapi import HTTPException

    raw = file.file.read()
    if not raw:
        raise HTTPException(status_code=400, detail="Uploaded file is empty.")

    try:
        # convert() returns a new, fully loaded image, so the source handle
        # can be closed as soon as the with-block exits.
        with Image.open(io.BytesIO(raw)) as src:
            img = src.convert("RGB")
    except (OSError, ValueError) as err:
        # Pillow's UnidentifiedImageError is an OSError subclass; truncated
        # files also raise OSError while loading.
        raise HTTPException(
            status_code=400, detail="Uploaded file is not a readable image."
        ) from err

    inputs = image_processor(img, return_tensors="pt")

    # Match encoder dtype/device (important when quantized/offloaded)
    enc_param = next(model.encoder.parameters())
    pixel_values = inputs.pixel_values.to(
        device=enc_param.device, dtype=enc_param.dtype
    )

    gen_kwargs = dict(
        max_length=GEN_MAX_LEN,
        num_beams=NUM_BEAMS,
        pad_token_id=model.config.pad_token_id,
        eos_token_id=model.config.eos_token_id,
    )

    # Autocast only applies when the encoder sits on CUDA in a half-precision
    # dtype; otherwise run without any autocast context.
    use_amp = enc_param.device.type == "cuda" and enc_param.dtype in (
        torch.float16,
        torch.bfloat16,
    )
    ctx = (
        torch.autocast("cuda", dtype=enc_param.dtype)
        if use_amp
        else contextlib.nullcontext()
    )

    # perf_counter is monotonic, so the reported latency cannot be skewed by
    # wall-clock adjustments.
    t0 = time.perf_counter()
    with torch.inference_mode(), ctx:
        out = model.generate(pixel_values=pixel_values, **gen_kwargs)
    text = tokenizer.decode(out[0], skip_special_tokens=True).strip()
    return {"text": text, "ms": int((time.perf_counter() - t0) * 1000)}
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ transformers>=4.42
4
+ accelerate
5
+ bitsandbytes
6
+ torch
7
+ pillow