Loomis Green committed on
Commit
e3e877e
·
1 Parent(s): 71394ea

Deploy Google Flan T5 FastAPI Docker app

Browse files
Files changed (10) hide show
  1. .dockerignore +5 -0
  2. .gitignore +5 -0
  3. DS_Store +0 -0
  4. Dockerfile +8 -37
  5. README.md +23 -13
  6. __pycache__/app.cpython-311.pyc +0 -0
  7. app.py +19 -0
  8. docker-compose.yml +11 -0
  9. main.py +0 -78
  10. requirements.txt +3 -3
.dockerignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ __pycache__
2
+ .DS_Store
3
+ venv
4
+ .env
5
+ .git
.gitignore ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ __pycache__
2
+ .DS_Store
3
+ venv
4
+ .env
5
+ .git
DS_Store ADDED
Binary file (6.15 kB). View file
 
Dockerfile CHANGED
@@ -1,45 +1,16 @@
1
- # Use standard Python 3.10 slim image (Lightweight & Compatible)
2
  FROM python:3.10-slim
3
 
4
- WORKDIR /app
5
 
6
- # Install system libraries required for the CPU runner and building wheels
7
- # build-essential & cmake: required if fallback to source build occurs
8
- # libopenblas-dev: for optimized matrix operations
9
- # libgomp1: for OpenMP
10
- RUN apt-get update && apt-get install -y \
11
- build-essential \
12
- cmake \
13
- libopenblas-dev \
14
- libgomp1 \
15
- curl \
16
- && rm -rf /var/lib/apt/lists/*
17
 
18
- # Upgrade pip to ensure it handles wheels correctly
19
- RUN pip install --upgrade pip --default-timeout=1000
20
 
21
- # Install Python dependencies (FastAPI, Uvicorn, etc.)
22
- COPY requirements.txt .
23
- RUN pip install --no-cache-dir -r requirements.txt --default-timeout=1000
24
 
25
- # -----------------------------------------------------------------------------
26
- # INSTALL PRE-COMPILED LLAMA-CPP-PYTHON
27
- # -----------------------------------------------------------------------------
28
- # We install from the 'cpu' specific index.
29
- # We added build-essential and cmake above so that if a wheel isn't found,
30
- # it can successfully build from source without erroring out.
31
- RUN pip install llama-cpp-python \
32
- --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cpu \
33
- --default-timeout=1000
34
 
35
- # Copy application code
36
- COPY . .
37
 
38
- # Create model cache directory
39
- RUN mkdir -p /app/model_cache && chmod 777 /app/model_cache
40
-
41
- # Expose port
42
- EXPOSE 7860
43
-
44
- # Start the application
45
- CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
 
 
1
  FROM python:3.10-slim
2
 
3
+ RUN useradd user
4
 
5
+ USER user
 
 
 
 
 
 
 
 
 
 
6
 
7
+ ENV HOME=/home/user \
8
+ PATH=/home/user/.local/bin:$PATH
9
 
10
+ WORKDIR $HOME/app
 
 
11
 
12
+ COPY --chown=user ./ $HOME/app
 
 
 
 
 
 
 
 
13
 
14
+ RUN pip install -r requirements.txt
 
15
 
16
+ CMD fastapi run --reload --host=0.0.0.0 --port=7860
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,21 +1,31 @@
1
  ---
2
- title: Personal Coder AI
3
- emoji: 👨‍💻
4
- colorFrom: indigo
5
- colorTo: blue
6
  sdk: docker
7
  pinned: false
8
- license: apache-2.0
9
- short_description: Qwen 2.5 Coder 7B (GGUF/CPU Version)
10
  ---
11
 
12
- # Personal Coder AI (CPU Version)
13
 
14
- This Space runs **Qwen 2.5 Coder 7B (GGUF)** on standard CPU.
15
- * **No Quotas:** Unlimited usage.
16
- * **Speed:** Slower than GPU, but reliable.
17
 
18
- ## API Usage
 
 
 
 
 
19
 
20
- Endpoint: `POST /chat`
21
- JSON: `{"prompt": "Write a python script..."}`
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Google Flan Fastapi
3
+ emoji: 👁
4
+ colorFrom: green
5
+ colorTo: gray
6
  sdk: docker
7
  pinned: false
8
+ license: mit
 
9
  ---
10
 
11
+ # Huggingface Spaces for Docker with FastAPI
12
 
13
+ ## Overview
14
+ This repository contains a simple example of how to deploy a Huggingface model using Docker and FastAPI. The model used is the `google/flan-t5-base` model from the Huggingface model hub.
 
15
 
16
+ ## Usage
17
+ To run the FastAPI server, you can use the following command:
18
+ ```bash
19
+ docker compose up --build
20
+ ```
21
+ Then visit `http://localhost:7860/docs` to see the API documentation.
22
 
23
+ ## Deployment to Hugging Face Spaces
24
+ 1. Create a new Space on Hugging Face (SDK: Docker).
25
+ 2. Push these files to the Space's repository.
26
+ 3. The Dockerfile will automatically build and serve the app on port 7860.
27
+
28
+
29
+ ## Useful Links
30
+ - [Google Flan T5 Base Model](https://huggingface.co/google/flan-t5-base)
31
+ - [Files](https://huggingface.co/spaces/sarthaksavvy/google-flan-fastapi/tree/main)
__pycache__/app.cpython-311.pyc ADDED
Binary file (1.01 kB). View file
 
app.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import FastAPI
2
+ # Use a pipeline as a high-level helper
3
+ from transformers import pipeline
4
+
5
+ pipe = pipeline("text2text-generation", model="google/flan-t5-base")
6
+
7
+ app = FastAPI()
8
+
9
+
10
+ @app.get('/')
11
+ def home():
12
+ return {"message": "Loomyloo Gateway API is running"}
13
+
14
+
15
+ @app.get('/ask')
16
+ def ask(prompt: str):
17
+ # This uses the google/flan-t5-base model loaded above
18
+ result = pipe(prompt)
19
+ return result[0]
docker-compose.yml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.8'
2
+
3
+ services:
4
+ app:
5
+ build: .
6
+ ports:
7
+ - "7860:7860"
8
+ volumes:
9
+ - .:/home/user/app
10
+ environment:
11
+ - TRANSFORMERS_CACHE=/home/user/app/cache
main.py DELETED
@@ -1,78 +0,0 @@
1
- import os
2
- from fastapi import FastAPI, HTTPException
3
- from pydantic import BaseModel
4
- from llama_cpp import Llama
5
- from huggingface_hub import hf_hub_download
6
-
7
- app = FastAPI()
8
-
9
- # ---------------------------------------------------------
10
- # CONFIGURATION
11
- # ---------------------------------------------------------
12
- # We use Qwen 2.5 Coder 7B (GGUF) - Small & Fast on CPU
13
- REPO_ID = "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF"
14
- FILENAME = "qwen2.5-coder-7b-instruct-q4_k_m.gguf"
15
- MODEL_PATH = f"./model_cache/{FILENAME}"
16
-
17
- # Global model variable
18
- llm = None
19
-
20
- def load_model():
21
- global llm
22
- if not os.path.exists(MODEL_PATH):
23
- print(f"📥 Downloading {FILENAME} from Hugging Face...")
24
- hf_hub_download(
25
- repo_id=REPO_ID,
26
- filename=FILENAME,
27
- local_dir="./model_cache",
28
- local_dir_use_symlinks=False
29
- )
30
- print("✅ Download complete.")
31
-
32
- print("🚀 Loading Model into RAM...")
33
- # n_ctx=8192 allows for decent context window
34
- # n_threads=2 is optimized for Hugging Face Free Tier (2 vCPUs)
35
- llm = Llama(model_path=MODEL_PATH, n_ctx=8192, n_threads=2)
36
- print("✅ Model Loaded!")
37
-
38
- # Load model on startup
39
- @app.on_event("startup")
40
- def startup_event():
41
- load_model()
42
-
43
- # ---------------------------------------------------------
44
- # API ENDPOINTS
45
- # ---------------------------------------------------------
46
-
47
- class ChatRequest(BaseModel):
48
- prompt: str
49
-
50
- @app.get("/")
51
- def read_root():
52
- return {"status": "running", "model": REPO_ID}
53
-
54
- @app.post("/chat")
55
- def chat(request: ChatRequest):
56
- global llm
57
- if not llm:
58
- raise HTTPException(status_code=500, detail="Model not loaded")
59
-
60
- # Format prompt for Qwen (ChatML style is best, but basic instruct works)
61
- # Simple Instruct Format:
62
- formatted_prompt = f"<|im_start|>user\n{request.prompt}<|im_end|>\n<|im_start|>assistant\n"
63
-
64
- print(f"📩 Generating response for: {request.prompt[:50]}...")
65
-
66
- output = llm(
67
- formatted_prompt,
68
- max_tokens=1024,
69
- stop=["<|im_end|>", "User:"],
70
- echo=False
71
- )
72
-
73
- response_text = output['choices'][0]['text']
74
- return {"response": response_text.strip()}
75
-
76
- if __name__ == "__main__":
77
- import uvicorn
78
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
  fastapi
2
  uvicorn
3
- huggingface_hub
4
- pydantic
5
- # llama-cpp-python is installed manually in the Dockerfile
 
1
  fastapi
2
  uvicorn
3
+ transformers
4
+ torch
5
+ torchvision