j-harishankar committed on
Commit
c002449
·
1 Parent(s): 03ee6af

Initial deployment

Browse files
Files changed (3) hide show
  1. Dockerfile +31 -0
  2. main.py +128 -0
  3. requirements.txt +12 -0
Dockerfile ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Use NVIDIA CUDA base image for GPU support
# If you don't have a GPU, use python:3.10-slim
FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04

# Set environment variables.
# key=value form: the space-separated legacy `ENV KEY value` form is
# deprecated and emits warnings on modern Docker builds.
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# Install system dependencies
RUN apt-get update && apt-get install -y \
    python3.10 \
    python3-pip \
    git \
    && rm -rf /var/lib/apt/lists/*

# Set working directory
WORKDIR /app

# Install Python dependencies.
# torch is installed first from the cu118 index so the generic
# `requirements.txt` install does not pull a mismatched CPU/CUDA build.
COPY requirements.txt .
RUN pip3 install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cu118
RUN pip3 install --no-cache-dir -r requirements.txt

# Copy application code
COPY . .

# Expose port (FastAPI default).
# NOTE(review): main.py's `__main__` path runs on 7860 — confirm which port
# the hosting platform expects; this image serves on 8000 via CMD below.
EXPOSE 8000

# Run the application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
main.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""FastAPI service exposing contract-LoRA text generation and sentence embeddings."""
import torch
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig
from peft import PeftModel
from sentence_transformers import SentenceTransformer
from typing import List, Optional
import time
import os

app = FastAPI(
    title="Model Deployment API",
    description="API for contract LoRA generation and text embeddings",
    version="1.0.0"
)

# --- Configuration ---
LORA_MODEL_ID = "shibinsha02/contract-lora"
BASE_MODEL_ID = "StevenChen16/llama3-8b-Lawyer"
EMBEDDING_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"

# Global model handles, populated by the startup event.
# Fix: the embedding model was previously instantiated here at import time
# (forced to CPU) and then loaded AGAIN in the startup hook — a redundant
# double load that slowed import and briefly doubled memory. It is now
# loaded exactly once, in load_models(), which runs before requests are
# served, so health/endpoint behavior is unchanged.
generation_pipeline = None
embedding_model = None
# --- Models ---
class GenerateRequest(BaseModel):
    """Request body for /generate: a prompt plus optional sampling knobs."""
    prompt: str
    max_new_tokens: Optional[int] = 128
    temperature: Optional[float] = 0.7
    top_p: Optional[float] = 0.9


class GenerateResponse(BaseModel):
    """Response body for /generate: the text and wall-clock generation time."""
    generated_text: str
    generation_time: float


class EmbeddingRequest(BaseModel):
    """Request body for /embeddings: the raw text to embed."""
    text: str


class EmbeddingResponse(BaseModel):
    """Response body for /embeddings: the vector and the model that produced it."""
    embedding: List[float]
    model: str
# --- Startup Event ---
@app.on_event("startup")
async def load_models():
    """Load the embedding model and the LoRA-adapted generation model.

    Runs once at server startup. If the generation model cannot be loaded
    (e.g. no GPU or bitsandbytes unavailable), generation_pipeline is left
    as None so /generate answers 503 while /embeddings keeps working.
    """
    global generation_pipeline, embedding_model

    print("Loading embedding model...")
    embedding_model = SentenceTransformer(EMBEDDING_MODEL_ID)

    print("Loading generation model (this might take a while)...")
    # Pick the device for the later pipeline placement decision.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Tokenizer is loaded outside the try: a failure here is fatal on purpose.
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID)

    # Try 4-bit quantization (fits Llama 3 8B on typical GPUs); on any
    # failure fall back to a disabled generation pipeline.
    try:
        quant_cfg = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
        )

        base_model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL_ID,
            quantization_config=quant_cfg,
            device_map="auto",
        )
        # Attach the contract LoRA adapter on top of the quantized base.
        lora_model = PeftModel.from_pretrained(base_model, LORA_MODEL_ID)

        generation_pipeline = pipeline(
            "text-generation",
            model=lora_model,
            tokenizer=tokenizer,
            device_map="auto" if device == "cuda" else None,
        )
    except Exception as exc:
        # Deliberate best-effort: keep the API up with generation disabled
        # (placeholder/mock path for local testing on insufficient hardware).
        print(f"Error loading generation model: {exc}")
        generation_pipeline = None
@app.get("/health")
async def health_check():
    """Report service liveness and which models finished loading."""
    embeddings_ready = embedding_model is not None
    generation_ready = generation_pipeline is not None
    return {
        "status": "healthy",
        "embeddings_loaded": embeddings_ready,
        "generation_loaded": generation_ready,
    }
@app.post("/embeddings", response_model=EmbeddingResponse)
async def get_embeddings(request: EmbeddingRequest):
    """Embed request.text with the sentence-transformer and return the vector.

    Responds 503 if the embedding model has not been loaded yet.
    """
    if embedding_model is None:
        raise HTTPException(status_code=503, detail="Embedding model not loaded")

    vector = embedding_model.encode(request.text)
    return EmbeddingResponse(embedding=vector.tolist(), model=EMBEDDING_MODEL_ID)
@app.post("/generate", response_model=GenerateResponse)
async def generate_text(request: GenerateRequest):
    """Generate text from request.prompt with the LoRA pipeline.

    Returns the generated text plus wall-clock generation time (seconds, 2 dp).
    Responds 503 if the generation model failed to load at startup.
    """
    if generation_pipeline is None:
        raise HTTPException(status_code=503, detail="Generation model not loaded or hardware insufficient")

    start_time = time.time()

    # Greedy decoding when temperature is 0 or explicitly null; sample otherwise.
    # Fix: the original `True if request.temperature > 0 else False` raised a
    # TypeError (unhandled 500) when a client sent `"temperature": null`,
    # which the Optional[float] field permits.
    do_sample = request.temperature is not None and request.temperature > 0

    outputs = generation_pipeline(
        request.prompt,
        max_new_tokens=request.max_new_tokens,
        temperature=request.temperature,
        top_p=request.top_p,
        do_sample=do_sample,
    )

    generated_text = outputs[0]["generated_text"]
    end_time = time.time()

    return GenerateResponse(
        generated_text=generated_text,
        generation_time=round(end_time - start_time, 2)
    )
# Local entry point: run a development server directly. (The Docker image
# launches uvicorn itself via CMD on port 8000; this path uses 7860.)
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
fastapi
uvicorn
transformers
peft
sentence-transformers
torch
pydantic
accelerate
bitsandbytes
python-multipart
python-dotenv
httpx