# sdlc-agent / src/tools/api_endpoint_cpu.py
# Veeru-c - initial commit (06bd253)
import modal
# Modal application object; the class and entrypoint below register on it.
app = modal.App("census-qa-api-cpu")

# Named Modal volume, mounted into the container at /data/checkpoints.
vol_checkpoints = modal.Volume.from_name("model-checkpoints")

# Container image for CPU inference: Debian slim + Python 3.10 + HF stack.
# NOTE(review): bitsandbytes is installed although the model is loaded
# without quantization on CPU - confirm it is still needed.
image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install(
        "torch",
        "transformers",
        "peft",
        "accelerate",
        "bitsandbytes",
        "scipy",
        "huggingface_hub",
        "protobuf",
        "sentencepiece",
        "fastapi",
    )
)
@app.cls(
    image=image,
    volumes={"/data/checkpoints": vol_checkpoints},
    cpu=4,  # Use CPU instead of GPU
    memory=8192,  # 8GB RAM
    keep_warm=1,
)
class ModelCPU:
    """Serve the LoRA-adapted Phi-3 census QA model from a CPU container.

    ``load`` runs on container start and populates ``self.tokenizer`` and
    ``self.model``; ``ask`` is the POST web endpoint that answers a question.
    """

    @modal.enter()
    def load(self):
        """Load the tokenizer, the base model and the LoRA adapter on CPU."""
        # Imported here so the heavy libraries are only needed inside the
        # container image, not on the machine that deploys the app.
        from transformers import AutoModelForCausalLM, AutoTokenizer
        from peft import PeftModel

        print("Loading model on CPU...")

        # Load base model
        base_model = "microsoft/Phi-3-mini-4k-instruct"
        self.tokenizer = AutoTokenizer.from_pretrained(base_model)

        # Load with PEFT adapter (no quantization on CPU)
        model = AutoModelForCausalLM.from_pretrained(
            base_model,
            torch_dtype="auto",
            device_map="cpu",
        )

        # Load LoRA adapter from the mounted checkpoint volume
        self.model = PeftModel.from_pretrained(
            model,
            "/data/checkpoints/phi3-census-lora",
        )
        print("Model loaded on CPU!")

    @modal.web_endpoint(method="POST")
    def ask(self, data: dict):
        """Answer a question with the fine-tuned model.

        Args:
            data: JSON body. ``question`` is the instruction text (defaults
                to an empty string); ``context`` is optional extra input
                (defaults to ``"Context: Japan Census data."``).

        Returns:
            A dict with the original ``question`` value (``None`` when it was
            not supplied) and the generated ``answer`` text.
        """
        question = data.get("question", "")
        context = data.get("context", "Context: Japan Census data.")

        # NOTE(review): this template presumably has to match the one used
        # during fine-tuning - confirm before changing any whitespace.
        prompt = (
            "Below is an instruction that describes a task, paired with an "
            "input that provides further context. Write a response that "
            "appropriately completes the request.\n"
            "### Instruction:\n"
            f"{question}\n"
            "### Input:\n"
            f"{context}\n"
            "### Response:\n"
        )

        inputs = self.tokenizer([prompt], return_tensors="pt")
        prompt_len = inputs["input_ids"].shape[1]

        # Greedy decoding, stated explicitly: a temperature is only honoured
        # when sampling is enabled, so passing one alone has no effect.
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=150,
            do_sample=False,
        )

        # Decode only the tokens produced after the prompt. Splitting the
        # full decoded text on "### Response:" breaks when the user's
        # question or context contains that marker, and can leak the prompt
        # or special tokens into the answer.
        new_tokens = outputs[0][prompt_len:]
        answer = self.tokenizer.decode(
            new_tokens, skip_special_tokens=True
        ).strip()

        return {"question": data.get("question"), "answer": answer}
@app.local_entrypoint()
def main():
    """Print a short description of this app and how to deploy it."""
    print("CPU-based API endpoint")
    # The deploy hint uses this file's repository path (src/tools/), not
    # docs/, so the printed command can be copied and run as-is.
    print("Deploy with: modal deploy src/tools/api_endpoint_cpu.py")
    print("Note: CPU inference is 10-20x slower than GPU")