import modal

app = modal.App("census-qa-api-cpu")
vol_checkpoints = modal.Volume.from_name("model-checkpoints")

# CPU-only image (no CUDA)
image = modal.Image.debian_slim(python_version="3.10") \
    .pip_install(
        "torch", "transformers", "peft", "accelerate", "bitsandbytes",
        "scipy", "huggingface_hub", "protobuf", "sentencepiece", "fastapi"
    )


@app.cls(
    image=image,
    volumes={"/data/checkpoints": vol_checkpoints},
    cpu=4,        # Use CPU instead of GPU
    memory=8192,  # 8GB RAM
    keep_warm=1
)
class ModelCPU:
    @modal.enter()
    def load(self):
        from transformers import AutoModelForCausalLM, AutoTokenizer
        from peft import PeftModel

        print("Loading model on CPU...")

        # Load base model and tokenizer
        base_model = "microsoft/Phi-3-mini-4k-instruct"
        self.tokenizer = AutoTokenizer.from_pretrained(base_model)

        # Load base weights on CPU (no quantization on CPU)
        model = AutoModelForCausalLM.from_pretrained(
            base_model,
            torch_dtype="auto",
            device_map="cpu"
        )

        # Attach the fine-tuned LoRA adapter from the checkpoints volume
        self.model = PeftModel.from_pretrained(
            model,
            "/data/checkpoints/phi3-census-lora"
        )
        print("Model loaded on CPU!")

    @modal.web_endpoint(method="POST")
    def ask(self, data: dict):
        # Alpaca-style prompt matching the fine-tuning format
        prompt = f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{data.get('question', '')}

### Input:
{data.get('context', 'Context: Japan Census data.')}

### Response:
"""
        inputs = self.tokenizer([prompt], return_tensors="pt")
        # Low-temperature sampling; do_sample=True is required or temperature is ignored
        outputs = self.model.generate(
            **inputs, max_new_tokens=150, temperature=0.1, do_sample=True
        )
        response = self.tokenizer.batch_decode(outputs)[0]

        # Keep only the generated answer after the response marker
        if "### Response:\n" in response:
            answer = response.split("### Response:\n")[1].split("<|endoftext|>")[0].strip()
        else:
            answer = response.strip()

        return {"question": data.get('question'), "answer": answer}


@app.local_entrypoint()
def main():
    print("CPU-based API endpoint")
    print("Deploy with: modal deploy docs/api_endpoint_cpu.py")
    print("Note: CPU inference is 10-20x slower than GPU")
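
# Example client call (a sketch, not part of the Modal app): after `modal deploy`,
# Modal prints the endpoint URL for `ask`; the URL shape below is an assumption,
# and the question/context values are hypothetical illustrations.
#
#   import requests
#
#   resp = requests.post(
#       "https://<workspace>--census-qa-api-cpu-modelcpu-ask.modal.run",
#       json={
#           "question": "What was the total population recorded in the 2020 census?",
#           "context": "Context: Japan Census data.",
#       },
#   )
#   print(resp.json()["answer"])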