"""FastAPI service exposing a text-generation endpoint backed by the
Hugging Face causal LM "devops-bda/Abap", loaded without quantization."""

import os  # retained from original file; not referenced directly here

from fastapi import FastAPI
from pydantic import BaseModel
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    AutoTokenizer,
    pipeline,
)

# Single source of truth for the checkpoint name (was repeated three times).
MODEL_NAME = "devops-bda/Abap"

# Load the configuration and strip any quantization config so the model
# loads in full precision (e.g. on CPU, where 4-bit kernels are unavailable).
config = AutoConfig.from_pretrained(MODEL_NAME)
if hasattr(config, "quantization_config"):
    del config.quantization_config  # safely delete it instead of setting to None

# Load the model and tokenizer without 4-bit quantization.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    config=config,
    device_map="auto",  # lets accelerate place the model (CPU or GPU)
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Text-generation pipeline built on the already-loaded model and tokenizer.
text_gen_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

app = FastAPI()


class InputData(BaseModel):
    # Prompt text to feed the generator.
    input_text: str


@app.get("/health")
async def health_check():
    """Liveness probe: if this handler runs, the module-level model load succeeded."""
    return {"status": "ok", "message": "Model is ready"}


@app.post("/predict")
def predict(data: InputData):
    """Generate text from the submitted prompt.

    Declared as a plain ``def`` (not ``async def``) so FastAPI executes the
    synchronous, CPU/GPU-bound pipeline call in its threadpool instead of
    blocking the asyncio event loop for the duration of generation. The HTTP
    interface (path, method, request/response bodies) is unchanged.
    """
    # NOTE(review): max_length caps the TOTAL token count, prompt included;
    # consider max_new_tokens if only the generated length should be bounded.
    output = text_gen_pipeline(data.input_text, max_length=500)
    return {"output": output}