# !pip install llama-cpp-python
from fastapi import FastAPI
from llama_cpp import Llama

app = FastAPI()

# Load the quantized GGUF weights from the Hugging Face Hub at import time,
# so the model is resident before the first request is served.
llm = Llama.from_pretrained(
    repo_id="TheBloke/dolphin-2_6-phi-2-GGUF",
    filename="dolphin-2_6-phi-2.Q3_K_S.gguf",
)


@app.get("/")
def greet_json():
    """Health-check style root endpoint: return a static JSON greeting."""
    return {"Hello": "World!"}


@app.post("/chat")
def chat_completion(prompt: str = "No input example has been defined for this model task."):
    """Run a single-turn chat completion against the loaded model.

    The ``prompt`` string (a query parameter) becomes the sole user
    message; the raw completion payload from llama-cpp is returned
    unmodified as the HTTP response body.
    """
    completion = llm.create_chat_completion(
        messages=[{"role": "user", "content": prompt}]
    )
    return completion