# Phi-3-api / app.py
from fastapi import FastAPI, HTTPException, Query
from fastapi.responses import StreamingResponse, JSONResponse
from pydantic import BaseModel
from llama_cpp import Llama
from typing import Optional
import time
import io
import uuid
import asyncio
import logging
# Initialize FastAPI app
app = FastAPI()
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Load the GGUF model with llama-cpp-python
# (note: the configured checkpoint is a Qwen2-1.5B-Instruct quant, not Phi-3)
model_path = "./Qwen2-1.5B-Instruct.IQ3_M.gguf"  # Ensure this path is correct
try:
llama_model = Llama(
model_path=model_path,
n_ctx=4096,
n_threads=8,
n_gpu_layers=35
)
logger.info(f"Model loaded successfully from {model_path}")
except Exception as e:
    logger.error(f"Failed to load model: {e}")
    raise RuntimeError(f"Could not load model from {model_path}") from e
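
# Note: the GGUF file must exist at model_path before the app starts.
# n_gpu_layers=35 only has an effect when llama-cpp-python is built with GPU
# support; on a CPU-only build all layers remain on the CPU.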
# Response model based on OpenAI API structure
class Choice(BaseModel):
text: str
index: int
    logprobs: Optional[int] = None
finish_reason: str
class ResponseModel(BaseModel):
id: str
object: str
created: int
model: str
choices: list[Choice]
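
# For reference, a serialized ResponseModel looks roughly like this
# (values are illustrative, not taken from a real completion):
# {
#   "id": "cmpl-xxxx",
#   "object": "text_completion",
#   "created": 1700000000,
#   "model": "default",
#   "choices": [
#     {"text": "...", "index": 0, "logprobs": null, "finish_reason": "stop"}
#   ]
# }

# Note: this endpoint accepts GET with query parameters; the official OpenAI
# completions endpoint uses POST with a JSON body.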
@app.get("/v1/completions")
async def create_completion(
prompt: str = Query(..., description="The prompt to complete"),
model: str = "default",
max_tokens: int = 256,
temperature: float = 0.7,
top_p: float = 1.0,
n: int = 1,
stream: bool = False
):
try:
logger.info(f"Received GET request with prompt: {prompt}")
        if stream:
            # Pseudo-streaming over GET: the full completion is generated first,
            # then replayed line by line with a short delay.
            async def generate():
                logger.info(f"Generating streaming response for prompt: {prompt}")
                try:
                    response = llama_model(
                        prompt=prompt,
                        max_tokens=max_tokens,
                        temperature=temperature,
                        top_p=top_p,
                        stop=None,
                        echo=True,
                    )
                    generated_text = response["choices"][0]["text"]
                    for chunk in generated_text.split('\n'):
                        yield chunk + '\n'  # re-append the newline removed by split()
                        await asyncio.sleep(0.1)  # simulate delay for streaming effect
                except Exception as e:
                    # The response has already started streaming, so an HTTPException
                    # can no longer reach the client; log and close with an error line.
                    logger.error(f"Error during model inference: {e}")
                    yield "\n[error] generation failed\n"
            return StreamingResponse(generate(), media_type="text/plain")
else:
# Non-streaming JSON response
logger.info(f"Generating non-streaming response for prompt: {prompt}")
            response = llama_model(
                prompt=prompt,
                max_tokens=max_tokens,
                temperature=temperature,
                top_p=top_p,
                stop=None,
                echo=True,
            )
generated_text = response["choices"][0]["text"]
# Build the response in OpenAI's format
response_data = ResponseModel(
id="cmpl-xxxx", # Replace with unique ID generator if needed
object="text_completion",
created=int(time.time()),
model=model,
choices=[
Choice(
text=generated_text,
index=0,
logprobs=None,
finish_reason="stop"
)
]
)
return JSONResponse(content=response_data.dict())
except Exception as e:
logger.error(f"Internal error: {e}")
raise HTTPException(status_code=500, detail="Server error occurred.")
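
# Example requests (assuming the server is running locally on port 7860):
#   curl "http://localhost:7860/v1/completions?prompt=Hello&max_tokens=32"
#   curl "http://localhost:7860/v1/completions?prompt=Hello&max_tokens=32&stream=true"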
# Handle root route to avoid 404 errors
@app.get("/")
async def root():
return {"message": "Welcome to the Phi-3 API"}
# Handle robots.txt to avoid unnecessary 404 errors
@app.get("/robots.txt")
async def robots():
return StreamingResponse(io.StringIO("User-agent: *\nDisallow: /"), media_type="text/plain")
# Test the model in isolation (optional, you can use this to debug locally)
@app.get("/test_model")
async def test_model():
test_prompt = "This is a test prompt."
try:
response = llama_model(prompt=test_prompt, max_tokens=10, temperature=0.7, echo=True)
return {"response": response["choices"][0]["text"]}
except Exception as e:
logger.error(f"Model test error: {e}")
return {"error": str(e)}
# Main entry point
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=7860)
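
# Alternatively (assuming this file is saved as app.py), start the server with
# the uvicorn CLI:
#   uvicorn app:app --host 0.0.0.0 --port 7860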