import asyncio
import io
import logging
import time
from typing import Optional

from fastapi import FastAPI, HTTPException, Query
from fastapi.responses import StreamingResponse, JSONResponse
from llama_cpp import Llama
from pydantic import BaseModel
| |
|
| | |
| | app = FastAPI() |
| |
|
| | |
| | logging.basicConfig(level=logging.INFO) |
| | logger = logging.getLogger(__name__) |
| |
|
| | |
| | model_path = "./Qwen2-1.5B-Instruct.IQ3_M.gguf" |
| | try: |
| | llama_model = Llama( |
| | model_path=model_path, |
| | n_ctx=4096, |
| | n_threads=8, |
| | n_gpu_layers=35 |
| | ) |
| | logger.info(f"Model loaded successfully from {model_path}") |
| | except Exception as e: |
| | logger.error(f"Failed to load model: {e}") |
| | raise RuntimeError(f"Could not load model from {model_path}") |
| |
|
| | |
| | class Choice(BaseModel): |
| | text: str |
| | index: int |
| | logprobs: int = None |
| | finish_reason: str |
| |
|
| | class ResponseModel(BaseModel): |
| | id: str |
| | object: str |
| | created: int |
| | model: str |
| | choices: list[Choice] |
| |
|
| |
|
| | @app.get("/v1/completions") |
| | async def create_completion( |
| | prompt: str = Query(..., description="The prompt to complete"), |
| | model: str = "default", |
| | max_tokens: int = 256, |
| | temperature: float = 0.7, |
| | top_p: float = 1.0, |
| | n: int = 1, |
| | stream: bool = False |
| | ): |
| | try: |
| | logger.info(f"Received GET request with prompt: {prompt}") |
| |
|
| | if stream: |
| | |
| | async def generate(): |
| | logger.info(f"Generating streaming response for prompt: {prompt}") |
| | try: |
| | response = llama_model( |
| | prompt=prompt, |
| | max_tokens=max_tokens, |
| | temperature=temperature, |
| | stop=None, |
| | echo=True, |
| | ) |
| | generated_text = response["choices"][0]["text"] |
| | for chunk in generated_text.split('\n'): |
| | yield chunk |
| | await asyncio.sleep(0.1) |
| | except Exception as e: |
| | logger.error(f"Error during model inference: {e}") |
| | raise HTTPException(status_code=500, detail="Error generating response.") |
| | return StreamingResponse(generate(), media_type="text/plain") |
| |
|
| | else: |
| | |
| | logger.info(f"Generating non-streaming response for prompt: {prompt}") |
| | response = llama_model( |
| | prompt=prompt, |
| | max_tokens=max_tokens, |
| | temperature=temperature, |
| | stop=None, |
| | echo=True, |
| | ) |
| | |
| | generated_text = response["choices"][0]["text"] |
| |
|
| | |
| | response_data = ResponseModel( |
| | id="cmpl-xxxx", |
| | object="text_completion", |
| | created=int(time.time()), |
| | model=model, |
| | choices=[ |
| | Choice( |
| | text=generated_text, |
| | index=0, |
| | logprobs=None, |
| | finish_reason="stop" |
| | ) |
| | ] |
| | ) |
| | return JSONResponse(content=response_data.dict()) |
| |
|
| | except Exception as e: |
| | logger.error(f"Internal error: {e}") |
| | raise HTTPException(status_code=500, detail="Server error occurred.") |
| |
|
| | |
| | @app.get("/") |
| | async def root(): |
| | return {"message": "Welcome to the Phi-3 API"} |
| |
|
| | |
| | @app.get("/robots.txt") |
| | async def robots(): |
| | return StreamingResponse(io.StringIO("User-agent: *\nDisallow: /"), media_type="text/plain") |
| |
|
| | |
| | @app.get("/test_model") |
| | async def test_model(): |
| | test_prompt = "This is a test prompt." |
| | try: |
| | response = llama_model(prompt=test_prompt, max_tokens=10, temperature=0.7, echo=True) |
| | return {"response": response["choices"][0]["text"]} |
| | except Exception as e: |
| | logger.error(f"Model test error: {e}") |
| | return {"error": str(e)} |
| |
|
| | |
| | if __name__ == "__main__": |
| | import uvicorn |
| | uvicorn.run(app, host="0.0.0.0", port=7860) |
| |
|