import logging
from time import time

import fastapi
from fastapi.responses import JSONResponse
import llama_cpp
import llama_cpp.llama_tokenizer
from pydantic import BaseModel


class GenModel(BaseModel):
    question: str
    system: str = "You are a helpful medical AI assistant. Help as much as you can. Remember, respond in English."
    temperature: float = 0.8
    seed: int = 101
    mirostat_mode: int = 2
    mirostat_tau: float = 4.0
    mirostat_eta: float = 1.1
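# Example request body accepted by the endpoints below (a minimal sketch; every
# field other than "question" is optional and falls back to the defaults above):
#
#   {
#       "question": "What are the common symptoms of dehydration?",
#       "temperature": 0.7,
#       "seed": 101
#   }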
# Two quantised Qwen1.5-0.5B-Chat instances: a smaller-context one for /chat and
# a larger-context one for /generate.
llm_chat = llama_cpp.Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
    filename="*q4_0.gguf",
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B"),
    verbose=False,
    n_ctx=1024,
    n_gpu_layers=0,
)

llm_generate = llama_cpp.Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
    filename="*q4_0.gguf",
    tokenizer=llama_cpp.llama_tokenizer.LlamaHFTokenizer.from_pretrained("Qwen/Qwen1.5-0.5B"),
    verbose=False,
    n_ctx=4096,
    n_gpu_layers=0,
    mirostat_mode=2,   # Mirostat v2 sampling
    mirostat_tau=4.0,  # target entropy (tau)
    mirostat_eta=1.1,  # learning rate (eta)
)
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = fastapi.FastAPI(
    title="OpenGenAI",
    description="Your Excellent AI Physician",
)
| """ |
| app.add_middleware( |
| CORSMiddleware, |
| allow_origins = ["*"], |
| allow_credentials=True, |
| allow_methods=["*"], |
| allow_headers=["*"] |
| ) |
| """ |
| @app.get("/") |
| def index(): |
| return fastapi.responses.RedirectResponse(url="/docs") |
|
|
|
|
| @app.get("/health") |
| def health(): |
| return {"status": "ok"} |
| |
| |
| @app.post("/chat/") |
| async def chat(gen:GenModel): |
| try: |
| messages=[ |
| {"role": "assistant", "content": gen.system}, |
| ] |
| st = time() |
| |
| messages.append({"role": "user", "content": gen.question}) |
| output = llm_chat.create_chat_completion( |
| messages = messages, |
| temperature=gen.temperature, |
| seed=gen.seed, |
| |
| ) |
| |
| print(output) |
| et = time() |
| output["time"] = et - st |
| messages.append({'role': "assistant", "content": output['choices'][0]['message']['content']}) |
| |
| return output |
    except Exception as e:
        logger.error(f"Error in /chat endpoint: {e}")
        return JSONResponse(
            status_code=500, content={"message": "Internal Server Error"}
        )


@app.post("/generate")
async def generate(gen: GenModel):
    # This endpoint overrides the request's system prompt and sampling settings
    # with fixed values.
    gen.system = "You are a helpful medical AI assistant."
    gen.temperature = 0.5
    gen.seed = 42
    try:
        st = time()
        output = llm_generate.create_chat_completion(
            messages=[
                {"role": "system", "content": gen.system},
                {"role": "user", "content": gen.question},
            ],
            temperature=gen.temperature,
            seed=gen.seed,
        )
        # Streaming variant, kept for reference (requires stream=True above):
        # for chunk in output:
        #     delta = chunk["choices"][0]["delta"]
        #     if "role" in delta:
        #         print(delta["role"], end=": ")
        #     elif "content" in delta:
        #         print(delta["content"], end="")
        et = time()
        output["time"] = et - st
        return output
    except Exception as e:
        logger.error(f"Error in /generate endpoint: {e}")
        return JSONResponse(
            status_code=500, content={"message": "Internal Server Error"}
        )
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)