from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from llama_index.llms.ollama import Ollama
from llama_index.core.llms import ChatMessage
import time

app = FastAPI()


class Generate(BaseModel):
    response: str
    duration: float


chat_history = []
model = Ollama(model="phi2", base_url="http://localhost:11434")


def generate_text(model: Ollama, prompt: str) -> dict:
    if prompt == "":
        return {"response": "Please provide a prompt.", "duration": 0.0}

    # Append the user turn and rebuild the full message list so the model
    # sees the entire conversation on every call.
    chat_history.append({"role": "user", "content": prompt})
    messages = [
        ChatMessage(role=msg["role"], content=msg["content"]) for msg in chat_history
    ]

    start_time = time.time()
    response_gen = model.stream_chat(messages)
    full_response = ""
    try:
        # Accumulate the streamed deltas into the complete response.
        for response_chunk in response_gen:
            full_response += response_chunk.delta
    except Exception as e:
        return {"response": f"Error: {str(e)}", "duration": 0.0}

    duration = time.time() - start_time
    chat_history.append({"role": "assistant", "content": full_response})
    return {"response": full_response, "duration": round(duration, 2)}


@app.get("/")
async def root():
    return {"message": "Hello World"}


@app.get("/health")
async def health_check():
    # Ping the local Ollama server to confirm it is reachable.
    try:
        import requests

        response = requests.get("http://localhost:11434/api/version")
        ollama_status = "OK" if response.status_code == 200 else "Not available"
    except Exception:
        ollama_status = "Error"
    return {
        "status": "healthy",
        "ollama_status": ollama_status,
        "models_loaded": model is not None,
    }


@app.post(
    "/api/generate",
    summary="Generate text from prompt",
    tags=["Generate"],
    response_model=Generate,
)
def inference(input_prompt: str):
    return generate_text(model, input_prompt)
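
To exercise the endpoints, here is a minimal client sketch. It assumes the app is saved as main.py and served with `uvicorn main:app --port 8000`, and that Ollama is already running locally with the phi2 model pulled; the file name, host, and port are assumptions, not part of the code above. Note that `inference` declares `input_prompt` as a plain string, so FastAPI reads it as a query parameter rather than a JSON body.

# Minimal client sketch (assumed setup: `uvicorn main:app --port 8000`).
import requests

BASE_URL = "http://localhost:8000"  # assumed uvicorn host/port

# Check that the API and the local Ollama server are both reachable.
print(requests.get(f"{BASE_URL}/health").json())

# /api/generate takes `input_prompt` as a query parameter, so pass it
# via `params` rather than a JSON body.
resp = requests.post(
    f"{BASE_URL}/api/generate",
    params={"input_prompt": "Explain what FastAPI is in one sentence."},
)
data = resp.json()
print(data["response"])
print(f"Generated in {data['duration']} seconds")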