Update app.py
app.py
CHANGED
@@ -1,143 +1,273 @@
- from fastapi import FastAPI, HTTPException, …
- from pydantic import BaseModel, …
- from transformers import AutoTokenizer, AutoModelForMaskedLM
- import …
  import logging

  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

  app = FastAPI(
-     title="…",
-     description="…",
-     version="1.0.0"
  )

- # --- NeuroBERT-Tiny Model Configuration ---
- # Using boltuix/NeuroBERT-Tiny for Masked Language Modeling.
- MODEL_NAME = "boltuix/NeuroBERT-Tiny"
- # ----------------------------------------
-
- # Load model globally to avoid reloading on each request
- # This block runs once when the FastAPI application starts.
- try:
-     logger.info(f"Loading tokenizer and model for {MODEL_NAME}...")
-     tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
-     model = AutoModelForMaskedLM.from_pretrained(MODEL_NAME)
-     model.eval()  # Set model to evaluation mode
-     logger.info("Model loaded successfully.")
- except Exception as e:
-     logger.exception(f"Failed to load model or tokenizer for {MODEL_NAME} during startup!")
-     raise RuntimeError(f"Could not load model: {e}")
-
- class InferenceRequest(BaseModel):
-     """
-     Request model for the /predict endpoint.
-     Expects a single string field 'text' containing the sentence with [MASK] tokens.
-     """
-     text: str
-
- class PredictionResult(BaseModel):
-     """
-     Response model for individual predictions from the /predict endpoint.
-     """
-     sequence: str   # The full sequence with the predicted token filled in
-     score: float    # Confidence score of the prediction
-     token: int      # The ID of the predicted token
-     token_str: str  # The string representation of the predicted token
-
- @api_router.post(
-     "/predict",  # Prediction endpoint
-     response_model=list[PredictionResult],
-     summary="Predicts masked tokens in a given text using NeuroBERT-Tiny",
-     description="Accepts a text string with '[MASK]' tokens and returns top 5 predictions for each masked position."
- )
- async def predict_masked_lm(request: InferenceRequest):
-     """
-     Predicts the most likely tokens for [MASK] positions in the input text using the NeuroBERT-Tiny model.
-     Returns a list of top 5 predictions for each masked token, including the full sequence, score, and token details.
-     """
-     try:
-         text = request.text
-         logger.info(f"Received prediction request for text: '{text}'")
-         …
-         raise  # Re-raise custom HTTPExceptions
      except Exception as e:
-         logger.…
          raise HTTPException(
-             status_code=…,
-             detail=…
          )

- @…
-     …
- )
-     …

- app.…

- @app.…
- async def …
-     …

  if __name__ == "__main__":
      import uvicorn
-     …
+ from fastapi import FastAPI, HTTPException, BackgroundTasks
+ from pydantic import BaseModel, Field
+ from typing import List, Optional, Dict, Any
+ import httpx
+ import asyncio
  import logging
+ import time
+ import json

+ # Configure logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ # FastAPI app
  app = FastAPI(
+     title="Ollama API Server",
+     description="REST API for running Ollama models",
+     version="1.0.0",
+     docs_url="/docs",
+     redoc_url="/redoc"
  )

+ # Ollama server configuration
+ OLLAMA_BASE_URL = "http://localhost:11434"
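The Ollama host is hardcoded here; if the daemon runs elsewhere (for example in a container), a common pattern is to read the base URL from the environment instead. A minimal sketch, assuming an `OLLAMA_BASE_URL` environment variable that is not part of this commit:

```python
import os

# Hypothetical override: fall back to the local default when the env var is unset.
OLLAMA_BASE_URL = os.environ.get("OLLAMA_BASE_URL", "http://localhost:11434")
```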

+ # Pydantic models
+ class ChatMessage(BaseModel):
+     role: str = Field(..., description="Role of the message sender (user, assistant, system)")
+     content: str = Field(..., description="Content of the message")

+ class ChatRequest(BaseModel):
+     model: str = Field(..., description="Model name to use for chat")
+     messages: List[ChatMessage] = Field(..., description="List of chat messages")
+     temperature: Optional[float] = Field(0.7, ge=0.0, le=2.0, description="Sampling temperature")
+     top_p: Optional[float] = Field(0.9, ge=0.0, le=1.0, description="Top-p sampling parameter")
+     max_tokens: Optional[int] = Field(512, ge=1, le=4096, description="Maximum tokens to generate")
+     stream: Optional[bool] = Field(False, description="Whether to stream the response")

+ class GenerateRequest(BaseModel):
+     model: str = Field(..., description="Model name to use for generation")
+     prompt: str = Field(..., description="Input prompt for text generation")
+     temperature: Optional[float] = Field(0.7, ge=0.0, le=2.0, description="Sampling temperature")
+     top_p: Optional[float] = Field(0.9, ge=0.0, le=1.0, description="Top-p sampling parameter")
+     max_tokens: Optional[int] = Field(512, ge=1, le=4096, description="Maximum tokens to generate")
+     stream: Optional[bool] = Field(False, description="Whether to stream the response")

+ class ModelPullRequest(BaseModel):
+     model: str = Field(..., description="Model name to pull (e.g., 'llama2:7b')")

+ class ChatResponse(BaseModel):
+     model: str
+     response: str
+     done: bool
+     total_duration: Optional[int] = None
+     load_duration: Optional[int] = None
+     prompt_eval_count: Optional[int] = None
+     eval_count: Optional[int] = None

+ class GenerateResponse(BaseModel):
+     model: str
+     response: str
+     done: bool
+     total_duration: Optional[int] = None
+     load_duration: Optional[int] = None
+     prompt_eval_count: Optional[int] = None
+     eval_count: Optional[int] = None
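The `Field` constraints above mean FastAPI rejects out-of-range parameters with a 422 before a handler ever runs. A small illustration, reusing the models defined above (values are illustrative only):

```python
from pydantic import ValidationError

# Validates: role/content present, temperature within [0.0, 2.0].
ok = ChatRequest(
    model="llama2:7b",
    messages=[ChatMessage(role="user", content="Hello!")],
    temperature=0.7,
)

# Raises ValidationError: temperature is constrained to le=2.0.
try:
    ChatRequest(model="llama2:7b", messages=[], temperature=3.0)
except ValidationError as e:
    print(e)
```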

+ # HTTP client for Ollama API
+ async def get_ollama_client():
+     return httpx.AsyncClient(timeout=300.0)  # 5 minute timeout

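Because `get_ollama_client` is declared `async`, every call site below must first `await` it and then enter the context manager (`async with await get_ollama_client() as client`). Constructing an `httpx.AsyncClient` performs no I/O, so a plain function would permit the more idiomatic `async with get_ollama_client() as client`. A sketch of that possible simplification (the `_sync` name is hypothetical, not in the commit):

```python
# Non-coroutine helper; behavior is otherwise identical.
def get_ollama_client_sync() -> httpx.AsyncClient:
    return httpx.AsyncClient(timeout=300.0)  # 5 minute timeout
```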
+
@app.get("/health")
|
| 73 |
+
async def health_check():
|
| 74 |
+
"""Health check endpoint"""
|
| 75 |
+
try:
|
| 76 |
+
async with await get_ollama_client() as client:
|
| 77 |
+
response = await client.get(f"{OLLAMA_BASE_URL}/api/version")
|
| 78 |
+
if response.status_code == 200:
|
| 79 |
+
return {
|
| 80 |
+
"status": "healthy",
|
| 81 |
+
"ollama_status": "running",
|
| 82 |
+
"ollama_version": response.json(),
|
| 83 |
+
"timestamp": time.time()
|
| 84 |
+
}
|
| 85 |
+
else:
|
| 86 |
+
return {
|
| 87 |
+
"status": "degraded",
|
| 88 |
+
"ollama_status": "error",
|
| 89 |
+
"error": f"Ollama returned status {response.status_code}",
|
| 90 |
+
"timestamp": time.time()
|
| 91 |
+
}
|
|
|
|
| 92 |
except Exception as e:
|
| 93 |
+
logger.error(f"Health check failed: {e}")
|
| 94 |
+
return {
|
| 95 |
+
"status": "unhealthy",
|
| 96 |
+
"ollama_status": "unreachable",
|
| 97 |
+
"error": str(e),
|
| 98 |
+
"timestamp": time.time()
|
| 99 |
+
}
|
| 100 |
+
|
+ @app.get("/models")
+ async def list_models():
+     """List available models"""
+     try:
+         async with await get_ollama_client() as client:
+             response = await client.get(f"{OLLAMA_BASE_URL}/api/tags")
+             response.raise_for_status()
+             return response.json()
+     except httpx.HTTPError as e:
+         logger.error(f"Failed to list models: {e}")
+         raise HTTPException(status_code=500, detail=f"Failed to list models: {str(e)}")

+ @app.post("/models/pull")
+ async def pull_model(request: ModelPullRequest, background_tasks: BackgroundTasks):
+     """Pull a model from Ollama registry"""
+     try:
+         async with await get_ollama_client() as client:
+             # Start the pull request (awaited inline, so this call blocks until
+             # the download finishes; background_tasks is accepted but unused)
+             pull_data = {"name": request.model}
+             response = await client.post(
+                 f"{OLLAMA_BASE_URL}/api/pull",
+                 json=pull_data,
+                 timeout=1800.0  # 30 minute timeout for model pulling
+             )
+
+             if response.status_code == 200:
+                 return {
+                     "status": "success",
+                     "message": f"Model '{request.model}' pulled successfully",
+                     "model": request.model
+                 }
+             else:
+                 error_detail = response.text
+                 logger.error(f"Failed to pull model: {error_detail}")
+                 raise HTTPException(
+                     status_code=response.status_code,
+                     detail=f"Failed to pull model: {error_detail}"
+                 )
+     except httpx.TimeoutException:
          raise HTTPException(
+             status_code=408,
+             detail="Model pull request timed out. Large models may take longer to download."
          )
+     except HTTPException:
+         raise  # re-raise the non-200 HTTPException above instead of converting it to a 500
+     except Exception as e:
+         logger.error(f"Error pulling model: {e}")
+         raise HTTPException(status_code=500, detail=f"Error pulling model: {str(e)}")

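Although the handler accepts `BackgroundTasks`, the pull itself is awaited inline, so the HTTP response only arrives after the download completes (hence the 30-minute timeout). A hedged sketch of a variant that actually defers the work, under a hypothetical `/models/pull-async` route that is not part of this commit:

```python
async def _do_pull(model: str) -> None:
    # Runs after the response is sent; failures surface only in the logs.
    try:
        async with httpx.AsyncClient(timeout=1800.0) as client:
            await client.post(f"{OLLAMA_BASE_URL}/api/pull", json={"name": model})
    except Exception as e:
        logger.error(f"Background pull of '{model}' failed: {e}")

@app.post("/models/pull-async")
async def pull_model_async(request: ModelPullRequest, background_tasks: BackgroundTasks):
    background_tasks.add_task(_do_pull, request.model)  # async callables are supported
    return {"status": "accepted", "model": request.model}
```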
+ @app.delete("/models/{model_name}")
+ async def delete_model(model_name: str):
+     """Delete a model"""
+     try:
+         async with await get_ollama_client() as client:
+             # httpx's .delete() takes no request body, so use .request() to send one
+             response = await client.request("DELETE", f"{OLLAMA_BASE_URL}/api/delete", json={"name": model_name})
+             response.raise_for_status()
+             return {"status": "success", "message": f"Model '{model_name}' deleted successfully"}
+     except httpx.HTTPError as e:
+         logger.error(f"Failed to delete model: {e}")
+         raise HTTPException(status_code=500, detail=f"Failed to delete model: {str(e)}")

+ @app.post("/chat", response_model=ChatResponse)
+ async def chat_with_model(request: ChatRequest):
+     """Chat with a model"""
+     try:
+         # Convert messages to Ollama format
+         chat_data = {
+             "model": request.model,
+             "messages": [{"role": msg.role, "content": msg.content} for msg in request.messages],
+             "stream": request.stream,
+             "options": {
+                 "temperature": request.temperature,
+                 "top_p": request.top_p,
+                 "num_predict": request.max_tokens
+             }
+         }
+
+         async with await get_ollama_client() as client:
+             response = await client.post(
+                 f"{OLLAMA_BASE_URL}/api/chat",
+                 json=chat_data,
+                 timeout=300.0
+             )
+             response.raise_for_status()
+             result = response.json()
+
+             return ChatResponse(
+                 model=result.get("model", request.model),
+                 response=result.get("message", {}).get("content", ""),
+                 done=result.get("done", True),
+                 total_duration=result.get("total_duration"),
+                 load_duration=result.get("load_duration"),
+                 prompt_eval_count=result.get("prompt_eval_count"),
+                 eval_count=result.get("eval_count")
+             )
+
+     except httpx.HTTPError as e:
+         logger.error(f"Chat request failed: {e}")
+         # Only httpx.HTTPStatusError carries a .response attribute
+         if isinstance(e, httpx.HTTPStatusError) and e.response.status_code == 404:
+             raise HTTPException(
+                 status_code=404,
+                 detail=f"Model '{request.model}' not found. Try pulling it first with POST /models/pull"
+             )
+         raise HTTPException(status_code=500, detail=f"Chat request failed: {str(e)}")
+     except Exception as e:
+         logger.error(f"Unexpected error in chat: {e}")
+         raise HTTPException(status_code=500, detail=f"Unexpected error: {str(e)}")

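Assuming the server is started as in the `__main__` block below (port 7860), `/chat` — and, analogously, `/generate` — can be exercised with any HTTP client. A minimal httpx sketch; the model tag is illustrative and must already be pulled:

```python
import httpx

payload = {
    "model": "llama2:7b",
    "messages": [{"role": "user", "content": "Why is the sky blue?"}],
    "temperature": 0.7,
}
r = httpx.post("http://localhost:7860/chat", json=payload, timeout=300.0)
r.raise_for_status()
print(r.json()["response"])  # the assistant's reply text
```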
+ @app.post("/generate", response_model=GenerateResponse)
+ async def generate_text(request: GenerateRequest):
+     """Generate text completion"""
+     try:
+         generate_data = {
+             "model": request.model,
+             "prompt": request.prompt,
+             "stream": request.stream,
+             "options": {
+                 "temperature": request.temperature,
+                 "top_p": request.top_p,
+                 "num_predict": request.max_tokens
+             }
+         }
+
+         async with await get_ollama_client() as client:
+             response = await client.post(
+                 f"{OLLAMA_BASE_URL}/api/generate",
+                 json=generate_data,
+                 timeout=300.0
+             )
+             response.raise_for_status()
+             result = response.json()
+
+             return GenerateResponse(
+                 model=result.get("model", request.model),
+                 response=result.get("response", ""),
+                 done=result.get("done", True),
+                 total_duration=result.get("total_duration"),
+                 load_duration=result.get("load_duration"),
+                 prompt_eval_count=result.get("prompt_eval_count"),
+                 eval_count=result.get("eval_count")
+             )
+
+     except httpx.HTTPError as e:
+         logger.error(f"Generate request failed: {e}")
+         # Only httpx.HTTPStatusError carries a .response attribute
+         if isinstance(e, httpx.HTTPStatusError) and e.response.status_code == 404:
+             raise HTTPException(
+                 status_code=404,
+                 detail=f"Model '{request.model}' not found. Try pulling it first with POST /models/pull"
+             )
+         raise HTTPException(status_code=500, detail=f"Generate request failed: {str(e)}")
+     except Exception as e:
+         logger.error(f"Unexpected error in generate: {e}")
+         raise HTTPException(status_code=500, detail=f"Unexpected error: {str(e)}")

+ @app.get("/")
+ async def root():
+     """Root endpoint with API information"""
+     return {
+         "message": "Ollama API Server",
+         "version": "1.0.0",
+         "endpoints": {
+             "health": "/health",
+             "models": "/models",
+             "pull_model": "/models/pull",
+             "chat": "/chat",
+             "generate": "/generate",
+             "docs": "/docs"
+         },
+         "status": "running"
+     }

  if __name__ == "__main__":
      import uvicorn
+     logger.info("Starting Ollama API server...")
+     uvicorn.run(app, host="0.0.0.0", port=7860, log_level="info")
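With the file saved as app.py, `python app.py` starts the server on 0.0.0.0:7860 (equivalently, `uvicorn app:app --host 0.0.0.0 --port 7860`); the interactive documentation is then served at `/docs`, and requests are proxied to the Ollama daemon expected at `http://localhost:11434`.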