Spaces:

mojaalagevai
/

llmapi2

Paused

App Files Community

mojaalagevai committed on Jun 30, 2025

Commit

7d65968

verified ·

1 Parent(s): c526306

Update app.py

Browse files

Files changed (1) hide show

app.py +101 -73

app.py CHANGED Viewed

@@ -1,110 +1,137 @@
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
-from typing import List, Tuple, Optional
-import os
 from llama_cpp import Llama
-from llama_cpp_agent import LlamaCppAgent
-from llama_cpp_agent import MessagesFormatterType
 from llama_cpp_agent.providers import LlamaCppPythonProvider
 from llama_cpp_agent.chat_history import BasicChatHistory
 from llama_cpp_agent.chat_history.messages import Roles
-from huggingface_hub import hf_hub_download
-import logging
-import sys
-from logger import logging
-from exception import CustomExceptionHandling
-app = FastAPI(
-    title="Dolphin Llama.cpp API",
-    description="API for interacting with Dolphin3.0 models using Llama.cpp",
-    version="1.0.0"
-)
-# Download gguf model files
-if not os.path.exists("./models"):
-    os.makedirs("./models")
-hf_hub_download(
-    repo_id="bartowski/Dolphin3.0-Llama3.2-1B-GGUF",
-    filename="Dolphin3.0-Llama3.2-1B-Q4_K_M.gguf",
-    local_dir="./models",
-)
-hf_hub_download(
-    repo_id="bartowski/Dolphin3.0-Qwen2.5-0.5B-GGUF",
-    filename="Dolphin3.0-Qwen2.5-0.5B-Q6_K.gguf",
-    local_dir="./models",
-)
-hf_hub_download(
-    repo_id="bartowski/Qwen2.5-Coder-14B-Instruct-GGUF",
-    filename="Qwen2.5-Coder-14B-Instruct-Q6_K.gguf",
-    local_dir="./models",
-)
 llm = None
 llm_model = None
 class ChatRequest(BaseModel):
     message: str
     history: List[Tuple[str, str]] = []
-    model: str = "Dolphin3.0-Qwen2.5-0.5B-Q6_K.gguf"
-    system_message: str = "You are Dolphin, a helpful AI assistant focused on accurate and ethical responses."
     max_tokens: int = 1024
     temperature: float = 0.7
     top_p: float = 0.95
     top_k: int = 40
     repeat_penalty: float = 1.1
 class ChatResponse(BaseModel):
     response: str
-def initialize_llm(model: str):
-    global llm, llm_model
-    try:
-        model_path = f"models/{model}"
-        if not os.path.exists(model_path):
-            raise HTTPException(status_code=400, detail=f"Model file not found at {model_path}")
-        if llm is None or llm_model != model:
-            llm = Llama(
-                model_path=model_path,
-                flash_attn=False,
-                n_gpu_layers=0,
-                n_batch=8,
-                n_ctx=2048,
-                n_threads=8,
-                n_threads_batch=8,
-            )
-            llm_model = model
-        return llm
-    except Exception as e:
-        raise CustomExceptionHandling(e, sys) from e
 @app.post("/chat", response_model=ChatResponse)
-async def chat(request: ChatRequest):
     try:
-        # Initialize LLM
-        llm = initialize_llm(request.model)
         provider = LlamaCppPythonProvider(llm)
-        # Create agent
         agent = LlamaCppAgent(
             provider,
-            system_prompt=request.system_message,
             predefined_messages_formatter_type=MessagesFormatterType.CHATML,
-            debug_output=True,
         )
-        # Set sampling settings
         settings = provider.get_provider_default_settings()
         settings.temperature = request.temperature
         settings.top_k = request.top_k
         settings.top_p = request.top_p
         settings.max_tokens = request.max_tokens
         settings.repeat_penalty = request.repeat_penalty
-        settings.stream = False
-        # Build chat history
         messages = BasicChatHistory()
         for user_msg, assistant_msg in request.history:
             messages.add_message({"role": Roles.user, "content": user_msg})
             messages.add_message({"role": Roles.assistant, "content": assistant_msg})
@@ -117,15 +144,16 @@ async def chat(request: ChatRequest):
             print_output=False,
         )
-        logging.info("Response generated successfully")
-        return ChatResponse(response=response)
     except Exception as e:
-        raise CustomExceptionHandling(e, sys) from e
-@app.get("/health")
-async def health_check():
-    return {"status": "healthy"}
 if __name__ == "__main__":
     import uvicorn

+import os
+import sys
+from typing import List, Tuple, Optional
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
+from huggingface_hub import hf_hub_download
 from llama_cpp import Llama
+from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
 from llama_cpp_agent.providers import LlamaCppPythonProvider
 from llama_cpp_agent.chat_history import BasicChatHistory
 from llama_cpp_agent.chat_history.messages import Roles
+# Suppress warnings
+import warnings
+warnings.filterwarnings("ignore")
+# Ensure models directory exists
+MODEL_DIR = "./models"
+os.makedirs(MODEL_DIR, exist_ok=True)
+# Model info for download
+MODELS_INFO = [
+    {
+        "repo_id": "bartowski/Dolphin3.0-Llama3.2-1B-GGUF",
+        "filename": "Dolphin3.0-Llama3.2-1B-Q4_K_M.gguf"
+    },
+    {
+        "repo_id": "bartowski/Dolphin3.0-Qwen2.5-0.5B-GGUF",
+        "filename": "Dolphin3.0-Qwen2.5-0.5B-Q6_K.gguf"
+    },
+    {
+        "repo_id": "bartowski/Qwen2.5-Coder-14B-Instruct-GGUF",
+        "filename": "Qwen2.5-Coder-14B-Instruct-Q6_K.gguf"
+    }
+]
+# Download all models if not present
+for model_info in MODELS_INFO:
+    model_path = os.path.join(MODEL_DIR, model_info["filename"])
+    if not os.path.exists(model_path):
+        print(f"Downloading {model_info['filename']} from {model_info['repo_id']}...")
+        try:
+            hf_hub_download(
+                repo_id=model_info["repo_id"],
+                filename=model_info["filename"],
+                local_dir=MODEL_DIR
+            )
+            print(f"Downloaded {model_info['filename']}")
+        except Exception as e:
+            print(f"Error downloading {model_info['filename']}: {e}")
+# Available model keys
+AVAILABLE_MODELS = {
+    "llama": "Dolphin3.0-Llama3.2-1B-Q4_K_M.gguf",
+    "qwen": "Dolphin3.0-Qwen2.5-0.5B-Q6_K.gguf",
+    "coder": "Qwen2.5-Coder-14B-Instruct-Q6_K.gguf"
+}
+# Global LLM instance
 llm = None
 llm_model = None
+def load_model(model_key: str):
+    global llm, llm_model
+    model_name = AVAILABLE_MODELS.get(model_key)
+    if not model_name:
+        raise ValueError(f"Invalid model key: {model_key}")
+    model_path = os.path.join(MODEL_DIR, model_name)
+    if not os.path.exists(model_path):
+        raise FileNotFoundError(f"Model file not found at {model_path}")
+    if llm is None or llm_model != model_name:
+        llm = Llama(
+            model_path=model_path,
+            flash_attn=False,
+            n_gpu_layers=0,
+            n_batch=8,
+            n_ctx=2048,
+            n_threads=8,
+            n_threads_batch=8,
+        )
+        llm_model = model_name
+    return llm
 class ChatRequest(BaseModel):
     message: str
     history: List[Tuple[str, str]] = []
+    model: str = "qwen"
+    system_prompt: str = "You are Dolphin, a helpful AI assistant."
     max_tokens: int = 1024
     temperature: float = 0.7
     top_p: float = 0.95
     top_k: int = 40
     repeat_penalty: float = 1.1
 class ChatResponse(BaseModel):
     response: str
+app = FastAPI(
+    title="Dolphin 3.0 LLM API",
+    description="REST API for Dolphin 3.0 models using Llama.cpp backend.",
+    version="1.0"
+)
 @app.post("/chat", response_model=ChatResponse)
+def chat(request: ChatRequest):
     try:
+        # Load model
+        load_model(request.model)
         provider = LlamaCppPythonProvider(llm)
         agent = LlamaCppAgent(
             provider,
+            system_prompt=request.system_prompt,
             predefined_messages_formatter_type=MessagesFormatterType.CHATML,
         )
         settings = provider.get_provider_default_settings()
         settings.temperature = request.temperature
         settings.top_k = request.top_k
         settings.top_p = request.top_p
         settings.max_tokens = request.max_tokens
         settings.repeat_penalty = request.repeat_penalty
         messages = BasicChatHistory()
+        # Add history
         for user_msg, assistant_msg in request.history:
             messages.add_message({"role": Roles.user, "content": user_msg})
             messages.add_message({"role": Roles.assistant, "content": assistant_msg})
             print_output=False,
         )
+        return {"response": response}
     except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@app.get("/")
+def read_root():
+    return {"message": "Welcome to Dolphin 3.0 FastAPI LLM Server!"}
 if __name__ == "__main__":
     import uvicorn