likhonhfai committed on
Commit
2aabcef
·
verified ·
1 Parent(s): 078f2ed

Upload folder using huggingface_hub

Browse files
Files changed (3) hide show
  1. Dockerfile +3 -3
  2. app.py +99 -53
  3. requirements.txt +2 -1
Dockerfile CHANGED
@@ -1,4 +1,4 @@
1
- FROM nvidia/cuda:12.1.0-base-ubuntu22.04
2
 
3
  # Set up environment
4
  ENV DEBIAN_FRONTEND=noninteractive
@@ -16,7 +16,7 @@ ENV HOME=/home/user \
16
 
17
  WORKDIR $HOME/app
18
 
19
- # Install dependencies
20
  COPY --chown=user requirements.txt .
21
  RUN pip install --no-cache-dir -r requirements.txt
22
 
@@ -27,4 +27,4 @@ COPY --chown=user . .
27
  EXPOSE 7860
28
 
29
  # Run the application
30
- CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
+ FROM lmsysorg/sglang:latest
2
 
3
  # Set up environment
4
  ENV DEBIAN_FRONTEND=noninteractive
 
16
 
17
  WORKDIR $HOME/app
18
 
19
+ # Install additional dependencies for our proxy app
20
  COPY --chown=user requirements.txt .
21
  RUN pip install --no-cache-dir -r requirements.txt
22
 
 
27
  EXPOSE 7860
28
 
29
  # Run the application
30
+ CMD ["python3", "app.py"]
app.py CHANGED
@@ -1,40 +1,65 @@
1
  import os
2
- import torch
3
- from fastapi import FastAPI, HTTPException
 
 
4
  from pydantic import BaseModel
5
- from typing import List, Optional, Dict, Any
6
- from vllm import LLM, SamplingParams
7
- from PIL import Image
8
- import base64
9
- from io import BytesIO
10
 
11
- app = FastAPI(title="Fara-7B API")
12
 
13
- # Model configuration
14
  MODEL_ID = "microsoft/Fara-7B"
15
- llm = None
 
 
16
 
17
- def get_llm():
18
- global llm
19
- if llm is None:
20
- # Check for GPU availability
21
- if not torch.cuda.is_available():
22
- # For Spaces, we might want to log this or handle it gracefully
23
- print("CUDA is not available. This model requires a GPU.")
 
 
 
 
 
 
 
 
 
 
 
24
 
25
- # vLLM setup
26
- try:
27
- llm = LLM(
28
- model=MODEL_ID,
29
- trust_remote_code=True,
30
- dtype="auto",
31
- max_model_len=4096,
32
- tensor_parallel_size=1
33
- )
34
- except Exception as e:
35
- print(f"Error initializing vLLM: {e}")
36
- raise e
37
- return llm
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  # Request models
40
  class Message(BaseModel):
@@ -53,42 +78,63 @@ class MessageRequest(BaseModel):
53
 
54
  @app.get("/")
55
  async def root():
56
- return {"message": "Fara-7B API is running. Use /v1/responses or /v1/messages"}
57
 
58
  @app.get("/health")
59
  async def health():
60
- return {"status": "healthy"}
 
 
 
 
 
 
61
 
62
  @app.post("/v1/responses")
63
  async def generate_response(request: ResponseRequest):
64
  try:
65
- model = get_llm()
66
- sampling_params = SamplingParams(
67
- temperature=request.temperature,
68
- max_tokens=request.max_tokens
69
- )
70
- outputs = model.generate([request.prompt], sampling_params)
71
- return {"response": outputs[0].outputs[0].text}
 
 
 
 
72
  except Exception as e:
73
  raise HTTPException(status_code=500, detail=str(e))
74
 
75
  @app.post("/v1/messages")
76
  async def generate_message(request: MessageRequest):
77
  try:
78
- model = get_llm()
79
- sampling_params = SamplingParams(
80
- temperature=request.temperature,
81
- max_tokens=request.max_tokens
82
- )
83
-
84
- # Formatting for messages
85
- formatted_prompt = ""
86
- for msg in request.messages:
87
- formatted_prompt += f"<|im_start|>{msg.role}\n{msg.content}<|im_end|>\n"
88
- formatted_prompt += "<|im_start|>assistant\n"
89
-
90
- outputs = model.generate([formatted_prompt], sampling_params)
91
- return {"message": outputs[0].outputs[0].text}
 
 
 
 
 
 
 
 
 
 
 
92
  except Exception as e:
93
  raise HTTPException(status_code=500, detail=str(e))
94
 
 
import os
import subprocess
import time
import requests
from fastapi import FastAPI, HTTPException, Request
from pydantic import BaseModel
from typing import List, Optional, Any
import torch

app = FastAPI(title="Fara-7B SGLang API")

# Configuration
# Hugging Face model repo served by the SGLang backend subprocess.
MODEL_ID = "microsoft/Fara-7B"
# SGLang listens on loopback only; this FastAPI app is the public entry point
# (the container EXPOSEs 7860) and proxies to SGLANG_URL.
SGLANG_PORT = 30000
SGLANG_HOST = "127.0.0.1"
SGLANG_URL = f"http://{SGLANG_HOST}:{SGLANG_PORT}"

# Global process for SGLang server (Popen handle; None until start_sglang runs).
sglang_process = None
21
def start_sglang():
    """Launch the SGLang server as a subprocess and block until it answers.

    Idempotent: returns immediately if the server process was already started.

    Raises:
        RuntimeError: if the subprocess exits early, or the server does not
            become ready within the polling window (60 attempts x 10 s).
    """
    global sglang_process
    if sglang_process is not None:
        return  # already launched

    print(f"Starting SGLang server for {MODEL_ID}...")

    # Command to start SGLang server.
    # Using --chat-template qwen2-vl as Fara-7B is based on Qwen2.5-VL.
    cmd = [
        "python3", "-m", "sglang.launch_server",
        "--model-path", MODEL_ID,
        "--host", SGLANG_HOST,
        "--port", str(SGLANG_PORT),
        "--chat-template", "qwen2-vl",
        "--trust-remote-code",
    ]

    # Shard across all visible GPUs when more than one is available.
    if torch.cuda.device_count() > 1:
        cmd.extend(["--tp", str(torch.cuda.device_count())])

    sglang_process = subprocess.Popen(cmd)

    # Poll the OpenAI-compatible /v1/models endpoint until the server is up.
    max_retries = 60
    for i in range(max_retries):
        # Fail fast instead of waiting out the full timeout if the server
        # process has already died (e.g. OOM or bad model path).
        if sglang_process.poll() is not None:
            raise RuntimeError(
                f"SGLang server exited early with code {sglang_process.returncode}."
            )
        try:
            response = requests.get(f"{SGLANG_URL}/v1/models", timeout=5)
            if response.status_code == 200:
                print("SGLang server is ready!")
                return
        except requests.RequestException:
            # Server not accepting connections yet — keep polling.
            pass
        print(f"Waiting for SGLang server... ({i+1}/{max_retries})")
        time.sleep(10)

    raise RuntimeError("SGLang server failed to start within timeout.")
57
+
58
@app.on_event("startup")
async def startup_event():
    """Launch the SGLang backend in a daemon thread so the API serves at once."""
    # NOTE(review): @app.on_event is deprecated in newer FastAPI in favor of
    # lifespan handlers; kept as-is to avoid changing startup semantics.
    import threading

    launcher = threading.Thread(target=start_sglang, daemon=True)
    launcher.start()
63
 
64
  # Request models
65
  class Message(BaseModel):
 
78
 
79
@app.get("/")
async def root():
    """Landing endpoint: confirms the proxy is up and points at the main routes."""
    return {
        "message": "Fara-7B SGLang API is running. Use /v1/responses or /v1/messages"
    }
82
 
83
@app.get("/health")
async def health():
    """Liveness probe: 'healthy' once the SGLang backend answers, else 'starting'.

    Returns:
        dict: {"status": "healthy"|"starting", "backend": "sglang"}.
    """
    try:
        resp = requests.get(f"{SGLANG_URL}/v1/models", timeout=2)
        if resp.status_code == 200:
            return {"status": "healthy", "backend": "sglang"}
    except requests.RequestException:
        # Backend not reachable yet (model still loading) — fall through.
        pass
    return {"status": "starting", "backend": "sglang"}
92
 
93
@app.post("/v1/responses")
async def generate_response(request: ResponseRequest):
    """Plain completion: forwards the prompt to SGLang's /v1/completions.

    Returns:
        dict: {"response": <generated text>}.

    Raises:
        HTTPException: 500 wrapping any backend or transport failure.
    """
    try:
        # Map /v1/responses to SGLang's completions endpoint.
        payload = {
            "model": MODEL_ID,
            "prompt": request.prompt,
            "max_tokens": request.max_tokens,
            "temperature": request.temperature,
        }
        # Explicit timeout: without one a hung backend would block this
        # request forever. Generation can be slow on cold start, so be generous.
        resp = requests.post(
            f"{SGLANG_URL}/v1/completions", json=payload, timeout=300
        )
        resp.raise_for_status()
        data = resp.json()
        return {"response": data["choices"][0]["text"]}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
109
 
110
@app.post("/v1/messages")
async def generate_message(request: MessageRequest):
    """Chat completion: forwards the message list to SGLang's /v1/chat/completions.

    Returns:
        dict: {"message": <assistant reply text>}.

    Raises:
        HTTPException: 500 wrapping any backend or transport failure.
    """
    try:
        # NOTE(review): .dict() is the pydantic v1 spelling (model_dump() in
        # v2); kept as-is for compatibility with whichever pydantic is pinned.
        payload = {
            "model": MODEL_ID,
            "messages": [m.dict() for m in request.messages],
            "max_tokens": request.max_tokens,
            "temperature": request.temperature,
        }
        # Explicit timeout so a hung backend cannot block this request forever.
        resp = requests.post(
            f"{SGLANG_URL}/v1/chat/completions", json=payload, timeout=300
        )
        resp.raise_for_status()
        data = resp.json()
        return {"message": data["choices"][0]["message"]["content"]}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
126
+
127
# Proxy other OpenAI compatible requests to SGLang if needed
@app.api_route("/v1/{path:path}", methods=["GET", "POST", "PUT", "DELETE"])
async def proxy_openai(path: str, request: Request):
    """Transparent passthrough for any other OpenAI-compatible route to SGLang.

    Relays the backend's body, status code and content type unchanged, so a
    backend error (e.g. 404) is not silently rewrapped as a 200, and non-JSON
    responses no longer crash the proxy.

    Raises:
        HTTPException: 500 when the backend is unreachable or times out.
    """
    # Local import keeps this handler self-contained.
    from fastapi import Response

    url = f"{SGLANG_URL}/v1/{path}"
    method = request.method
    # Drop the Host header so requests sets one matching the backend address.
    headers = {k: v for k, v in request.headers.items() if k.lower() != "host"}
    body = await request.body()

    try:
        resp = requests.request(method, url, headers=headers, data=body, timeout=300)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
    return Response(
        content=resp.content,
        status_code=resp.status_code,
        media_type=resp.headers.get("content-type"),
    )
140
 
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  fastapi
2
  uvicorn
3
- vllm
4
  huggingface_hub
5
  python-multipart
6
  pydantic
@@ -8,3 +8,4 @@ pillow
8
  torch
9
  transformers
10
  accelerate
 
 
1
  fastapi
2
  uvicorn
3
+ sglang
4
  huggingface_hub
5
  python-multipart
6
  pydantic
 
8
  torch
9
  transformers
10
  accelerate
11
+ requests