Ashok75 committed on
Commit
90937b1
·
verified ·
1 Parent(s): 90e69ba

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +607 -38
app.py CHANGED
@@ -1,53 +1,622 @@
 
 
 
 
 
 
 
1
  import torch
2
- from flask import Flask, request, Response, render_template
 
 
 
 
 
3
  from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
4
  from threading import Thread
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- app = Flask(__name__)
 
 
 
 
 
 
7
 
8
- # Load Nanbeige 4.1 3B
9
- model_id = "Nanbeige/Nanbeige4.1-3B"
10
- tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
11
- model = AutoModelForCausalLM.from_pretrained(
12
- model_id,
13
- torch_dtype=torch.bfloat16,
14
- device_map="auto",
15
- trust_remote_code=True
 
 
 
 
 
 
 
16
  )
17
 
18
- @app.route('/chat', methods=['POST'])
19
- def chat():
20
- user_msg = request.json.get("message")
21
-
22
- # System Prompt Construction [14, 32]
23
- prompt = f"<|system|>\nYou are an GAKR AI ASSISTANT. Always think before answering.dont think heavely and answer directly if you know the answer and if you want any latest content or anything call the web_search tool to get the content like latest data and web data and all\n<|user|>\n{user_msg}\n<|assistant|>\n<thought>"
24
-
25
- inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
26
- streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
27
-
28
- generation_kwargs = dict(
29
- **inputs,
30
- streamer=streamer,
31
- max_new_tokens=1024,
32
- do_sample=True,
33
- temperature=0.7,
34
- pad_token_id=tokenizer.eos_token_id
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  )
36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
38
  thread.start()
39
 
40
- def stream():
41
- # Start with the tag we forced in the prompt
42
- yield "<thought>"
43
- for new_text in streamer:
44
- yield new_text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
- return Response(stream(), mimetype='text/plain')
 
 
 
 
 
 
 
 
 
 
 
47
 
48
- @app.route('/')
49
- def index():
50
- return render_template('index.html')
51
 
52
- if __name__ == '__main__':
53
- app.run(host='0.0.0.0', port=7860)
 
 
 
1
+ """
2
+ Nanbeige4.1-3B Inference Server for Hugging Face Space
3
+ Lightweight API server exposing /chat endpoint for remote agent communication
4
+ """
5
+
6
import asyncio
import json
import os
from contextlib import asynccontextmanager
from threading import Thread
from typing import AsyncGenerator, Dict, Generator, List, Optional

import torch
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse, StreamingResponse
from pydantic import BaseModel, Field
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
18
+
19
# Global model instances — populated once by load_model() during the FastAPI
# lifespan startup; every request handler reads these.
model = None
tokenizer = None

# Model configuration
MODEL_ID = "Nanbeige/Nanbeige4.1-3B"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Generation defaults; per-request values in ChatRequest override them.
DEFAULT_MAX_TOKENS = 2048
DEFAULT_TEMPERATURE = 0.6
DEFAULT_TOP_P = 0.95
29
+
30
+
31
class ChatMessage(BaseModel):
    """One message of the conversation history (OpenAI-style schema)."""
    # format_messages_for_model recognizes system/user/assistant/tool;
    # other role values are silently dropped there.
    role: str = Field(..., description="Message role: system, user, assistant, or tool")
    content: str = Field(..., description="Message content")
    # Assistant tool calls; serialized into the prompt as a <tool_calls> block.
    tool_calls: Optional[List[Dict]] = Field(None, description="Tool calls from assistant")
    # Correlates a tool-role message with the assistant tool call it answers.
    tool_call_id: Optional[str] = Field(None, description="Tool call ID for tool responses")
36
+
37
+
38
class ChatRequest(BaseModel):
    """Request body for POST /chat (OpenAI chat-completions style)."""
    messages: List[ChatMessage] = Field(..., description="Conversation history")
    tools: Optional[List[Dict]] = Field(None, description="Available tools for function calling")
    # When true the endpoint answers with Server-Sent Events chunks.
    stream: bool = Field(default=False, description="Enable streaming response")
    max_tokens: int = Field(default=DEFAULT_MAX_TOKENS, ge=1, le=8192)
    temperature: float = Field(default=DEFAULT_TEMPERATURE, ge=0.0, le=2.0)
    top_p: float = Field(default=DEFAULT_TOP_P, ge=0.0, le=1.0)
    stop: Optional[List[str]] = Field(default=None, description="Stop sequences")
46
+
47
+
48
class ChatResponse(BaseModel):
    """Non-streaming response body, mirroring OpenAI's chat.completion shape."""
    id: str
    object: str = "chat.completion"
    # Unix timestamp in seconds.
    created: int
    model: str
    choices: List[Dict]
    # prompt/completion/total token counts; omitted for streaming responses.
    usage: Optional[Dict] = None
55
+
56
+
57
def load_model():
    """Load Nanbeige4.1-3B model and tokenizer into the module globals.

    Invoked exactly once from the FastAPI lifespan hook before the server
    accepts requests. Uses fp16 + accelerate device placement on CUDA and
    fp32 on CPU.
    """
    global model, tokenizer

    print(f"Loading {MODEL_ID} on {DEVICE}...")

    # Left padding so the generated continuation follows the prompt directly
    # when inputs are padded.
    tokenizer = AutoTokenizer.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        padding_side="left"
    )

    # Set pad token if not present
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
        device_map="auto" if DEVICE == "cuda" else None,
        trust_remote_code=True,
        low_cpu_mem_usage=True
    )

    # device_map is None on CPU, so move the weights explicitly.
    if DEVICE == "cpu":
        model = model.to(DEVICE)

    model.eval()
    print(f"Model loaded successfully on {DEVICE}")
86
+
87
 
88
@asyncontextmanager if False else asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan manager.

    Blocks server startup until the model is fully loaded, so /chat never
    sees a half-initialized model.
    """
    # Startup
    load_model()
    yield
    # Shutdown - cleanup happens automatically
95
 
96
+
97
app = FastAPI(
    title="Nanbeige4.1-3B Inference API",
    description="Remote LLM inference service for Enterprise ReAct Agent",
    version="1.0.0",
    lifespan=lifespan
)

# CORS for local agent communication
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive — tighten allow_origins before exposing this publicly.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # Configure for production
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
112
 
113
+
114
def format_messages_for_model(messages: List[ChatMessage], tools: Optional[List[Dict]] = None) -> str:
    """Build the model prompt from API messages via the tokenizer's chat template.

    Assistant tool calls are serialized into the message content as a
    ``<tool_calls>`` block; if *tools* are supplied, their JSON description is
    appended to (or inserted as) the system message. Messages with an
    unrecognized role are dropped.
    """
    chat: List[Dict] = []

    for message in messages:
        role = message.role
        if role in ("system", "user"):
            chat.append({"role": role, "content": message.content})
        elif role == "assistant":
            body = message.content
            if message.tool_calls:
                # Inline the tool calls so the model sees its own prior calls.
                body = f"{body}\n<tool_calls>{json.dumps(message.tool_calls)}</tool_calls>"
            chat.append({"role": "assistant", "content": body})
        elif role == "tool":
            chat.append({
                "role": "tool",
                "content": message.content,
                "tool_call_id": message.tool_call_id,
            })

    # Advertise available tools through the system message.
    if tools:
        tool_text = "\n\nAvailable tools:\n" + json.dumps(tools, indent=2)
        if chat and chat[0]["role"] == "system":
            chat[0]["content"] += tool_text
        else:
            chat.insert(0, {"role": "system", "content": tool_text})

    return tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True
    )
153
+
154
+
155
def parse_tool_calls(response_text: str) -> tuple[str, Optional[List[Dict]]]:
    """Split a model response into plain content and parsed tool calls.

    Looks for a ``<tool_calls>...</tool_calls>`` block, JSON-decodes its
    body, and returns ``(content_before_block, tool_calls)``. On any parse
    failure the full text is returned unchanged with ``tool_calls=None``.

    Fixes over the original: the closing tag is searched *after* the opening
    tag (a stray ``</tool_calls>`` earlier in the text no longer produces a
    garbage slice), and a single JSON object is normalized to a one-element
    list so the declared ``List[Dict]`` return type actually holds.
    """
    content = response_text
    tool_calls: Optional[List[Dict]] = None

    open_tag, close_tag = "<tool_calls>", "</tool_calls>"
    start = response_text.find(open_tag)
    if start != -1:
        end = response_text.find(close_tag, start + len(open_tag))
        if end != -1:
            try:
                parsed = json.loads(response_text[start + len(open_tag):end])
            except (json.JSONDecodeError, ValueError):
                parsed = None
            if isinstance(parsed, dict):
                # Single call emitted as an object — normalize to a list.
                parsed = [parsed]
            if isinstance(parsed, list):
                tool_calls = parsed
                content = response_text[:start].strip()

    return content, tool_calls
172
+
173
+
174
def generate_stream(
    prompt: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    stop: Optional[List[str]]
) -> Generator[str, None, None]:
    """Yield decoded text chunks for *prompt* as generation progresses.

    ``model.generate`` runs in a background thread and feeds a
    ``TextIteratorStreamer`` that this generator drains.

    Fixes over the original:
    - This is a *synchronous* generator (no ``await`` anywhere), so it is
      now annotated ``Generator`` instead of ``AsyncGenerator``.
    - Stop-sequence handling previously truncated a local variable but kept
      yielding the stop text and everything after it; now output is cut at
      the first stop match and nothing beyond it is yielded (the streamer is
      still drained so the worker thread can finish).
    """
    inputs = tokenizer(prompt, return_tensors="pt", padding=True)
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True
    )

    generation_kwargs = {
        "input_ids": inputs["input_ids"],
        "attention_mask": inputs["attention_mask"],
        "max_new_tokens": max_tokens,
        "temperature": temperature,
        "top_p": top_p,
        "do_sample": temperature > 0,
        "streamer": streamer,
        "pad_token_id": tokenizer.pad_token_id,
        "eos_token_id": tokenizer.eos_token_id,
    }

    if stop:
        # Also halt the underlying generate() call server-side.
        generation_kwargs["stopping_criteria"] = create_stopping_criteria(stop)

    # Run generation in a separate thread; the streamer bridges the output.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    generated_text = ""
    stopped = False
    for new_text in streamer:
        if stopped:
            # Keep draining so the generation thread is never blocked.
            continue

        prev_len = len(generated_text)
        generated_text += new_text

        if stop:
            hits = [generated_text.find(s) for s in stop]
            hits = [h for h in hits if h != -1]
            if hits:
                cut = min(hits)
                # Yield only the part of this chunk that precedes the stop
                # match. (If the match began in an already-yielded chunk,
                # that prefix cannot be recalled.)
                if cut > prev_len:
                    yield generated_text[prev_len:cut]
                stopped = True
                continue

        yield new_text

    thread.join()
223
+
224
+
225
def create_stopping_criteria(stop_sequences: List[str]):
    """Build a StoppingCriteriaList that halts generation on any stop string.

    The original decoded the *entire* sequence — prompt included — on every
    step, so a stop string occurring in the prompt stopped generation
    immediately, and each check cost O(sequence length). This version
    records the sequence length at the first invocation and only decodes
    tokens generated after that point.
    """
    from transformers import StoppingCriteria, StoppingCriteriaList

    class StopSequenceCriteria(StoppingCriteria):
        def __init__(self, stops, tokenizer):
            self.stops = stops
            self.tokenizer = tokenizer
            self.start_len = None  # sequence length when first called

        def __call__(self, input_ids, scores, **kwargs):
            if self.start_len is None:
                # First call happens after the first new token, so everything
                # before it is prompt. NOTE(review): assumes this criteria
                # object is used for a single generate() call — a fresh one
                # is created per request by the callers in this file.
                self.start_len = max(input_ids.shape[1] - 1, 0)
            generated = self.tokenizer.decode(
                input_ids[0, self.start_len:], skip_special_tokens=True
            )
            return any(stop in generated for stop in self.stops)

    return StoppingCriteriaList([StopSequenceCriteria(stop_sequences, tokenizer)])
242
+
243
+
244
def generate_non_stream(
    prompt: str,
    max_tokens: int,
    temperature: float,
    top_p: float,
    stop: Optional[List[str]]
) -> str:
    """Run one blocking generation pass and return the decoded completion.

    The prompt tokens are sliced off before decoding; if stop sequences are
    given, the completion is truncated at the first listed sequence found.
    """
    encoded = tokenizer(prompt, return_tensors="pt", padding=True)
    encoded = {name: tensor.to(model.device) for name, tensor in encoded.items()}

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            max_new_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=temperature > 0,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tail, not the echoed prompt.
    prompt_len = encoded["input_ids"].shape[1]
    completion = tokenizer.decode(output_ids[0][prompt_len:], skip_special_tokens=True)

    # Cut at the first stop sequence (list order) that appears.
    for marker in (stop or []):
        if marker in completion:
            completion = completion[:completion.find(marker)]
            break

    return completion
277
+
278
+
279
@app.post("/chat", response_model=ChatResponse)
async def chat_completion(request: ChatRequest):
    """
    Main chat completion endpoint.
    Compatible with OpenAI-style API for easy integration.

    With ``stream=true`` the reply is served as Server-Sent Events
    (``chat.completion.chunk`` deltas followed by ``data: [DONE]``);
    otherwise a single ChatResponse JSON document is returned.
    """
    import time

    prompt = format_messages_for_model(request.messages, request.tools)

    if request.stream:
        # generate_stream is a *synchronous* generator (generation runs in a
        # worker thread), so this wrapper must be a plain generator as well.
        # The original used `async for` over it, which raises TypeError on
        # every streaming request. StreamingResponse iterates synchronous
        # generators in a threadpool, so the event loop is not blocked.
        def stream_response():
            for chunk in generate_stream(
                prompt,
                request.max_tokens,
                request.temperature,
                request.top_p,
                request.stop
            ):
                data = {
                    "id": f"chatcmpl-{int(time.time())}",
                    "object": "chat.completion.chunk",
                    "created": int(time.time()),
                    "model": MODEL_ID,
                    "choices": [{
                        "index": 0,
                        "delta": {"content": chunk},
                        "finish_reason": None
                    }]
                }
                yield f"data: {json.dumps(data)}\n\n"

            # Final chunk: empty delta carrying finish_reason, then the SSE
            # terminator expected by OpenAI-compatible clients.
            final_data = {
                "id": f"chatcmpl-{int(time.time())}",
                "object": "chat.completion.chunk",
                "created": int(time.time()),
                "model": MODEL_ID,
                "choices": [{
                    "index": 0,
                    "delta": {},
                    "finish_reason": "stop"
                }]
            }
            yield f"data: {json.dumps(final_data)}\n\n"
            yield "data: [DONE]\n\n"

        return StreamingResponse(
            stream_response(),
            media_type="text/event-stream",
            headers={
                "Cache-Control": "no-cache",
                "Connection": "keep-alive",
                "X-Accel-Buffering": "no"  # disable proxy buffering for SSE
            }
        )

    else:
        # NOTE(review): this blocking generate runs on the event loop thread;
        # if concurrent generate() calls are safe for this model, prefer
        # `await asyncio.to_thread(generate_non_stream, ...)`.
        generated = generate_non_stream(
            prompt,
            request.max_tokens,
            request.temperature,
            request.top_p,
            request.stop
        )

        content, tool_calls = parse_tool_calls(generated)

        # Token accounting for the OpenAI-style usage block.
        input_tokens = len(tokenizer.encode(prompt))
        output_tokens = len(tokenizer.encode(generated))

        response = ChatResponse(
            id=f"chatcmpl-{int(time.time())}",
            object="chat.completion",
            created=int(time.time()),
            model=MODEL_ID,
            choices=[{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": content,
                    "tool_calls": tool_calls
                },
                "finish_reason": "stop"
            }],
            usage={
                "prompt_tokens": input_tokens,
                "completion_tokens": output_tokens,
                "total_tokens": input_tokens + output_tokens
            }
        )

        return response
376
+
377
+
378
@app.get("/chat", response_class=HTMLResponse)
async def chat_interface():
    """Simple web interface for testing.

    Serves a self-contained HTML/JS page that POSTs the running conversation
    back to this same /chat route (non-streaming) and renders assistant
    replies, including any tool_calls payload.
    """
    # The markup below is a runtime string — keep its fetch() body in sync
    # with the POST /chat JSON contract (messages, stream, max_tokens,
    # temperature).
    return """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Nanbeige4.1-3B Chat</title>
    <style>
        * { margin: 0; padding: 0; box-sizing: border-box; }
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            background: #1a1a2e;
            color: #eee;
            min-height: 100vh;
            display: flex;
            flex-direction: column;
        }
        header {
            background: #16213e;
            padding: 1rem 2rem;
            border-bottom: 1px solid #0f3460;
        }
        header h1 { font-size: 1.25rem; color: #e94560; }
        header p { font-size: 0.875rem; color: #888; margin-top: 0.25rem; }
        .chat-container {
            flex: 1;
            display: flex;
            flex-direction: column;
            max-width: 900px;
            width: 100%;
            margin: 0 auto;
            padding: 1rem;
        }
        .messages {
            flex: 1;
            overflow-y: auto;
            padding: 1rem;
            display: flex;
            flex-direction: column;
            gap: 1rem;
        }
        .message {
            max-width: 80%;
            padding: 1rem;
            border-radius: 12px;
            line-height: 1.6;
        }
        .message.user {
            align-self: flex-end;
            background: #e94560;
            color: white;
        }
        .message.assistant {
            align-self: flex-start;
            background: #16213e;
            border: 1px solid #0f3460;
        }
        .message.system {
            align-self: center;
            background: #0f3460;
            font-size: 0.875rem;
            color: #888;
        }
        .input-area {
            display: flex;
            gap: 0.5rem;
            padding: 1rem;
            background: #16213e;
            border-top: 1px solid #0f3460;
        }
        textarea {
            flex: 1;
            padding: 0.75rem 1rem;
            border: 1px solid #0f3460;
            border-radius: 8px;
            background: #1a1a2e;
            color: #eee;
            font-size: 1rem;
            resize: none;
            min-height: 50px;
            max-height: 150px;
        }
        textarea:focus {
            outline: none;
            border-color: #e94560;
        }
        button {
            padding: 0.75rem 1.5rem;
            background: #e94560;
            color: white;
            border: none;
            border-radius: 8px;
            cursor: pointer;
            font-size: 1rem;
            transition: background 0.2s;
        }
        button:hover { background: #d63d56; }
        button:disabled { background: #666; cursor: not-allowed; }
        .loading {
            display: inline-block;
            width: 20px;
            height: 20px;
            border: 2px solid #0f3460;
            border-top-color: #e94560;
            border-radius: 50%;
            animation: spin 1s linear infinite;
        }
        @keyframes spin { to { transform: rotate(360deg); } }
        .tool-calls {
            margin-top: 0.5rem;
            padding: 0.5rem;
            background: #0f3460;
            border-radius: 6px;
            font-size: 0.8rem;
            font-family: monospace;
        }
    </style>
</head>
<body>
    <header>
        <h1>Nanbeige4.1-3B Inference Server</h1>
        <p>Remote LLM service for Enterprise ReAct Agent</p>
    </header>
    <div class="chat-container">
        <div class="messages" id="messages"></div>
        <div class="input-area">
            <textarea id="input" placeholder="Type your message..." rows="1"></textarea>
            <button id="send" onclick="sendMessage()">Send</button>
        </div>
    </div>

    <script>
        const messages = document.getElementById('messages');
        const input = document.getElementById('input');
        const sendBtn = document.getElementById('send');
        let conversation = [];

        // Auto-resize textarea
        input.addEventListener('input', () => {
            input.style.height = 'auto';
            input.style.height = Math.min(input.scrollHeight, 150) + 'px';
        });

        // Enter to send, Shift+Enter for new line
        input.addEventListener('keydown', (e) => {
            if (e.key === 'Enter' && !e.shiftKey) {
                e.preventDefault();
                sendMessage();
            }
        });

        function addMessage(role, content, toolCalls = null) {
            const div = document.createElement('div');
            div.className = `message ${role}`;
            div.textContent = content;
            if (toolCalls) {
                const toolDiv = document.createElement('div');
                toolDiv.className = 'tool-calls';
                toolDiv.textContent = 'Tool calls: ' + JSON.stringify(toolCalls, null, 2);
                div.appendChild(toolDiv);
            }
            messages.appendChild(div);
            messages.scrollTop = messages.scrollHeight;
        }

        async function sendMessage() {
            const text = input.value.trim();
            if (!text) return;

            addMessage('user', text);
            conversation.push({ role: 'user', content: text });
            input.value = '';
            input.style.height = 'auto';
            sendBtn.disabled = true;
            sendBtn.innerHTML = '<span class="loading"></span>';

            try {
                const response = await fetch('/chat', {
                    method: 'POST',
                    headers: { 'Content-Type': 'application/json' },
                    body: JSON.stringify({
                        messages: conversation,
                        stream: false,
                        max_tokens: 2048,
                        temperature: 0.6
                    })
                });

                const data = await response.json();
                const assistantMsg = data.choices[0].message;

                addMessage('assistant', assistantMsg.content, assistantMsg.tool_calls);
                conversation.push({
                    role: 'assistant',
                    content: assistantMsg.content,
                    tool_calls: assistantMsg.tool_calls
                });
            } catch (error) {
                addMessage('system', 'Error: ' + error.message);
            } finally {
                sendBtn.disabled = false;
                sendBtn.textContent = 'Send';
            }
        }

        // Initial system message
        addMessage('system', 'Welcome! The model is ready for inference.');
    </script>
</body>
</html>
"""
592
+
593
+
594
@app.get("/health")
async def health_check():
    """Health probe: reports model identity, device, and load state."""
    is_loaded = model is not None and tokenizer is not None
    return {
        "status": "healthy",
        "model": MODEL_ID,
        "device": DEVICE,
        "model_loaded": is_loaded
    }
603
+
604
 
605
@app.get("/")
async def root():
    """Service metadata endpoint advertising the available routes."""
    available = {
        "chat": "/chat (POST for API, GET for web interface)",
        "health": "/health"
    }
    return {
        "message": "Nanbeige4.1-3B Inference Server",
        "endpoints": available,
        "model": MODEL_ID,
        "device": DEVICE
    }
617
 
 
 
 
618
 
619
if __name__ == "__main__":
    import uvicorn
    # PORT may be injected by the hosting environment; 7860 is the
    # Hugging Face Spaces convention.
    port = int(os.environ.get("PORT", 7860))
    uvicorn.run(app, host="0.0.0.0", port=port)