Rox-Turbo committed on
Commit
44bcbbf
·
verified ·
1 Parent(s): 86666bd

Upload 6 files

Browse files
Files changed (6) hide show
  1. .dockerignore +15 -0
  2. Dockerfile +22 -0
  3. README.md +100 -11
  4. docker-compose.yml +12 -0
  5. requirements.txt +4 -0
  6. server.py +162 -0
.dockerignore ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__/
2
+ *.py[cod]
3
+ *.pyo
4
+ *.pyd
5
+ .Python
6
+ env/
7
+ venv/
8
+ .venv/
9
+ build/
10
+ dist/
11
+ *.egg-info/
12
+ .git
13
+ .gitignore
14
+ .env
15
+
Dockerfile ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
FROM python:3.11-slim

WORKDIR /app

# Don't write .pyc files; flush stdout/stderr immediately for live logs.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1

# Create an unprivileged user so the app does not run as root.
RUN groupadd --system app && useradd --system --gid app app

# Install dependencies in their own layer so it is cached until
# requirements.txt itself changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application source last (changes most often).
COPY . .

RUN chown -R app:app /app
USER app

# NVIDIA_API_KEY must be provided at runtime (docker run -e ... or env_file)
EXPOSE 8000

CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8000"]
README.md CHANGED
@@ -1,11 +1,100 @@
1
- ---
2
- title: API
3
- emoji:
4
- colorFrom: purple
5
- colorTo: blue
6
- sdk: docker
7
- pinned: false
8
- short_description: API
9
- ---
10
-
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## NVIDIA Chat Proxy API
2
+
3
+ This is a small FastAPI server that proxies requests from your static website to the NVIDIA OpenAI-compatible endpoint, so your API key stays on the server and is never exposed in the browser.
4
+
5
+ ### 1. Setup
6
+
7
+ Create and activate a virtual environment (optional but recommended), then install dependencies:
8
+
9
+ ```bash
10
+ pip install -r requirements.txt
11
+ ```
12
+
13
+ Create a `.env` file in this folder:
14
+
15
+ ```bash
16
+ echo NVIDIA_API_KEY=your_real_nvidia_key_here > .env
17
+ ```
18
+
19
+ > **Important**: Never commit your real key to git or paste it in client-side code.
20
+
21
+ ### 2. Run the server
22
+
23
+ ```bash
24
+ python server.py
25
+ ```
26
+
27
+ The API will be available at `http://localhost:8000`.
28
+
29
+ ### 3. HTTP API
30
+
31
+ **Endpoint**: `POST /chat`
32
+
33
+ **Request body**:
34
+
35
+ ```json
36
+ {
37
+ "messages": [
38
+ { "role": "user", "content": "Hello!" }
39
+ ],
40
+ "temperature": 1.0,
41
+ "top_p": 1.0,
42
+ "max_tokens": 512
43
+ }
44
+ ```
45
+
46
+ **Response body**:
47
+
48
+ ```json
49
+ {
50
+ "content": "Model reply here..."
51
+ }
52
+ ```
53
+
54
+ ### 4. Example usage from a static website
55
+
56
+ ```html
57
+ <!DOCTYPE html>
58
+ <html>
59
+ <head>
60
+ <meta charset="UTF-8" />
61
+ <title>Chat with NVIDIA Model</title>
62
+ </head>
63
+ <body>
64
+ <textarea id="input" placeholder="Ask something..."></textarea>
65
+ <button id="send">Send</button>
66
+ <pre id="output"></pre>
67
+
68
+ <script>
69
+ const API_URL = "http://localhost:8000/chat"; // or your deployed URL
70
+
71
+ document.getElementById("send").addEventListener("click", async () => {
72
+ const userText = document.getElementById("input").value;
73
+
74
+ const body = {
75
+ messages: [{ role: "user", content: userText }],
76
+ temperature: 1,
77
+ top_p: 1,
78
+ max_tokens: 512,
79
+ };
80
+
81
+ const res = await fetch(API_URL, {
82
+ method: "POST",
83
+ headers: { "Content-Type": "application/json" },
84
+ body: JSON.stringify(body),
85
+ });
86
+
87
+ if (!res.ok) {
88
+ document.getElementById("output").textContent =
89
+ "Error: " + (await res.text());
90
+ return;
91
+ }
92
+
93
+ const data = await res.json();
94
+ document.getElementById("output").textContent = data.content;
95
+ });
96
+ </script>
97
+ </body>
98
+ </html>
99
+ ```
100
+
docker-compose.yml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
# NOTE(review): the top-level `version` key is obsolete under Compose v2
# (it is ignored with a warning); kept for older docker-compose binaries.
version: "3.9"

services:
  nvidia-chat-proxy:
    build: .
    container_name: nvidia-chat-proxy
    ports:
      - "8000:8000"
    # Expects a .env file next to this compose file supplying NVIDIA_API_KEY.
    env_file:
      - .env
    restart: unless-stopped
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ openai
4
+ python-dotenv
server.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""FastAPI proxy that forwards chat requests to NVIDIA's OpenAI-compatible API.

Keeps the NVIDIA API key server-side so it is never exposed to browsers.
"""

import logging
import os
from typing import List, Optional

from dotenv import load_dotenv
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from openai import OpenAI


# Load NVIDIA_API_KEY (and any other settings) from a local .env file.
load_dotenv()

logger = logging.getLogger("nvidia_chat_proxy")
logging.basicConfig(level=logging.INFO)

NVIDIA_API_KEY = os.getenv("NVIDIA_API_KEY")

if not NVIDIA_API_KEY:
    # Fail fast on startup rather than at first request.
    raise RuntimeError(
        "NVIDIA_API_KEY environment variable is not set. "
        "Create a .env file or set it in your environment."
    )


# OpenAI SDK client pointed at NVIDIA's OpenAI-compatible endpoint.
client = OpenAI(
    base_url="https://integrate.api.nvidia.com/v1",
    api_key=NVIDIA_API_KEY,
)

app = FastAPI(title="NVIDIA Chat Proxy API")

# Adjust this list to only include your real frontend origins in production.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],  # e.g. ["https://your-site.com"]
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
42
+
43
+
class ChatMessage(BaseModel):
    """A single chat message in OpenAI chat-completions format."""

    # e.g. "user", "assistant", "system" — forwarded to the upstream model.
    role: str
    content: str
47
+
48
+
class ChatRequest(BaseModel):
    """Request body for POST /chat.

    Sampling parameters are passed straight through to the upstream
    chat-completions call.
    """

    messages: List[ChatMessage]
    temperature: Optional[float] = 1.0
    top_p: Optional[float] = 1.0
    max_tokens: Optional[int] = 4096
54
+
55
+
class ChatResponse(BaseModel):
    """Response body for POST /chat: the model's reply text."""

    content: str
58
+
59
+
class HFParameters(BaseModel):
    """Optional Hugging Face-style generation parameters.

    Fields left as None fall back to server-side defaults in /hf/generate
    (temperature=1.0, top_p=1.0, max_new_tokens=4096).
    """

    temperature: Optional[float] = None
    top_p: Optional[float] = None
    max_new_tokens: Optional[int] = None
64
+
65
+
class HFRequest(BaseModel):
    """Request body for POST /hf/generate: a single prompt plus options."""

    inputs: str
    parameters: Optional[HFParameters] = None
69
+
70
+
class HFResponseItem(BaseModel):
    """One item of the Hugging Face-style response list."""

    generated_text: str
73
+
74
+
@app.post("/chat", response_model=ChatResponse)
def chat(req: ChatRequest):
    """Proxy an OpenAI-style chat request to the NVIDIA endpoint.

    Forwards the messages and sampling parameters from ``req`` to the
    upstream model and returns only the reply text.

    Raises:
        HTTPException 500: the upstream call itself failed.
        HTTPException 502: the upstream response had an unexpected shape.
    """
    try:
        completion = client.chat.completions.create(
            model="openai/gpt-oss-120b",
            # Build plain dicts explicitly instead of calling the
            # pydantic v1 ``.dict()`` API, which is deprecated (warns, and
            # is slated for removal) under pydantic v2. This form behaves
            # identically on both pydantic major versions.
            messages=[{"role": m.role, "content": m.content} for m in req.messages],
            temperature=req.temperature,
            top_p=req.top_p,
            max_tokens=req.max_tokens,
            stream=False,
        )
    except Exception as e:
        logger.exception("Error while calling NVIDIA chat completion for /chat")
        # Do not leak internal error details to the client.
        raise HTTPException(
            status_code=500,
            detail="Internal server error while calling upstream model.",
        ) from e

    # Extract the reply; an empty choices list or a missing attribute means
    # the upstream payload did not match the chat-completions shape.
    try:
        content = completion.choices[0].message.content or ""
    except (IndexError, AttributeError) as e:
        logger.exception("Unexpected response format from NVIDIA API for /chat")
        raise HTTPException(
            status_code=502,
            detail="Bad response from upstream model provider.",
        ) from e

    return ChatResponse(content=content)
105
+
106
+
@app.post("/hf/generate", response_model=List[HFResponseItem])
def hf_generate(req: HFRequest):
    """Hugging Face-style text-generation endpoint.

    Request:
        {
            "inputs": "your prompt",
            "parameters": {
                "temperature": 0.7,
                "top_p": 0.95,
                "max_new_tokens": 256
            }
        }

    Response:
        [
            { "generated_text": "..." }
        ]

    The prompt is wrapped as a single user message and sent to the same
    upstream chat-completions model as /chat.
    """
    params = req.parameters or HFParameters()

    # Resolve server-side defaults for any parameter the caller left unset.
    temperature = 1.0 if params.temperature is None else params.temperature
    top_p = 1.0 if params.top_p is None else params.top_p
    max_tokens = 4096 if params.max_new_tokens is None else params.max_new_tokens

    try:
        completion = client.chat.completions.create(
            model="openai/gpt-oss-120b",
            messages=[{"role": "user", "content": req.inputs}],
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            stream=False,
        )
    except Exception as e:
        logger.exception("Error while calling NVIDIA chat completion for /hf/generate")
        raise HTTPException(
            status_code=500,
            detail="Internal server error while calling upstream model.",
        ) from e

    try:
        content = completion.choices[0].message.content or ""
    except Exception:
        logger.exception("Unexpected response format from NVIDIA API for /hf/generate")
        raise HTTPException(
            status_code=502,
            detail="Bad response from upstream model provider.",
        )

    # Match the common HF text-generation API: list of objects with generated_text
    return [HFResponseItem(generated_text=content)]
156
+
157
+
if __name__ == "__main__":
    import uvicorn

    # Development entry point: reload=True watches source files for changes.
    # In the container, the Dockerfile CMD runs uvicorn without reload.
    uvicorn.run("server:app", host="0.0.0.0", port=8000, reload=True)
162
+