feat: finalize MI300X inference connection and live status update
Browse files- backend/agents.py +4 -4
- backend/deploy_to_amd.sh +4 -2
- backend/server.py +6 -0
- backend/start_vllm.sh +8 -8
- hf_space/agents.py +4 -4
- hf_space_repo +1 -1
- lablab_forgesight +1 -0
- lablab_forgesight_download/.gitattributes +35 -0
- lablab_forgesight_download/README.md +13 -0
- scratch/test_amd_connection.py +50 -0
backend/agents.py
CHANGED
|
@@ -15,17 +15,17 @@ import httpx # async HTTP — lightweight, no extra deps beyond requirements
|
|
| 15 |
# ── AMD vLLM inference endpoint ─────────────────────────────────────────────
|
| 16 |
# vLLM exposes an OpenAI-compatible API at /v1/chat/completions.
|
| 17 |
# Set AMD_INFERENCE_URL in your .env to point at the running vLLM server.
|
| 18 |
-
# Example: http://
|
| 19 |
-
# Or use the Jupyter proxy route: http://
|
| 20 |
AMD_INFERENCE_URL = os.environ.get(
|
| 21 |
"AMD_INFERENCE_URL",
|
| 22 |
-
"http://165.245.
|
| 23 |
).rstrip("/")
|
| 24 |
|
| 25 |
# Token for the AMD inference server (if required)
|
| 26 |
AMD_INFERENCE_TOKEN = os.environ.get(
|
| 27 |
"AMD_INFERENCE_TOKEN",
|
| 28 |
-
"
|
| 29 |
)
|
| 30 |
|
| 31 |
# The model name vLLM is serving (used in the chat/completions request).
|
|
|
|
| 15 |
# ── AMD vLLM inference endpoint ─────────────────────────────────────────────
# vLLM exposes an OpenAI-compatible API at /v1/chat/completions.
# Set AMD_INFERENCE_URL in your .env to point at the running vLLM server.
# Example: http://<server-ip>:8000 (direct port — ensure firewall allows it)
# Or use the Jupyter proxy route: http://<server-ip>/proxy/8000
AMD_INFERENCE_URL = os.environ.get(
    "AMD_INFERENCE_URL",
    "http://165.245.137.80",
).rstrip("/")  # drop any trailing slash so request paths can be appended as-is

# Token for the AMD inference server (if required).
# Read from the environment / .env only: a bearer token committed as the
# fallback value is readable by anyone who can see the repository and cannot
# be rotated without a code change. An empty string means "no token set".
# NOTE(review): confirm the request code omits the Authorization header when
# this value is empty.
AMD_INFERENCE_TOKEN = os.environ.get("AMD_INFERENCE_TOKEN", "")
|
| 30 |
|
| 31 |
# The model name vLLM is serving (used in the chat/completions request).
|
backend/deploy_to_amd.sh
CHANGED
|
@@ -59,8 +59,10 @@ cat > /opt/forgesight/.env << 'EOF'
|
|
| 59 |
MONGO_URL=mongodb://localhost:27017
|
| 60 |
DB_NAME=forgesight
|
| 61 |
CORS_ORIGINS=*
|
| 62 |
-
# Set your AMD vLLM inference server URL here
|
| 63 |
-
AMD_INFERENCE_URL=http://
|
|
|
|
|
|
|
| 64 |
EOF
|
| 65 |
|
| 66 |
echo ""
|
|
|
|
| 59 |
MONGO_URL=mongodb://localhost:27017
|
| 60 |
DB_NAME=forgesight
|
| 61 |
CORS_ORIGINS=*
|
| 62 |
+
# Set your AMD vLLM inference server URL here:
|
| 63 |
+
AMD_INFERENCE_URL=http://165.245.137.80
|
| 64 |
+
AMD_INFERENCE_TOKEN=DiPipPSZoxb96rcrP7X+B0N5mTTEzxU/ziesgI/Z2NPo9xPKM
|
| 65 |
+
AMD_MODEL_NAME=Qwen/Qwen2-VL-7B-Instruct
|
| 66 |
EOF
|
| 67 |
|
| 68 |
echo ""
|
backend/server.py
CHANGED
|
@@ -341,3 +341,9 @@ logger = logging.getLogger("forgesight")
|
|
| 341 |
@app.on_event("shutdown")
|
| 342 |
async def shutdown_db_client():
|
| 343 |
client.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 341 |
@app.on_event("shutdown")
async def shutdown_db_client():
    """Close the module-level database client when the app shuts down."""
    client.close()
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
if __name__ == "__main__":
    # uvicorn is imported here so it is only needed when this module is run
    # directly as a script.
    import uvicorn

    # PORT may be set in the environment; 8001 is used when it is absent.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=int(os.environ.get("PORT", 8001)),
    )
|
backend/start_vllm.sh
CHANGED
|
@@ -17,13 +17,13 @@ fi
|
|
| 17 |
export HSA_OVERRIDE_GFX_VERSION=11.0.0
|
| 18 |
export NCCL_DEBUG=ERROR
|
| 19 |
|
| 20 |
-
|
| 21 |
-
--model "$MODEL_NAME" \
|
| 22 |
--host 0.0.0.0 \
|
| 23 |
--port "$PORT" \
|
| 24 |
-
--
|
| 25 |
-
--
|
| 26 |
-
--
|
| 27 |
-
--
|
| 28 |
-
--
|
| 29 |
-
--
|
|
|
|
|
|
| 17 |
export HSA_OVERRIDE_GFX_VERSION=11.0.0
|
| 18 |
export NCCL_DEBUG=ERROR
|
| 19 |
|
| 20 |
+
vllm serve "$MODEL_NAME" \
|
|
|
|
| 21 |
--host 0.0.0.0 \
|
| 22 |
--port "$PORT" \
|
| 23 |
+
--tensor-parallel-size 8 \
|
| 24 |
+
--enable-expert-parallel \
|
| 25 |
+
--mm-encoder-tp-mode data \
|
| 26 |
+
--mm-processor-cache-type shm \
|
| 27 |
+
--reasoning-parser qwen3 \
|
| 28 |
+
--enable-prefix-caching \
|
| 29 |
+
--trust-remote-code
|
hf_space/agents.py
CHANGED
|
@@ -15,17 +15,17 @@ import httpx # async HTTP — lightweight, no extra deps beyond requirements
|
|
| 15 |
# ── AMD vLLM inference endpoint ─────────────────────────────────────────────
|
| 16 |
# vLLM exposes an OpenAI-compatible API at /v1/chat/completions.
|
| 17 |
# Set AMD_INFERENCE_URL in your .env to point at the running vLLM server.
|
| 18 |
-
# Example: http://
|
| 19 |
-
# Or use the Jupyter proxy route: http://
|
| 20 |
AMD_INFERENCE_URL = os.environ.get(
|
| 21 |
"AMD_INFERENCE_URL",
|
| 22 |
-
"http://165.245.
|
| 23 |
).rstrip("/")
|
| 24 |
|
| 25 |
# Token for the AMD inference server (if required)
|
| 26 |
AMD_INFERENCE_TOKEN = os.environ.get(
|
| 27 |
"AMD_INFERENCE_TOKEN",
|
| 28 |
-
"
|
| 29 |
)
|
| 30 |
|
| 31 |
# The model name vLLM is serving (used in the chat/completions request).
|
|
|
|
| 15 |
# ── AMD vLLM inference endpoint ─────────────────────────────────────────────
# vLLM exposes an OpenAI-compatible API at /v1/chat/completions.
# Set AMD_INFERENCE_URL in your .env to point at the running vLLM server.
# Example: http://<server-ip>:8000 (direct port — ensure firewall allows it)
# Or use the Jupyter proxy route: http://<server-ip>/proxy/8000
AMD_INFERENCE_URL = os.environ.get(
    "AMD_INFERENCE_URL",
    "http://165.245.137.80",
).rstrip("/")  # drop any trailing slash so request paths can be appended as-is

# Token for the AMD inference server (if required).
# Read from the environment / Space secrets only: a bearer token committed as
# the fallback value is readable by anyone who can see the repository and
# cannot be rotated without a code change. An empty string means "no token set".
# NOTE(review): confirm the request code omits the Authorization header when
# this value is empty.
AMD_INFERENCE_TOKEN = os.environ.get("AMD_INFERENCE_TOKEN", "")
|
| 30 |
|
| 31 |
# The model name vLLM is serving (used in the chat/completions request).
|
hf_space_repo
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
Subproject commit
|
|
|
|
| 1 |
+
Subproject commit 5afad5017a9c8584dd462568837d8fa95ebfe1d1
|
lablab_forgesight
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Subproject commit 7ca49a46b7aa216edef41b4dc1a0f9095ec71ef9
|
lablab_forgesight_download/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
lablab_forgesight_download/README.md
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Forgesight
|
| 3 |
+
emoji: 🌍
|
| 4 |
+
colorFrom: gray
|
| 5 |
+
colorTo: indigo
|
| 6 |
+
sdk: gradio
|
| 7 |
+
sdk_version: 6.14.0
|
| 8 |
+
python_version: '3.13'
|
| 9 |
+
app_file: app.py
|
| 10 |
+
pinned: false
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
scratch/test_amd_connection.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import httpx
|
| 2 |
+
import asyncio
|
| 3 |
+
import json
|
| 4 |
+
|
| 5 |
+
def _env(name, default=""):
    """Return environment variable *name*, or *default* when it is unset."""
    import os  # function-scope import: the script header does not import os

    return os.environ.get(name, default)


def _direct_port_url(base_url, port=8000):
    """Return *base_url* reduced to its scheme and host, pointing at *port*."""
    from urllib.parse import urlsplit

    parts = urlsplit(base_url)
    return f"{parts.scheme}://{parts.hostname}:{port}"


def _probe_targets(base_url):
    """Build the routes to try, in order.

    Each entry is ``(status_label, url, success_name, failure_label)``; the
    labels reproduce the messages the script has always printed.
    """
    return [
        (
            "Port 80 /v1/models",
            f"{base_url}/v1/models",
            "Port 80",
            "Port 80 /v1/models",
        ),
        (
            "Port 80 /proxy/8000/v1/models",
            f"{base_url}/proxy/8000/v1/models",
            "/proxy/8000",
            "/proxy/8000",
        ),
        (
            "Port 8000 /v1/models",
            # Derived from base_url so an overridden host is probed on the
            # direct port too, instead of a second hard-coded address.
            f"{_direct_port_url(base_url)}/v1/models",
            "Port 8000",
            "Port 8000",
        ),
    ]


# Base URL of the inference host; AMD_INFERENCE_URL overrides the built-in one.
AMD_URL = _env("AMD_INFERENCE_URL", "http://165.245.137.80").rstrip("/")

# Bearer token, taken from the environment only so that no credential is stored
# in the repository. Empty means "send no Authorization header".
AMD_TOKEN = _env("AMD_INFERENCE_TOKEN")


async def test():
    """Probe each candidate route to the vLLM server's ``/v1/models`` endpoint.

    Routes are tried in order (port 80, the ``/proxy/8000`` route, then port
    8000 directly). The first route answering HTTP 200 has its JSON body
    printed and ends the run.

    Returns:
        True if some route answered with HTTP 200, otherwise False.
    """
    headers = {"Authorization": f"Bearer {AMD_TOKEN}"} if AMD_TOKEN else {}

    print(f"Testing connectivity to {AMD_URL}...")

    # One client is shared by all probes rather than opening one per route.
    async with httpx.AsyncClient(timeout=10) as client:
        for status_label, url, success_name, failure_label in _probe_targets(AMD_URL):
            try:
                r = await client.get(url, headers=headers)
                print(f"{status_label}: {r.status_code}")
                if r.status_code == 200:
                    print(f"SUCCESS: vLLM is alive on {success_name}!")
                    print(r.json())
                    return True
            except Exception as e:
                # Deliberately broad: this is a best-effort diagnostic, and a
                # failure on one route must not stop the remaining probes.
                print(f"{failure_label} failed: {e}")

    print("No route answered with HTTP 200.")
    return False


if __name__ == "__main__":
    asyncio.run(test())
|