rasAli02 committed on
Commit
8d2779b
·
1 Parent(s): 4cc00df

feat: finalize MI300X inference connection and live status update

Browse files
backend/agents.py CHANGED
@@ -15,17 +15,17 @@ import httpx # async HTTP — lightweight, no extra deps beyond requirements
15
  # ── AMD vLLM inference endpoint ─────────────────────────────────────────────
16
  # vLLM exposes an OpenAI-compatible API at /v1/chat/completions.
17
  # Set AMD_INFERENCE_URL in your .env to point at the running vLLM server.
18
- # Example: http://129.212.191.163 (direct port — ensure firewall allows it)
19
- # Or use the Jupyter proxy route: http://129.212.191.163/proxy/8000
20
  AMD_INFERENCE_URL = os.environ.get(
21
  "AMD_INFERENCE_URL",
22
- "http://165.245.143.46:8000"
23
  ).rstrip("/")
24
 
25
  # Token for the AMD inference server (if required)
26
  AMD_INFERENCE_TOKEN = os.environ.get(
27
  "AMD_INFERENCE_TOKEN",
28
- "5peRa6unb0DdXvzB3Pbck48IgNTDmxeJSUvE4NdnhvW70FcaX"
29
  )
30
 
31
  # The model name vLLM is serving (used in the chat/completions request).
 
15
  # ── AMD vLLM inference endpoint ─────────────────────────────────────────────
16
  # vLLM exposes an OpenAI-compatible API at /v1/chat/completions.
17
  # Set AMD_INFERENCE_URL in your .env to point at the running vLLM server.
18
+ # Example: http://165.245.137.80:8000 (direct port — ensure firewall allows it)
19
+ # Or use the Jupyter proxy route: http://165.245.137.80/proxy/8000
20
  AMD_INFERENCE_URL = os.environ.get(
21
  "AMD_INFERENCE_URL",
22
+ "http://165.245.137.80"
23
  ).rstrip("/")
24
 
25
  # Token for the AMD inference server (if required)
26
  AMD_INFERENCE_TOKEN = os.environ.get(
27
  "AMD_INFERENCE_TOKEN",
28
+ "DiPipPSZoxb96rcrP7X+B0N5mTTEzxU/ziesgI/Z2NPo9xPKM"
29
  )
30
 
31
  # The model name vLLM is serving (used in the chat/completions request).
backend/deploy_to_amd.sh CHANGED
@@ -59,8 +59,10 @@ cat > /opt/forgesight/.env << 'EOF'
59
  MONGO_URL=mongodb://localhost:27017
60
  DB_NAME=forgesight
61
  CORS_ORIGINS=*
62
- # Set your AMD vLLM inference server URL here if running a local model:
63
- AMD_INFERENCE_URL=http://localhost:8000
 
 
64
  EOF
65
 
66
  echo ""
 
59
  MONGO_URL=mongodb://localhost:27017
60
  DB_NAME=forgesight
61
  CORS_ORIGINS=*
62
+ # Set your AMD vLLM inference server URL here:
63
+ AMD_INFERENCE_URL=http://165.245.137.80
64
+ AMD_INFERENCE_TOKEN=DiPipPSZoxb96rcrP7X+B0N5mTTEzxU/ziesgI/Z2NPo9xPKM
65
+ AMD_MODEL_NAME=Qwen/Qwen2-VL-7B-Instruct
66
  EOF
67
 
68
  echo ""
backend/server.py CHANGED
@@ -341,3 +341,9 @@ logger = logging.getLogger("forgesight")
341
  @app.on_event("shutdown")
342
  async def shutdown_db_client():
343
  client.close()
 
 
 
 
 
 
 
341
  @app.on_event("shutdown")
342
  async def shutdown_db_client():
343
  client.close()
344
+
345
+
346
+ if __name__ == "__main__":
347
+ import uvicorn
348
+ port = int(os.environ.get("PORT", 8001))
349
+ uvicorn.run(app, host="0.0.0.0", port=port)
backend/start_vllm.sh CHANGED
@@ -17,13 +17,13 @@ fi
17
  export HSA_OVERRIDE_GFX_VERSION=11.0.0
18
  export NCCL_DEBUG=ERROR
19
 
20
- python3 -m vllm.entrypoints.openai.api_server \
21
- --model "$MODEL_NAME" \
22
  --host 0.0.0.0 \
23
  --port "$PORT" \
24
- --trust-remote-code \
25
- --dtype bfloat16 \
26
- --limit-mm-per-prompt image=1 \
27
- --gpu-memory-utilization 0.95 \
28
- --max-model-len 8192 \
29
- --tensor-parallel-size 1
 
 
17
  export HSA_OVERRIDE_GFX_VERSION=11.0.0
18
  export NCCL_DEBUG=ERROR
19
 
20
+ vllm serve "$MODEL_NAME" \
 
21
  --host 0.0.0.0 \
22
  --port "$PORT" \
23
+ --tensor-parallel-size 8 \
24
+ --enable-expert-parallel \
25
+ --mm-encoder-tp-mode data \
26
+ --mm-processor-cache-type shm \
27
+ --reasoning-parser qwen3 \
28
+ --enable-prefix-caching \
29
+ --trust-remote-code
hf_space/agents.py CHANGED
@@ -15,17 +15,17 @@ import httpx # async HTTP — lightweight, no extra deps beyond requirements
15
  # ── AMD vLLM inference endpoint ─────────────────────────────────────────────
16
  # vLLM exposes an OpenAI-compatible API at /v1/chat/completions.
17
  # Set AMD_INFERENCE_URL in your .env to point at the running vLLM server.
18
- # Example: http://129.212.191.163:8000 (direct port — ensure firewall allows it)
19
- # Or use the Jupyter proxy route: http://129.212.191.163/proxy/8000
20
  AMD_INFERENCE_URL = os.environ.get(
21
  "AMD_INFERENCE_URL",
22
- "http://165.245.143.46:8000"
23
  ).rstrip("/")
24
 
25
  # Token for the AMD inference server (if required)
26
  AMD_INFERENCE_TOKEN = os.environ.get(
27
  "AMD_INFERENCE_TOKEN",
28
- "5peRa6unb0DdXvzB3Pbck48IgNTDmxeJSUvE4NdnhvW70FcaX"
29
  )
30
 
31
  # The model name vLLM is serving (used in the chat/completions request).
 
15
  # ── AMD vLLM inference endpoint ─────────────────────────────────────────────
16
  # vLLM exposes an OpenAI-compatible API at /v1/chat/completions.
17
  # Set AMD_INFERENCE_URL in your .env to point at the running vLLM server.
18
+ # Example: http://165.245.137.80:8000 (direct port — ensure firewall allows it)
19
+ # Or use the Jupyter proxy route: http://165.245.137.80/proxy/8000
20
  AMD_INFERENCE_URL = os.environ.get(
21
  "AMD_INFERENCE_URL",
22
+ "http://165.245.137.80"
23
  ).rstrip("/")
24
 
25
  # Token for the AMD inference server (if required)
26
  AMD_INFERENCE_TOKEN = os.environ.get(
27
  "AMD_INFERENCE_TOKEN",
28
+ "DiPipPSZoxb96rcrP7X+B0N5mTTEzxU/ziesgI/Z2NPo9xPKM"
29
  )
30
 
31
  # The model name vLLM is serving (used in the chat/completions request).
hf_space_repo CHANGED
@@ -1 +1 @@
1
- Subproject commit 53cd64c92db6de4b88570e9f034bf0164c174619
 
1
+ Subproject commit 5afad5017a9c8584dd462568837d8fa95ebfe1d1
lablab_forgesight ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 7ca49a46b7aa216edef41b4dc1a0f9095ec71ef9
lablab_forgesight_download/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
lablab_forgesight_download/README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Forgesight
3
+ emoji: 🌍
4
+ colorFrom: gray
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 6.14.0
8
+ python_version: '3.13'
9
+ app_file: app.py
10
+ pinned: false
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
scratch/test_amd_connection.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import httpx
2
+ import asyncio
3
+ import json
4
+
5
+ AMD_URL = "http://165.245.137.80"
6
+ AMD_TOKEN = "DiPipPSZoxb96rcrP7X+B0N5mTTEzxU/ziesgI/Z2NPo9xPKM"
7
+
8
async def test():
    """Probe the AMD vLLM server's ``/v1/models`` route on each known path.

    Tries, in order: the base URL as given in ``AMD_URL``, the
    ``/proxy/8000`` route on that base URL, and port 8000 directly. Each
    request carries ``AMD_TOKEN`` as a bearer token. The HTTP status of every
    attempt is printed; on the first 200 response the JSON body is printed
    and probing stops. A failed attempt is reported and the next route is
    tried.

    Returns:
        None. All results are reported on stdout.
    """
    headers = {"Authorization": f"Bearer {AMD_TOKEN}"}

    print(f"Testing connectivity to {AMD_URL}...")

    # Each probe: (url, status-line label, success label, failure label).
    # The labels differ per route, so they are kept verbatim rather than
    # derived from one another.
    probes = (
        # 1. Test port 80 / proxy
        (
            f"{AMD_URL}/v1/models",
            "Port 80 /v1/models",
            "Port 80",
            "Port 80 /v1/models",
        ),
        # 2. Test /proxy/8000
        (
            f"{AMD_URL}/proxy/8000/v1/models",
            "Port 80 /proxy/8000/v1/models",
            "/proxy/8000",
            "/proxy/8000",
        ),
        # 3. Test port 8000 directly. Built from AMD_URL so that changing the
        # host updates every probe; assumes AMD_URL carries no explicit port.
        (
            f"{AMD_URL}:8000/v1/models",
            "Port 8000 /v1/models",
            "Port 8000",
            "Port 8000",
        ),
    )

    # One client is shared by all probes instead of opening one per attempt.
    async with httpx.AsyncClient(timeout=10) as client:
        for url, status_label, success_label, failure_label in probes:
            try:
                response = await client.get(url, headers=headers)
                print(f"{status_label}: {response.status_code}")
                if response.status_code == 200:
                    print(f"SUCCESS: vLLM is alive on {success_label}!")
                    print(response.json())
                    return
            except Exception as exc:
                # Deliberately broad: this is a best-effort diagnostic, so any
                # failure (connect error, timeout, non-JSON body) is reported
                # and the next route is still tried.
                print(f"{failure_label} failed: {exc}")
48
+
49
+ if __name__ == "__main__":
50
+ asyncio.run(test())