Commit ·
1dc5d45
1
Parent(s): 0c72122
app: add QUEST_API_KEY for self-hosted endpoint Bearer
Browse files
Adds a dedicated env var so the Bearer token sent to QUEST_BASE_URL can be
separated from HF_TOKEN. When QUEST_API_KEY is unset, behaviour is
unchanged (falls back to HF_TOKEN), preserving the existing dedicated
HF Inference Endpoint flow.
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
- .env.example +7 -0
- app.py +4 -1
.env.example
CHANGED
|
@@ -13,6 +13,13 @@ QUEST_BASE_URL=https://your-endpoint-id.aws.endpoints.huggingface.cloud/v1/
|
|
| 13 |
# vLLM containers usually use the original repo id ("osunlp/Quest-4B").
|
| 14 |
QUEST_ENDPOINT_MODEL=tgi
|
| 15 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
# Default model preselected in the dropdown.
|
| 17 |
DEFAULT_MODEL=osunlp/Quest-4B
|
| 18 |
|
|
|
|
| 13 |
# vLLM containers usually use the original repo id ("osunlp/Quest-4B").
|
| 14 |
QUEST_ENDPOINT_MODEL=tgi
|
| 15 |
|
| 16 |
+
# Bearer token sent to QUEST_BASE_URL. Optional. When unset, HF_TOKEN is used
|
| 17 |
+
# (matches legacy behaviour for HF Inference Endpoints). Set this to the
|
| 18 |
+
# `--api-key` of a self-hosted vLLM (or any other OpenAI-compatible server
|
| 19 |
+
# you tunnel through Cloudflare/ngrok) so the real HF_TOKEN never reaches
|
| 20 |
+
# third-party logs.
|
| 21 |
+
QUEST_API_KEY=
|
| 22 |
+
|
| 23 |
# Default model preselected in the dropdown.
|
| 24 |
DEFAULT_MODEL=osunlp/Quest-4B
|
| 25 |
|
app.py
CHANGED
|
@@ -1389,9 +1389,12 @@ def _build_client_for_model(model: str) -> Tuple[InferenceClient, str, List[str]
|
|
| 1389 |
token = os.getenv("HF_TOKEN")
|
| 1390 |
quest_timeout = int(os.getenv("QUEST_REQUEST_TIMEOUT", "600"))
|
| 1391 |
if model == QUEST_MODEL_ID and QUEST_BASE_URL:
|
|
|
|
|
|
|
|
|
|
| 1392 |
client = InferenceClient(
|
| 1393 |
base_url=QUEST_BASE_URL,
|
| 1394 |
-
token=token,
|
| 1395 |
timeout=quest_timeout,
|
| 1396 |
)
|
| 1397 |
return client, QUEST_ENDPOINT_MODEL, []
|
|
|
|
| 1389 |
token = os.getenv("HF_TOKEN")
|
| 1390 |
quest_timeout = int(os.getenv("QUEST_REQUEST_TIMEOUT", "600"))
|
| 1391 |
if model == QUEST_MODEL_ID and QUEST_BASE_URL:
|
| 1392 |
+
# Prefer a dedicated key for the self-hosted endpoint so the real HF
|
| 1393 |
+
# token never travels into vLLM / tunnel logs.
|
| 1394 |
+
endpoint_token = os.getenv("QUEST_API_KEY") or token
|
| 1395 |
client = InferenceClient(
|
| 1396 |
base_url=QUEST_BASE_URL,
|
| 1397 |
+
token=endpoint_token,
|
| 1398 |
timeout=quest_timeout,
|
| 1399 |
)
|
| 1400 |
return client, QUEST_ENDPOINT_MODEL, []
|