Lzy01241010 Claude Opus 4.7 committed on
Commit
1dc5d45
·
1 Parent(s): 0c72122

app: add QUEST_API_KEY for self-hosted endpoint Bearer

Browse files

Adds a dedicated env var so the Bearer token sent to QUEST_BASE_URL can be
kept separate from HF_TOKEN. When QUEST_API_KEY is unset or empty, behaviour
is unchanged (it falls back to HF_TOKEN), preserving the existing dedicated
HF Inference Endpoint flow.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (2) hide show
  1. .env.example +7 -0
  2. app.py +4 -1
.env.example CHANGED
@@ -13,6 +13,13 @@ QUEST_BASE_URL=https://your-endpoint-id.aws.endpoints.huggingface.cloud/v1/
13
  # vLLM containers usually use the original repo id ("osunlp/Quest-4B").
14
  QUEST_ENDPOINT_MODEL=tgi
15
 
 
 
 
 
 
 
 
16
  # Default model preselected in the dropdown.
17
  DEFAULT_MODEL=osunlp/Quest-4B
18
 
 
13
  # vLLM containers usually use the original repo id ("osunlp/Quest-4B").
14
  QUEST_ENDPOINT_MODEL=tgi
15
 
16
+ # Bearer token sent to QUEST_BASE_URL. Optional. When unset, HF_TOKEN is used
17
+ # (matches legacy behaviour for HF Inference Endpoints). Set this to the
18
+ # `--api-key` of a self-hosted vLLM (or any other OpenAI-compatible server
19
+ # you tunnel through Cloudflare/ngrok) so the real HF_TOKEN never reaches
20
+ # third-party logs.
21
+ QUEST_API_KEY=
22
+
23
  # Default model preselected in the dropdown.
24
  DEFAULT_MODEL=osunlp/Quest-4B
25
 
app.py CHANGED
@@ -1389,9 +1389,12 @@ def _build_client_for_model(model: str) -> Tuple[InferenceClient, str, List[str]
1389
  token = os.getenv("HF_TOKEN")
1390
  quest_timeout = int(os.getenv("QUEST_REQUEST_TIMEOUT", "600"))
1391
  if model == QUEST_MODEL_ID and QUEST_BASE_URL:
 
 
 
1392
  client = InferenceClient(
1393
  base_url=QUEST_BASE_URL,
1394
- token=token,
1395
  timeout=quest_timeout,
1396
  )
1397
  return client, QUEST_ENDPOINT_MODEL, []
 
1389
  token = os.getenv("HF_TOKEN")
1390
  quest_timeout = int(os.getenv("QUEST_REQUEST_TIMEOUT", "600"))
1391
  if model == QUEST_MODEL_ID and QUEST_BASE_URL:
1392
+ # Prefer a dedicated key for the self-hosted endpoint so the real HF
1393
+ # token never travels into vLLM / tunnel logs.
1394
+ endpoint_token = os.getenv("QUEST_API_KEY") or token
1395
  client = InferenceClient(
1396
  base_url=QUEST_BASE_URL,
1397
+ token=endpoint_token,
1398
  timeout=quest_timeout,
1399
  )
1400
  return client, QUEST_ENDPOINT_MODEL, []