Spaces:

osunlp
/

QUEST

Running

Lzy01241010 Claude Opus 4.7 committed 12 days ago

Commit

1dc5d45

1 Parent(s): 0c72122

app: add QUEST_API_KEY for self-hosted endpoint Bearer

Adds a dedicated env var so the Bearer sent to QUEST_BASE_URL can be
separated from HF_TOKEN. When QUEST_API_KEY is unset, behaviour is
unchanged (falls back to HF_TOKEN), preserving the existing dedicated
HF Inference Endpoint flow.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>

Files changed (2) hide show

.env.example +7 -0
app.py +4 -1

.env.example CHANGED Viewed

@@ -13,6 +13,13 @@ QUEST_BASE_URL=https://your-endpoint-id.aws.endpoints.huggingface.cloud/v1/
 # vLLM containers usually use the original repo id ("osunlp/Quest-4B").
 QUEST_ENDPOINT_MODEL=tgi
 # Default model preselected in the dropdown.
 DEFAULT_MODEL=osunlp/Quest-4B

 # vLLM containers usually use the original repo id ("osunlp/Quest-4B").
 QUEST_ENDPOINT_MODEL=tgi
+# Bearer token sent to QUEST_BASE_URL. Optional. When unset, HF_TOKEN is used
+# (matches legacy behaviour for HF Inference Endpoints). Set this to the
+# `--api-key` of a self-hosted vLLM (or any other OpenAI-compatible server
+# you tunnel through Cloudflare/ngrok) so the real HF_TOKEN never reaches
+# third-party logs.
+QUEST_API_KEY=
 # Default model preselected in the dropdown.
 DEFAULT_MODEL=osunlp/Quest-4B

app.py CHANGED Viewed

@@ -1389,9 +1389,12 @@ def _build_client_for_model(model: str) -> Tuple[InferenceClient, str, List[str]
     token = os.getenv("HF_TOKEN")
     quest_timeout = int(os.getenv("QUEST_REQUEST_TIMEOUT", "600"))
     if model == QUEST_MODEL_ID and QUEST_BASE_URL:
         client = InferenceClient(
             base_url=QUEST_BASE_URL,
-            token=token,
             timeout=quest_timeout,
         )
         return client, QUEST_ENDPOINT_MODEL, []

     token = os.getenv("HF_TOKEN")
     quest_timeout = int(os.getenv("QUEST_REQUEST_TIMEOUT", "600"))
     if model == QUEST_MODEL_ID and QUEST_BASE_URL:
+        # Prefer a dedicated key for the self-hosted endpoint so the real HF
+        # token never travels into vLLM / tunnel logs.
+        endpoint_token = os.getenv("QUEST_API_KEY") or token
         client = InferenceClient(
             base_url=QUEST_BASE_URL,
+            token=endpoint_token,
             timeout=quest_timeout,
         )
         return client, QUEST_ENDPOINT_MODEL, []