Spaces:

dpv007
/

cloud

Sleeping

App Files Files Community

dpv007 committed on Dec 12, 2025

Commit

879187e

verified ·

1 Parent(s): 45ec08d

Update app.py

Browse files

Files changed (1) hide show

app.py +159 -109

app.py CHANGED Viewed

@@ -12,7 +12,7 @@ Notes:
  - Add httpx to requirements.txt for VLM POST/GET flow
  - If VLM/LLM Spaces are private, set HF_TOKEN in the environment for authentication.
  - This variant:
-    * sends the face image to the HF Space using the POST/GET event flow
     * returns raw VLM output and meta (no VLM-side JSON extraction)
 """
@@ -250,6 +250,7 @@ def extract_json_via_regex(raw_text: str) -> Dict[str, Any]:
 # -----------------------
 # VLM helper using HF Spaces POST/GET event flow (gradio_api/call/chat)
 # -----------------------
 def run_vlm_and_get_features(face_path: str, eye_path: Optional[str] = None, prompt: Optional[str] = None,
                              raise_on_file_delivery_failure: bool = False
@@ -259,30 +260,27 @@ def run_vlm_and_get_features(face_path: str, eye_path: Optional[str] = None, pro
       1) POST -> returns an EVENT_ID
       2) GET /gradio_api/call/chat/{EVENT_ID} -> fetch result
-    This function:
-      - Loads face image, encodes as base64 and embeds in JSON payload as a single file
-      - POSTs to the Space endpoint to create an event
-      - GETs the event result and extracts text/output
-      - Returns (parsed_features_or_None, raw_text, meta)
-    NOTE: The function returns parsed_features=None (no JSON extraction here) and raw_text for LLM downstream.
     """
     prompt = prompt or DEFAULT_VLM_PROMPT
     if not os.path.exists(face_path):
         raise FileNotFoundError(f"Face image not found at: {face_path}")
-    # Read and base64-encode the face image for embedding in JSON
     with open(face_path, "rb") as f:
         face_bytes = f.read()
     if not face_bytes:
         raise ValueError("Face image is empty (0 bytes)")
     face_b64 = base64.b64encode(face_bytes).decode("ascii")
-    # prefix with MIME type (assume jpeg)
     face_data_uri = f"data:image/jpeg;base64,{face_b64}"
-    # Build the JSON payload consistent with gradio multimodal style:
-    payload = {
         "data": [
             {
                 "text": prompt,
@@ -296,8 +294,7 @@ def run_vlm_and_get_features(face_path: str, eye_path: Optional[str] = None, pro
     if SPACE_HOST:
         base_url = SPACE_HOST.rstrip("/")
     else:
-        # infer from GRADIO_VLM_SPACE if it's of form "owner/space-name"
-        # Many public HF spaces also map to {owner}-{space}.hf.space
         if "/" in GRADIO_VLM_SPACE:
             base_url = f"https://{GRADIO_VLM_SPACE.replace('/', '-')}.hf.space"
         else:
@@ -306,125 +303,178 @@ def run_vlm_and_get_features(face_path: str, eye_path: Optional[str] = None, pro
     post_url = f"{base_url}/gradio_api/call/chat"
     get_url_template = f"{base_url}/gradio_api/call/chat/{{event_id}}"
-    headers = {"Content-Type": "application/json"}
     if HF_TOKEN:
-        headers["Authorization"] = f"Bearer {HF_TOKEN}"
     meta: Dict[str, Any] = {
         "vlm_file_delivery_ok": False,
         "vlm_files_seen": None,
         "vlm_raw_len": 0,
         "vlm_out_object": None,
-        "post_url": post_url
     }
-    try:
-        logger.info("VLM POST -> %s (payload text len=%d, files=1)", post_url, len(prompt))
-        with httpx.Client(timeout=30.0) as client:
-            resp = client.post(post_url, headers=headers, json=payload)
-            resp.raise_for_status()
-            # Try to robustly extract an event id from the POST response
-            event_id = None
             try:
-                rj = resp.json()
             except Exception:
-                rj = {}
-            if isinstance(rj, dict):
-                event_id = rj.get("event_id") or rj.get("id") or rj.get("job")
-            if not event_id:
-                # try to extract using regex from resp.text
-                m = re.search(r'"([^"]{8,})"', resp.text or "")
-                if m:
-                    event_id = m.group(1)
-            if not event_id:
-                parts = re.split(r'"', resp.text or "")
-                if len(parts) >= 5:
-                    event_id_candidate = parts[3].strip()
-                    if event_id_candidate:
-                        event_id = event_id_candidate
             if not event_id:
-                raise RuntimeError(f"Failed to obtain EVENT_ID from VLM POST response: {resp.text[:1000]}")
             meta["event_id"] = event_id
-            logger.info("VLM event created: %s", event_id)
-            # Poll the GET result endpoint
-            get_url = get_url_template.format(event_id=event_id)
-            logger.info("Polling VLM event result at %s", get_url)
-            max_polls = 6
-            poll_delay = 0.5
-            final_text = ""
-            last_response_json = None
-            for attempt in range(max_polls):
-                r2 = client.get(get_url, headers=headers, timeout=30.0)
-                if r2.status_code == 204 or not (r2.text and r2.text.strip()):
-                    time.sleep(poll_delay)
-                    continue
-                try:
-                    r2j = r2.json()
-                    last_response_json = r2j
-                except Exception:
-                    r2j = None
-                text_out = ""
-                if isinstance(r2j, dict):
-                    if "data" in r2j and isinstance(r2j["data"], list) and len(r2j["data"]) > 0:
-                        first = r2j["data"][0]
-                        if isinstance(first, dict):
-                            text_out = first.get("text") or first.get("output") or json.dumps(first)
-                        elif isinstance(first, str):
-                            text_out = first
-                    text_out = text_out or r2j.get("text") or r2j.get("msg") or r2j.get("output", "") or ""
-                else:
-                    text_out = r2.text or ""
-                if text_out and text_out.strip():
-                    final_text = text_out
-                    break
-                else:
-                    time.sleep(poll_delay)
-                    continue
-            if not final_text:
-                final_text = (r2.text or "").strip()
-            meta["vlm_raw_len"] = len(final_text)
-            meta["vlm_out_object"] = (final_text[:2000] + "...") if len(final_text) > 2000 else final_text
-            # Best-effort: detect whether server mentions receiving a file
-            files_seen = None
             try:
-                if isinstance(last_response_json, dict):
-                    for key in ("files", "output_files", "files_sent", "uploaded_files", "received_files"):
-                        if key in last_response_json and isinstance(last_response_json[key], (list, tuple)):
-                            files_seen = len(last_response_json[key])
-                            break
-                if files_seen is None and final_text:
-                    ext_matches = re.findall(r"\.(?:jpg|jpeg|png|bmp|gif)\b", final_text, flags=re.IGNORECASE)
-                    if ext_matches:
-                        files_seen = len(ext_matches)
-                    else:
-                        matches = re.findall(r"\b(?:uploaded|received|file)\b", final_text, flags=re.IGNORECASE)
-                        if matches:
-                            files_seen = max(1, len(matches))
             except Exception:
-                files_seen = None
-            meta["vlm_files_seen"] = files_seen
-            meta["vlm_file_delivery_ok"] = (files_seen is not None and files_seen >= 1)
-            parsed_features = None
-            return parsed_features, (final_text or ""), meta
-    except httpx.HTTPStatusError as he:
-        logger.exception("VLM HTTP error")
-        raise RuntimeError(f"VLM http error: {he.response.status_code} {str(he)}")
-    except Exception as e:
-        logger.exception("VLM call (httpx) failed")
-        raise RuntimeError(f"VLM call failed: {e}")
 # -----------------------
 # Gradio / LLM helper (defensive, with retry + clamps)

  - Add httpx to requirements.txt for VLM POST/GET flow
  - If VLM/LLM Spaces are private, set HF_TOKEN in the environment for authentication.
  - This variant:
+    * sends the face image to the HF Space using the POST/GET event flow (tries JSON data-uri first, then multipart fallback)
     * returns raw VLM output and meta (no VLM-side JSON extraction)
 """
 # -----------------------
 # VLM helper using HF Spaces POST/GET event flow (gradio_api/call/chat)
+# Robust: try JSON (data-uri) POST first; if 5xx, fall back to multipart/form-data file upload.
 # -----------------------
 def run_vlm_and_get_features(face_path: str, eye_path: Optional[str] = None, prompt: Optional[str] = None,
                              raise_on_file_delivery_failure: bool = False
       1) POST -> returns an EVENT_ID
       2) GET /gradio_api/call/chat/{EVENT_ID} -> fetch result
+    Behavior:
+      - Try JSON payload with data URI (fast path)
+      - If JSON POST yields server error (5xx), retry with multipart/form-data attaching the face image
+      - Poll GET endpoint a few times for result
+      - Return (parsed_features_or_None, raw_text, meta)
+      - parsed_features is None (we avoid parsing JSON here)
     """
     prompt = prompt or DEFAULT_VLM_PROMPT
     if not os.path.exists(face_path):
         raise FileNotFoundError(f"Face image not found at: {face_path}")
     with open(face_path, "rb") as f:
         face_bytes = f.read()
     if not face_bytes:
         raise ValueError("Face image is empty (0 bytes)")
     face_b64 = base64.b64encode(face_bytes).decode("ascii")
     face_data_uri = f"data:image/jpeg;base64,{face_b64}"
+    payload_json = {
         "data": [
             {
                 "text": prompt,
     if SPACE_HOST:
         base_url = SPACE_HOST.rstrip("/")
     else:
+        # Many public HF spaces map to {owner}-{space}.hf.space when used in hostnames.
         if "/" in GRADIO_VLM_SPACE:
             base_url = f"https://{GRADIO_VLM_SPACE.replace('/', '-')}.hf.space"
         else:
     post_url = f"{base_url}/gradio_api/call/chat"
     get_url_template = f"{base_url}/gradio_api/call/chat/{{event_id}}"
+    headers_json = {"Content-Type": "application/json"}
     if HF_TOKEN:
+        headers_json["Authorization"] = f"Bearer {HF_TOKEN}"
     meta: Dict[str, Any] = {
         "vlm_file_delivery_ok": False,
         "vlm_files_seen": None,
         "vlm_raw_len": 0,
         "vlm_out_object": None,
+        "post_url": post_url,
+        "attempts": []
     }
+    def _extract_event_id(resp_text: str, resp_json: Optional[Dict[str, Any]]) -> Optional[str]:
+        if isinstance(resp_json, dict):
+            for k in ("event_id", "id", "job"):
+                if k in resp_json and resp_json[k]:
+                    return resp_json[k]
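+        # Fallbacks for bodies without a usable JSON key: first take any quoted token of
+        # 8+ characters; failing that, take the 4th quote-delimited field (the awk-style pick below).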
+        m = re.search(r'"([^"]{8,})"', resp_text or "")
+        if m:
+            return m.group(1)
+        parts = re.split(r'"', resp_text or "")
+        if len(parts) >= 5:
+            candidate = parts[3].strip()
+            if candidate:
+                return candidate
+        return None
+    with httpx.Client(timeout=30.0) as client:
+        # Attempt 1: JSON data-uri POST
+        try:
+            logger.info("VLM POST (JSON data-uri) -> %s (prompt len=%d)", post_url, len(prompt))
+            resp = client.post(post_url, headers=headers_json, json=payload_json)
+            resp.raise_for_status()
+            meta["attempts"].append({"mode": "json", "status_code": resp.status_code})
             try:
+                resp_json = resp.json()
             except Exception:
+                resp_json = None
+            event_id = _extract_event_id(resp.text, resp_json)
             if not event_id:
+                raise RuntimeError(f"Failed to obtain EVENT_ID from VLM POST (json) response: {resp.text[:1000]}")
             meta["event_id"] = event_id
+        except httpx.HTTPStatusError as he:
+            # Log attempt and fallback to multipart if server-side error
+            status = he.response.status_code if he.response is not None else None
+            body_excerpt = (he.response.text[:1000] if he.response is not None else str(he))
+            logger.warning("VLM JSON POST failed (status=%s). Response excerpt: %s", status, body_excerpt[:400])
+            meta["attempts"].append({"mode": "json", "status_code": status, "error": body_excerpt})
+            if status is None or 500 <= status < 600:
+                # Try multipart fallback
+                try:
+                    logger.info("Attempting multipart/form-data fallback to %s", post_url)
+                    # Some Spaces expect 'data' field to be JSON array describing inputs and files to be referenced.
+                    # We'll send 'data' as JSON string with a placeholder for file indices, and attach the file in 'file' part.
+                    data_field = json.dumps([{"text": prompt, "files": [None]}])
+                    files = {
+                        "data": (None, data_field, "application/json"),
+                        "file": (os.path.basename(face_path), face_bytes, "image/jpeg")
+                    }
+                    # Authorization header only; content-type will be set by httpx for multipart
+                    headers_mp = {}
+                    if HF_TOKEN:
+                        headers_mp["Authorization"] = f"Bearer {HF_TOKEN}"
+                    resp2 = client.post(post_url, headers=headers_mp, files=files)
+                    resp2.raise_for_status()
+                    meta["attempts"].append({"mode": "multipart", "status_code": resp2.status_code})
+                    try:
+                        resp2_json = resp2.json()
+                    except Exception:
+                        resp2_json = None
+                    event_id = _extract_event_id(resp2.text, resp2_json)
+                    if not event_id:
+                        raise RuntimeError(f"Failed to obtain EVENT_ID from VLM POST (multipart) response: {resp2.text[:1000]}")
+                    meta["event_id"] = event_id
+                except Exception as e_mp:
+                    logger.exception("Multipart fallback failed")
+                    meta["attempts"].append({"mode": "multipart", "error": str(e_mp)})
+                    raise RuntimeError(f"VLM POST failed (json then multipart): {body_excerpt[:1000]} | multipart error: {str(e_mp)}")
+            else:
+                # Non-5xx error — surface it
+                raise RuntimeError(f"VLM POST failed with status {status}: {body_excerpt[:1000]}")
+        except Exception as e:
+            logger.exception("VLM POST unexpected failure")
+            meta["attempts"].append({"mode": "json", "error": str(e)})
+            raise RuntimeError(f"VLM POST failed: {e}")
+        # If we have event_id, poll GET endpoint for result
+        event_id = meta.get("event_id")
+        if not event_id:
+            raise RuntimeError("No event_id obtained from VLM POST (unexpected)")
+        get_url = get_url_template.format(event_id=event_id)
+        logger.info("Polling VLM event result at %s", get_url)
+        max_polls = 8
+        poll_delay = 0.5
+        final_text = ""
+        last_response_json = None
+        for attempt in range(max_polls):
+            try:
+                r2 = client.get(get_url, timeout=30.0)
+            except Exception as e_get:
+                logger.warning("GET attempt %d failed: %s", attempt + 1, str(e_get))
+                time.sleep(poll_delay)
+                continue
+            if r2.status_code == 204 or not (r2.text and r2.text.strip()):
+                time.sleep(poll_delay)
+                continue
             try:
+                r2j = r2.json()
+                last_response_json = r2j
             except Exception:
+                r2j = None
+            text_out = ""
+            if isinstance(r2j, dict):
+                if "data" in r2j and isinstance(r2j["data"], list) and len(r2j["data"]) > 0:
+                    first = r2j["data"][0]
+                    if isinstance(first, dict):
+                        text_out = first.get("text") or first.get("output") or json.dumps(first)
+                    elif isinstance(first, str):
+                        text_out = first
+                text_out = text_out or r2j.get("text") or r2j.get("msg") or r2j.get("output", "") or ""
+            else:
+                text_out = r2.text or ""
+            if text_out and text_out.strip():
+                final_text = text_out
+                meta["attempts"].append({"mode": "get", "status_code": r2.status_code})
+                break
+            else:
+                time.sleep(poll_delay)
+                continue
+        if not final_text:
+            final_text = (r2.text or "").strip()
+            meta["attempts"].append({"mode": "get_last", "status_code": r2.status_code if 'r2' in locals() and r2 is not None else None, "raw": final_text[:500]})
+        meta["vlm_raw_len"] = len(final_text)
+        meta["vlm_out_object"] = (final_text[:2000] + "...") if len(final_text) > 2000 else final_text
+        # Best-effort: detect whether server mentions receiving a file
+        files_seen = None
+        try:
+            if isinstance(last_response_json, dict):
+                for key in ("files", "output_files", "files_sent", "uploaded_files", "received_files"):
+                    if key in last_response_json and isinstance(last_response_json[key], (list, tuple)):
+                        files_seen = len(last_response_json[key])
+                        break
+            if files_seen is None and final_text:
+                ext_matches = re.findall(r"\.(?:jpg|jpeg|png|bmp|gif)\b", final_text, flags=re.IGNORECASE)
+                if ext_matches:
+                    files_seen = len(ext_matches)
+                else:
+                    matches = re.findall(r"\b(?:uploaded|received|file)\b", final_text, flags=re.IGNORECASE)
+                    if matches:
+                        files_seen = max(1, len(matches))
+        except Exception:
+            files_seen = None
+        meta["vlm_files_seen"] = files_seen
+        meta["vlm_file_delivery_ok"] = (files_seen is not None and files_seen >= 1)
+        parsed_features = None
+        return parsed_features, (final_text or ""), meta
 # -----------------------
 # Gradio / LLM helper (defensive, with retry + clamps)