Update backend/app/openrouter_client.py

backend/app/openrouter_client.py  +61 -32
@@ -23,7 +23,7 @@ MODEL_NAME = "qwen/qwen3-vl-235b-a22b-instruct"
 # HuggingFace Inference API
 HF_TOKEN = os.environ.get("HF_TOKEN")
 HF_INFERENCE_API_URL = "https://api-inference.huggingface.co/models"
-HF_MODEL_NAME = os.environ.get("HF_MODEL_NAME", "Qwen/
+HF_MODEL_NAME = os.environ.get("HF_MODEL_NAME", "Qwen/Qwen3-VL-235B-A22B-Instruct") # Default HF model
 
 # Backend selection: "openrouter" or "huggingface"
 EXTRACTION_BACKEND = os.environ.get("EXTRACTION_BACKEND", "openrouter").lower()

@@ -325,17 +325,10 @@ async def _extract_with_openrouter_single(image_bytes: bytes, page_num: int, total_pages: int) -> Dict[str, Any]:
 
 
 async def _extract_with_hf(image_bytes: bytes, page_num: int, total_pages: int) -> Dict[str, Any]:
-    """Extract from a single page using HuggingFace Inference API."""
+    """Extract from a single page using HuggingFace Inference API (standard endpoint)."""
     if not HF_TOKEN:
         raise RuntimeError("HF_TOKEN environment variable is not set")
 
-    try:
-        from huggingface_hub import InferenceClient
-    except ImportError:
-        raise RuntimeError("huggingface_hub not installed. Add it to requirements.txt")
-
-    client = InferenceClient(model=HF_MODEL_NAME, token=HF_TOKEN, timeout=180.0)
-
     prompt = (
         f"Read this document page ({page_num} of {total_pages}) and extract ALL text content. "
         "Extract every word, number, and piece of information, including any non-English text. "

@@ -345,39 +338,70 @@ async def _extract_with_hf(image_bytes: bytes, page_num: int, total_pages: int) -> Dict[str, Any]:
     print(f"[INFO] HuggingFace: Processing page {page_num} with model {HF_MODEL_NAME}")
 
     try:
-        #
+        # Use standard HuggingFace Inference API endpoint (not chat_completion/router)
+        api_url = f"{HF_INFERENCE_API_URL}/{HF_MODEL_NAME}"
+
+        # Convert image to base64
         image_base64 = base64.b64encode(image_bytes).decode('utf-8')
-        image_data_url = f"data:image/jpeg;base64,{image_base64}"
 
-        #
+        # For Qwen3-VL models, use the chat format through standard API
+        # The standard API accepts chat-completion format for compatible models
+        payload = {
+            "inputs": {
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": [
+                            {"type": "text", "text": prompt},
+                            {"type": "image", "image": f"data:image/jpeg;base64,{image_base64}"}
+                        ]
+                    }
+                ]
+            },
+            "parameters": {
+                "max_new_tokens": 2048,
+                "temperature": 0.1
+            }
+        }
 
+        headers = {
+            "Authorization": f"Bearer {HF_TOKEN}",
+            "Content-Type": "application/json"
+        }
+
+        timeout = httpx.Timeout(180.0, connect=30.0)
+        async with httpx.AsyncClient(timeout=timeout) as client:
+            print(f"[INFO] Making POST request to {api_url}...")
+            resp = await client.post(api_url, headers=headers, json=payload)
+            print(f"[INFO] Received response: Status {resp.status_code}")
+            resp.raise_for_status()
+            result = resp.json()
+
+        # Extract response text - format depends on model response
+        response_text = None
+        if isinstance(result, list) and len(result) > 0:
+            # Standard API often returns list with generated_text
+            response_text = result[0].get("generated_text", str(result[0]))
+        elif isinstance(result, dict):
+            # Check for different response formats
+            if "generated_text" in result:
+                response_text = result["generated_text"]
+            elif "text" in result:
+                response_text = result["text"]
+            elif "choices" in result and len(result["choices"]) > 0:
+                # Chat completion format
                 message = result["choices"][0].get("message", {})
+                content = message.get("content", "")
+                if isinstance(content, list):
                     response_text = "".join(
                         item.get("text", "")
-                        for item in
+                        for item in content
                         if item.get("type") == "text"
                     )
                 else:
-                    response_text =
+                    response_text = content
             else:
-                response_text =
+                response_text = str(result)
         elif isinstance(result, str):
             response_text = result
         else:

@@ -385,8 +409,13 @@ async def _extract_with_hf(image_bytes: bytes, page_num: int, total_pages: int) -> Dict[str, Any]:
 
         if not response_text:
             raise ValueError("Empty response from HuggingFace API")
+
+        print(f"[DEBUG] HuggingFace response preview: {response_text[:500]}")
 
         return _parse_model_response(response_text, page_num)
+    except httpx.HTTPStatusError as e:
+        print(f"[ERROR] HuggingFace API HTTP error: {e.response.status_code} - {e.response.text[:500]}")
+        raise RuntimeError(f"HuggingFace API error for page {page_num}: {e.response.status_code} - {str(e)}")
     except Exception as e:
         print(f"[ERROR] HuggingFace API error details: {type(e).__name__}: {str(e)}")
         raise RuntimeError(f"HuggingFace API error for page {page_num}: {str(e)}")
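For a quick local check of the new HuggingFace path, a minimal harness along these lines should work; it is a sketch, not part of the commit. The import path backend.app.openrouter_client is assumed from the file's location, page.jpg is a placeholder for any rendered page image, and HF_TOKEN must already be exported in the shell:

import asyncio
import os

# Assumed configuration; EXTRACTION_BACKEND and HF_MODEL_NAME are the env vars
# the module reads at import time. HF_TOKEN must already be set in the shell.
os.environ["EXTRACTION_BACKEND"] = "huggingface"
os.environ.setdefault("HF_MODEL_NAME", "Qwen/Qwen3-VL-235B-A22B-Instruct")

from backend.app import openrouter_client  # assumed import path


async def main() -> None:
    # "page.jpg" stands in for any rendered document page image.
    with open("page.jpg", "rb") as f:
        image_bytes = f.read()
    # _extract_with_hf raises RuntimeError if HF_TOKEN is missing or the API errors.
    result = await openrouter_client._extract_with_hf(image_bytes, page_num=1, total_pages=1)
    print(result)


if __name__ == "__main__":
    asyncio.run(main())

Setting the variables before the import matters because HF_TOKEN, HF_MODEL_NAME, and EXTRACTION_BACKEND are all read once at module load.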