Seth0330 committed on
Commit
32c001b
·
verified ·
1 Parent(s): d0cfc3b

Update backend/app/openrouter_client.py

Browse files
Files changed (1) hide show
  1. backend/app/openrouter_client.py +17 -5
backend/app/openrouter_client.py CHANGED
@@ -334,7 +334,7 @@ async def _extract_with_hf(image_bytes: bytes, page_num: int, total_pages: int)
334
  except ImportError:
335
  raise RuntimeError("huggingface_hub not installed. Add it to requirements.txt")
336
 
337
- client = InferenceClient(model=HF_MODEL_NAME, token=HF_TOKEN)
338
 
339
  prompt = (
340
  f"Read this document page ({page_num} of {total_pages}) and extract ALL text content. "
@@ -345,15 +345,18 @@ async def _extract_with_hf(image_bytes: bytes, page_num: int, total_pages: int)
345
  print(f"[INFO] HuggingFace: Processing page {page_num} with model {HF_MODEL_NAME}")
346
 
347
  try:
348
- # HF Inference API for vision models - use image-to-text or chat completion
349
- # For vision models, we need to use the chat completion format
 
 
 
350
  result = client.chat_completion(
351
  messages=[
352
  {
353
  "role": "user",
354
  "content": [
355
  {"type": "text", "text": prompt},
356
- {"type": "image", "image": image_bytes}
357
  ]
358
  }
359
  ],
@@ -363,7 +366,16 @@ async def _extract_with_hf(image_bytes: bytes, page_num: int, total_pages: int)
363
  # Extract response text
364
  if isinstance(result, dict):
365
  if "choices" in result and len(result["choices"]) > 0:
366
- response_text = result["choices"][0].get("message", {}).get("content", "")
 
 
 
 
 
 
 
 
 
367
  else:
368
  response_text = result.get("generated_text", str(result))
369
  elif isinstance(result, str):
 
334
  except ImportError:
335
  raise RuntimeError("huggingface_hub not installed. Add it to requirements.txt")
336
 
337
+ client = InferenceClient(model=HF_MODEL_NAME, token=HF_TOKEN, timeout=180.0)
338
 
339
  prompt = (
340
  f"Read this document page ({page_num} of {total_pages}) and extract ALL text content. "
 
345
  print(f"[INFO] HuggingFace: Processing page {page_num} with model {HF_MODEL_NAME}")
346
 
347
  try:
348
+ # Convert image bytes to base64 data URL for HuggingFace API
349
+ image_base64 = base64.b64encode(image_bytes).decode('utf-8')
350
+ image_data_url = f"data:image/jpeg;base64,{image_base64}"
351
+
352
+ # HF Inference API for vision models - use chat completion with base64 image
353
  result = client.chat_completion(
354
  messages=[
355
  {
356
  "role": "user",
357
  "content": [
358
  {"type": "text", "text": prompt},
359
+ {"type": "image", "image": image_data_url} # Use base64 data URL, not raw bytes
360
  ]
361
  }
362
  ],
 
366
  # Extract response text
367
  if isinstance(result, dict):
368
  if "choices" in result and len(result["choices"]) > 0:
369
+ message = result["choices"][0].get("message", {})
370
+ if isinstance(message.get("content"), list):
371
+ # Content might be a list of content blocks
372
+ response_text = "".join(
373
+ item.get("text", "")
374
+ for item in message["content"]
375
+ if item.get("type") == "text"
376
+ )
377
+ else:
378
+ response_text = message.get("content", "")
379
  else:
380
  response_text = result.get("generated_text", str(result))
381
  elif isinstance(result, str):