Spaces:
Runtime error
Runtime error
| """ | |
| Gemini client wrapper that mimics OpenAI client interface for MarkItDown compatibility. | |
| This allows us to use Gemini Flash 2.5 for image processing in MarkItDown. | |
| """ | |
| import logging | |
| import base64 | |
| from typing import List, Dict, Any, Optional | |
| from pathlib import Path | |
| try: | |
| from google import genai | |
| HAS_GEMINI = True | |
| except ImportError: | |
| HAS_GEMINI = False | |
| from src.core.config import config | |
| from src.core.logging_config import get_logger | |
| logger = get_logger(__name__) | |
| class GeminiChatCompletions: | |
| """Chat completions interface that mimics OpenAI's chat.completions API.""" | |
| def __init__(self, client): | |
| self.client = client | |
| def create(self, model: str, messages: List[Dict[str, Any]], **kwargs) -> 'GeminiResponse': | |
| """Create a chat completion that mimics OpenAI's API.""" | |
| if not messages: | |
| raise ValueError("Messages cannot be empty") | |
| # Extract the user message (MarkItDown sends a single user message with text + image) | |
| user_message = None | |
| for msg in messages: | |
| if msg.get("role") == "user": | |
| user_message = msg | |
| break | |
| if not user_message: | |
| raise ValueError("No user message found") | |
| content = user_message.get("content", []) | |
| if not isinstance(content, list): | |
| content = [{"type": "text", "text": str(content)}] | |
| # Extract text prompt and image | |
| text_prompt = "" | |
| image_data = None | |
| for item in content: | |
| if item.get("type") == "text": | |
| text_prompt = item.get("text", "") | |
| elif item.get("type") == "image_url": | |
| image_url = item.get("image_url", {}).get("url", "") | |
| if image_url.startswith("data:image/"): | |
| # Extract base64 data from data URI | |
| try: | |
| header, data = image_url.split(",", 1) | |
| image_data = base64.b64decode(data) | |
| except Exception as e: | |
| logger.error(f"Failed to decode image data: {e}") | |
| raise ValueError("Invalid image data URI") | |
| if not text_prompt: | |
| text_prompt = "Describe this image in detail." | |
| if not image_data: | |
| raise ValueError("No image data found in request") | |
| try: | |
| # Use Gemini to process the image | |
| response = self.client.models.generate_content( | |
| model=config.model.gemini_model, | |
| contents=[ | |
| { | |
| "parts": [ | |
| {"text": text_prompt}, | |
| { | |
| "inline_data": { | |
| "mime_type": "image/jpeg", # Assume JPEG for now | |
| "data": base64.b64encode(image_data).decode() | |
| } | |
| } | |
| ] | |
| } | |
| ], | |
| config={ | |
| "temperature": config.model.temperature, | |
| "max_output_tokens": 1024, # Reasonable limit for image descriptions | |
| } | |
| ) | |
| # Extract text from Gemini response | |
| response_text = "" | |
| if hasattr(response, "text") and response.text: | |
| response_text = response.text | |
| elif hasattr(response, "candidates") and response.candidates: | |
| candidate = response.candidates[0] | |
| if hasattr(candidate, "content") and candidate.content: | |
| if hasattr(candidate.content, "parts") and candidate.content.parts: | |
| response_text = candidate.content.parts[0].text | |
| if not response_text: | |
| logger.warning("Empty response from Gemini, using fallback") | |
| response_text = "Image processing completed but no description generated." | |
| return GeminiResponse(response_text) | |
| except Exception as e: | |
| logger.error(f"Gemini API error: {str(e)}") | |
| # Return a fallback response to avoid breaking MarkItDown | |
| return GeminiResponse(f"Image description unavailable due to processing error: {str(e)}") | |
class GeminiMessage:
    """Stand-in for OpenAI's Message object: exposes a ``.content`` string."""

    def __init__(self, content: str):
        self.content = content


class GeminiChoice:
    """Stand-in for OpenAI's Choice object: wraps a single message."""

    def __init__(self, content: str):
        self.message = GeminiMessage(content)


class GeminiResponse:
    """Stand-in for OpenAI's ChatCompletion response: one choice, one message."""

    def __init__(self, content: str):
        self.choices = [GeminiChoice(content)]
class GeminiClientWrapper:
    """
    Gemini client wrapper that mimics OpenAI client interface for MarkItDown.

    MarkItDown only touches the ``.chat`` surface, so exposing a chat shim on
    top of a real google-genai client is enough to pass this object wherever
    an OpenAI client is expected.
    """

    def __init__(self, api_key: Optional[str] = None):
        # Fail fast when the optional google-genai dependency is absent.
        if not HAS_GEMINI:
            raise ImportError("google-genai package is required for Gemini support")
        # Explicit argument wins; otherwise fall back to configured key.
        key = api_key if api_key else config.api.google_api_key
        if not key:
            raise ValueError("Google API key is required for Gemini client")
        self.client = genai.Client(api_key=key)
        self.chat = GeminiChatCompletions(self.client)
        logger.info("Gemini client wrapper initialized for MarkItDown compatibility")

    def completions(self):
        """Alias for chat to match some OpenAI client patterns."""
        return self.chat
def create_gemini_client_for_markitdown() -> Optional[GeminiClientWrapper]:
    """
    Create a Gemini client wrapper for use with MarkItDown.

    Returns:
        GeminiClientWrapper if Gemini is available and configured, None otherwise
        (the reason is logged as a warning or error in each failure case).
    """
    wrapper = None
    if not HAS_GEMINI:
        logger.warning("Gemini not available for MarkItDown image processing")
    elif not config.api.google_api_key:
        logger.warning("No Google API key found for MarkItDown image processing")
    else:
        try:
            wrapper = GeminiClientWrapper()
        except Exception as e:
            # Never propagate: callers treat None as "no image processing".
            logger.error(f"Failed to create Gemini client for MarkItDown: {e}")
    return wrapper
# For testing purposes
if __name__ == "__main__":
    # Manual smoke test: attempt to build the wrapper and report the outcome.
    # (The status-marker characters in these messages are reproduced from the
    # original source as-is.)
    try:
        wrapper = create_gemini_client_for_markitdown()
        if wrapper is not None:
            print("β Gemini client wrapper created successfully")
            print("β Ready for MarkItDown integration")
        else:
            print("β Failed to create Gemini client wrapper")
    except Exception as e:
        print(f"β Error: {e}")