"""Utilities for processing file uploads (images & documents) in chat.""" from __future__ import annotations import base64 import os from typing import Any, Dict IMAGE_MIME_TYPES = { ".jpg": "image/jpeg", ".jpeg": "image/jpeg", ".png": "image/png", ".gif": "image/gif", ".webp": "image/webp", } DOCUMENT_MIME_TYPES = { ".pdf": "application/pdf", ".txt": "text/plain", ".md": "text/markdown", ".csv": "text/csv", } MAX_IMAGE_SIZE = 10 * 1024 * 1024 # 10 MB MAX_DOCUMENT_SIZE = 20 * 1024 * 1024 # 20 MB def get_file_mime_type(filename: str, content_type: str | None = None) -> str | None: """Detect MIME type from file extension, falling back to content_type header.""" if filename: ext = os.path.splitext(filename.lower())[1] if ext in IMAGE_MIME_TYPES: return IMAGE_MIME_TYPES[ext] if ext in DOCUMENT_MIME_TYPES: return DOCUMENT_MIME_TYPES[ext] return content_type def is_image(mime: str | None) -> bool: return mime is not None and mime in IMAGE_MIME_TYPES.values() def is_document(mime: str | None) -> bool: return mime is not None and mime in DOCUMENT_MIME_TYPES.values() def extract_text_from_pdf(content: bytes, max_pages: int = 50) -> str: """Extract text from a PDF using pdfplumber, with PyPDF2 fallback.""" import io # Try pdfplumber first (better table/layout extraction) try: import pdfplumber pages_text: list[str] = [] with pdfplumber.open(io.BytesIO(content)) as pdf: for i, page in enumerate(pdf.pages): if i >= max_pages: pages_text.append(f"\n[... truncated at {max_pages} pages ...]") break text = page.extract_text() if text: pages_text.append(text) result = "\n".join(pages_text) if result.strip(): return result except ImportError: pass except Exception: pass # Fallback: PyPDF2 try: from PyPDF2 import PdfReader reader = PdfReader(io.BytesIO(content)) pages_text = [] for i, page in enumerate(reader.pages): if i >= max_pages: pages_text.append(f"\n[... truncated at {max_pages} pages ...]") break text = page.extract_text() if text: pages_text.append(text) result = "\n".join(pages_text) if result.strip(): return result except ImportError: pass except Exception: pass return "[PDF text extraction failed — neither pdfplumber nor PyPDF2 available]" def extract_text_from_document(content: bytes, mime: str) -> str: """Dispatch document text extraction based on MIME type.""" if mime == "application/pdf": return extract_text_from_pdf(content) # Plain text family: txt, markdown, csv return content.decode("utf-8", errors="replace") def encode_image_for_gemini(content: bytes, mime: str) -> Dict[str, Any]: """Encode image bytes to the Gemini multimodal content block format.""" encoded = base64.b64encode(content).decode("utf-8") return { "type": "media", "data": encoded, "mime_type": mime, }