Spaces:
Paused
Paused
| import type { ImageContent } from "@mariozechner/pi-ai"; | |
| import fs from "node:fs/promises"; | |
| import path from "node:path"; | |
| import { fileURLToPath } from "node:url"; | |
| import { extractTextFromMessage } from "../../../tui/tui-formatters.js"; | |
| import { resolveUserPath } from "../../../utils.js"; | |
| import { loadWebMedia } from "../../../web/media.js"; | |
| import { assertSandboxPath } from "../../sandbox-paths.js"; | |
| import { sanitizeImageBlocks } from "../../tool-images.js"; | |
| import { log } from "../logger.js"; | |
| /** | |
| * Common image file extensions for detection. | |
| */ | |
| const IMAGE_EXTENSIONS = new Set([ | |
| ".png", | |
| ".jpg", | |
| ".jpeg", | |
| ".gif", | |
| ".webp", | |
| ".bmp", | |
| ".tiff", | |
| ".tif", | |
| ".heic", | |
| ".heif", | |
| ]); | |
/**
 * Result of detecting an image reference in text.
 *
 * Produced by detectImageReferences / detectImagesFromHistory and consumed
 * by loadImageFromRef.
 */
export interface DetectedImageRef {
	/** The raw matched string from the prompt */
	raw: string;
	/**
	 * The type of reference (path or url).
	 * Note: detection in this file only ever emits "path" (file:// URLs are
	 * converted to paths); "url" refs are rejected by the loader (local-only).
	 */
	type: "path" | "url";
	/** The resolved/normalized path or URL (e.g. after ~ expansion) */
	resolved: string;
	/** Index of the message this ref was found in (for history images) */
	messageIndex?: number;
}
| /** | |
| * Checks if a file extension indicates an image file. | |
| */ | |
| function isImageExtension(filePath: string): boolean { | |
| const ext = path.extname(filePath).toLowerCase(); | |
| return IMAGE_EXTENSIONS.has(ext); | |
| } | |
| async function sanitizeImagesWithLog( | |
| images: ImageContent[], | |
| label: string, | |
| ): Promise<ImageContent[]> { | |
| const { images: sanitized, dropped } = await sanitizeImageBlocks(images, label); | |
| if (dropped > 0) { | |
| log.warn(`Native image: dropped ${dropped} image(s) after sanitization (${label}).`); | |
| } | |
| return sanitized; | |
| } | |
/**
 * Detects image references in a user prompt.
 *
 * Patterns detected:
 * - Absolute paths: /path/to/image.png
 * - Relative paths: ./image.png, ../images/photo.jpg
 * - Home paths: ~/Pictures/screenshot.png
 * - file:// URLs: file:///path/to/image.png
 * - Message attachments: [Image: source: /path/to/image.jpg]
 *
 * Remote http(s) URLs are intentionally NOT detected (local-only injection).
 * Results are deduplicated case-insensitively on the raw matched string.
 *
 * @param prompt The user prompt text to scan
 * @returns Array of detected image references (messageIndex is left unset here)
 */
export function detectImageReferences(prompt: string): DetectedImageRef[] {
	const refs: DetectedImageRef[] = [];
	// Case-insensitive dedup keyed on the trimmed raw match (or raw file:// URL).
	const seen = new Set<string>();
	// Helper to add a path ref; silently skips empty, duplicate, remote-URL,
	// and non-image strings.
	const addPathRef = (raw: string) => {
		const trimmed = raw.trim();
		if (!trimmed || seen.has(trimmed.toLowerCase())) {
			return;
		}
		// Remote URLs are rejected by design (local-only injection).
		if (trimmed.startsWith("http://") || trimmed.startsWith("https://")) {
			return;
		}
		if (!isImageExtension(trimmed)) {
			return;
		}
		seen.add(trimmed.toLowerCase());
		// Expand ~ to the user's home dir now; other relative paths are
		// resolved later by the loader against the workspace/sandbox root.
		const resolved = trimmed.startsWith("~") ? resolveUserPath(trimmed) : trimmed;
		refs.push({ raw: trimmed, type: "path", resolved });
	};
	// Pattern for [media attached: path (type) | url] or [media attached N/M: path (type) | url] format
	// Each bracket = ONE file. The | separates path from URL, not multiple files.
	// Multi-file format uses separate brackets on separate lines.
	const mediaAttachedPattern = /\[media attached(?:\s+\d+\/\d+)?:\s*([^\]]+)\]/gi;
	let match: RegExpExecArray | null;
	while ((match = mediaAttachedPattern.exec(prompt)) !== null) {
		const content = match[1];
		// Skip "[media attached: N files]" header lines
		if (/^\d+\s+files?$/i.test(content.trim())) {
			continue;
		}
		// Extract path before the (mime/type) or | delimiter
		// Format is: path (type) | url OR just: path (type)
		// Path may contain spaces (e.g., "ChatGPT Image Apr 21.png")
		// Use non-greedy .+? to stop at first image extension
		const pathMatch = content.match(
			/^\s*(.+?\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif))\s*(?:\(|$|\|)/i,
		);
		if (pathMatch?.[1]) {
			addPathRef(pathMatch[1].trim());
		}
	}
	// Pattern for [Image: source: /path/...] format from messaging systems
	const messageImagePattern =
		/\[Image:\s*source:\s*([^\]]+\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif))\]/gi;
	while ((match = messageImagePattern.exec(prompt)) !== null) {
		const raw = match[1]?.trim();
		if (raw) {
			addPathRef(raw);
		}
	}
	// Remote HTTP(S) URLs are intentionally ignored. Native image injection is local-only.
	// Pattern for file:// URLs - treat as paths since loadWebMedia handles them
	const fileUrlPattern = /file:\/\/[^\s<>"'`\]]+\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif)/gi;
	while ((match = fileUrlPattern.exec(prompt)) !== null) {
		const raw = match[0];
		// Dedup on the raw URL; note this will NOT collapse a file:// URL with
		// its equivalent bare path — the loader's resolved-path dedup handles that.
		if (seen.has(raw.toLowerCase())) {
			continue;
		}
		seen.add(raw.toLowerCase());
		// Use fileURLToPath for proper handling (e.g., file://localhost/path)
		try {
			const resolved = fileURLToPath(raw);
			refs.push({ raw, type: "path", resolved });
		} catch {
			// Skip malformed file:// URLs
		}
	}
	// Pattern for file paths (absolute, relative, or home)
	// Matches:
	// - /absolute/path/to/file.ext (including paths with special chars like Messages/Attachments)
	// - ./relative/path.ext
	// - ../parent/path.ext
	// - ~/home/path.ext
	const pathPattern =
		/(?:^|\s|["'`(])((\.\.?\/|[~/])[^\s"'`()[\]]*\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif))/gi;
	while ((match = pathPattern.exec(prompt)) !== null) {
		// Use capture group 1 (the path without delimiter prefix); skip if undefined
		if (match[1]) {
			addPathRef(match[1]);
		}
	}
	return refs;
}
| /** | |
| * Loads an image from a file path or URL and returns it as ImageContent. | |
| * | |
| * @param ref The detected image reference | |
| * @param workspaceDir The current workspace directory for resolving relative paths | |
| * @param options Optional settings for sandbox and size limits | |
| * @returns The loaded image content, or null if loading failed | |
| */ | |
| export async function loadImageFromRef( | |
| ref: DetectedImageRef, | |
| workspaceDir: string, | |
| options?: { | |
| maxBytes?: number; | |
| /** If set, enforce that file paths are within this sandbox root */ | |
| sandboxRoot?: string; | |
| }, | |
| ): Promise<ImageContent | null> { | |
| try { | |
| let targetPath = ref.resolved; | |
| // Remote URL loading is disabled (local-only). | |
| if (ref.type === "url") { | |
| log.debug(`Native image: rejecting remote URL (local-only): ${ref.resolved}`); | |
| return null; | |
| } | |
| // For file paths, resolve relative to the appropriate root: | |
| // - When sandbox is enabled, resolve relative to sandboxRoot for security | |
| // - Otherwise, resolve relative to workspaceDir | |
| // Note: ref.resolved may already be absolute (e.g., after ~ expansion in detectImageReferences), | |
| // in which case we skip relative resolution. | |
| if (ref.type === "path" && !path.isAbsolute(targetPath)) { | |
| const resolveRoot = options?.sandboxRoot ?? workspaceDir; | |
| targetPath = path.resolve(resolveRoot, targetPath); | |
| } | |
| // Enforce sandbox restrictions if sandboxRoot is set | |
| if (ref.type === "path" && options?.sandboxRoot) { | |
| try { | |
| const validated = await assertSandboxPath({ | |
| filePath: targetPath, | |
| cwd: options.sandboxRoot, | |
| root: options.sandboxRoot, | |
| }); | |
| targetPath = validated.resolved; | |
| } catch (err) { | |
| // Log the actual error for debugging (sandbox violation or other path error) | |
| log.debug( | |
| `Native image: sandbox validation failed for ${ref.resolved}: ${err instanceof Error ? err.message : String(err)}`, | |
| ); | |
| return null; | |
| } | |
| } | |
| // Check file exists for local paths | |
| if (ref.type === "path") { | |
| try { | |
| await fs.stat(targetPath); | |
| } catch { | |
| log.debug(`Native image: file not found: ${targetPath}`); | |
| return null; | |
| } | |
| } | |
| // loadWebMedia handles local file paths (including file:// URLs) | |
| const media = await loadWebMedia(targetPath, options?.maxBytes); | |
| if (media.kind !== "image") { | |
| log.debug(`Native image: not an image file: ${targetPath} (got ${media.kind})`); | |
| return null; | |
| } | |
| // EXIF orientation is already normalized by loadWebMedia -> resizeToJpeg | |
| // Default to JPEG since optimization converts images to JPEG format | |
| const mimeType = media.contentType ?? "image/jpeg"; | |
| const data = media.buffer.toString("base64"); | |
| return { type: "image", data, mimeType }; | |
| } catch (err) { | |
| // Log the actual error for debugging (size limits, network failures, etc.) | |
| log.debug( | |
| `Native image: failed to load ${ref.resolved}: ${err instanceof Error ? err.message : String(err)}`, | |
| ); | |
| return null; | |
| } | |
| } | |
| /** | |
| * Checks if a model supports image input based on its input capabilities. | |
| * | |
| * @param model The model object with input capability array | |
| * @returns True if the model supports image input | |
| */ | |
| export function modelSupportsImages(model: { input?: string[] }): boolean { | |
| return model.input?.includes("image") ?? false; | |
| } | |
| /** | |
| * Extracts image references from conversation history messages. | |
| * Scans user messages for image paths/URLs that can be loaded. | |
| * Each ref includes the messageIndex so images can be injected at their original location. | |
| * | |
| * Note: Global deduplication is intentional - if the same image appears in multiple | |
| * messages, we only inject it at the FIRST occurrence. This is sufficient because: | |
| * 1. The model sees all message content including the image | |
| * 2. Later references to "the image" or "that picture" will work since it's in context | |
| * 3. Injecting duplicates would waste tokens and potentially hit size limits | |
| */ | |
| function detectImagesFromHistory(messages: unknown[]): DetectedImageRef[] { | |
| const allRefs: DetectedImageRef[] = []; | |
| const seen = new Set<string>(); | |
| const messageHasImageContent = (msg: unknown): boolean => { | |
| if (!msg || typeof msg !== "object") { | |
| return false; | |
| } | |
| const content = (msg as { content?: unknown }).content; | |
| if (!Array.isArray(content)) { | |
| return false; | |
| } | |
| return content.some( | |
| (part) => | |
| part != null && typeof part === "object" && (part as { type?: string }).type === "image", | |
| ); | |
| }; | |
| for (let i = 0; i < messages.length; i++) { | |
| const msg = messages[i]; | |
| if (!msg || typeof msg !== "object") { | |
| continue; | |
| } | |
| const message = msg as { role?: string }; | |
| // Only scan user messages for image references | |
| if (message.role !== "user") { | |
| continue; | |
| } | |
| // Skip if message already has image content (prevents reloading each turn) | |
| if (messageHasImageContent(msg)) { | |
| continue; | |
| } | |
| const text = extractTextFromMessage(msg); | |
| if (!text) { | |
| continue; | |
| } | |
| const refs = detectImageReferences(text); | |
| for (const ref of refs) { | |
| const key = ref.resolved.toLowerCase(); | |
| if (seen.has(key)) { | |
| continue; | |
| } | |
| seen.add(key); | |
| allRefs.push({ ...ref, messageIndex: i }); | |
| } | |
| } | |
| return allRefs; | |
| } | |
/**
 * Detects and loads images referenced in a prompt for models with vision capability.
 *
 * This function scans the prompt for image references (file paths and URLs),
 * loads them, and returns them as ImageContent array ready to be passed to
 * the model's prompt method.
 *
 * Also scans conversation history for images from previous turns and returns
 * them mapped by message index so they can be injected at their original location.
 *
 * @param params Configuration for image detection and loading
 * @returns Object with loaded images for current prompt and history images by message index
 */
export async function detectAndLoadPromptImages(params: {
	/** Current user prompt text to scan for references */
	prompt: string;
	/** Workspace directory used when resolving relative paths */
	workspaceDir: string;
	/** Model descriptor; loading is skipped unless "image" is in `input` */
	model: { input?: string[] };
	/** Images already attached to the prompt (merged with detected ones) */
	existingImages?: ImageContent[];
	/** Prior conversation messages to scan for historical references */
	historyMessages?: unknown[];
	/** Per-image size limit forwarded to the loader */
	maxBytes?: number;
	/** If set, enforce that file paths are within this sandbox root */
	sandboxRoot?: string;
}): Promise<{
	/** Images for the current prompt (existingImages + detected in current prompt) */
	images: ImageContent[];
	/** Images from history messages, keyed by message index */
	historyImagesByIndex: Map<number, ImageContent[]>;
	/** All refs considered (prompt refs plus deduplicated history refs) */
	detectedRefs: DetectedImageRef[];
	/** Number of refs that loaded successfully */
	loadedCount: number;
	/** Number of refs that were rejected or failed to load */
	skippedCount: number;
}> {
	// If model doesn't support images, return empty results
	// NOTE(review): existingImages are dropped entirely on this branch —
	// presumably intentional for non-vision models, but worth confirming.
	if (!modelSupportsImages(params.model)) {
		return {
			images: [],
			historyImagesByIndex: new Map(),
			detectedRefs: [],
			loadedCount: 0,
			skippedCount: 0,
		};
	}
	// Detect images from current prompt
	const promptRefs = detectImageReferences(params.prompt);
	// Detect images from conversation history (with message indices)
	const historyRefs = params.historyMessages ? detectImagesFromHistory(params.historyMessages) : [];
	// Deduplicate: if an image is in the current prompt, don't also load it from history.
	// Current prompt images are passed via the `images` parameter to prompt(), while history
	// images are injected into their original message positions. We don't want the same
	// image loaded and sent twice (wasting tokens and potentially causing confusion).
	const seenPaths = new Set(promptRefs.map((r) => r.resolved.toLowerCase()));
	const uniqueHistoryRefs = historyRefs.filter((r) => !seenPaths.has(r.resolved.toLowerCase()));
	const allRefs = [...promptRefs, ...uniqueHistoryRefs];
	if (allRefs.length === 0) {
		// NOTE(review): existingImages bypass sanitization on this early-return path,
		// while the main path below runs them through sanitizeImagesWithLog —
		// confirm callers pre-sanitize, or that the asymmetry is intentional.
		return {
			images: params.existingImages ?? [],
			historyImagesByIndex: new Map(),
			detectedRefs: [],
			loadedCount: 0,
			skippedCount: 0,
		};
	}
	log.debug(
		`Native image: detected ${allRefs.length} image refs (${promptRefs.length} in prompt, ${uniqueHistoryRefs.length} in history)`,
	);
	// Load images for current prompt
	const promptImages: ImageContent[] = [...(params.existingImages ?? [])];
	// Load images for history, grouped by message index
	const historyImagesByIndex = new Map<number, ImageContent[]>();
	let loadedCount = 0;
	let skippedCount = 0;
	// Refs are loaded sequentially (one await per ref), not in parallel.
	for (const ref of allRefs) {
		const image = await loadImageFromRef(ref, params.workspaceDir, {
			maxBytes: params.maxBytes,
			sandboxRoot: params.sandboxRoot,
		});
		if (image) {
			if (ref.messageIndex !== undefined) {
				// History image - add to the appropriate message index
				const existing = historyImagesByIndex.get(ref.messageIndex);
				if (existing) {
					existing.push(image);
				} else {
					historyImagesByIndex.set(ref.messageIndex, [image]);
				}
			} else {
				// Current prompt image
				promptImages.push(image);
			}
			loadedCount++;
			log.debug(`Native image: loaded ${ref.type} ${ref.resolved}`);
		} else {
			skippedCount++;
		}
	}
	// Sanitize the prompt batch and each history batch; empty history batches
	// are omitted from the returned map.
	const sanitizedPromptImages = await sanitizeImagesWithLog(promptImages, "prompt:images");
	const sanitizedHistoryImagesByIndex = new Map<number, ImageContent[]>();
	for (const [index, images] of historyImagesByIndex) {
		const sanitized = await sanitizeImagesWithLog(images, `history:images:${index}`);
		if (sanitized.length > 0) {
			sanitizedHistoryImagesByIndex.set(index, sanitized);
		}
	}
	return {
		images: sanitizedPromptImages,
		historyImagesByIndex: sanitizedHistoryImagesByIndex,
		detectedRefs: allRefs,
		loadedCount,
		skippedCount,
	};
}