Spaces:
Paused
Paused
| import type { ImageContent } from "@mariozechner/pi-ai"; | |
| import fs from "node:fs/promises"; | |
| import path from "node:path"; | |
| import { fileURLToPath } from "node:url"; | |
| import { extractTextFromMessage } from "../../../tui/tui-formatters.js"; | |
| import { resolveUserPath } from "../../../utils.js"; | |
| import { loadWebMedia } from "../../../web/media.js"; | |
| import { assertSandboxPath } from "../../sandbox-paths.js"; | |
| import { sanitizeImageBlocks } from "../../tool-images.js"; | |
| import { log } from "../logger.js"; | |
| /** | |
| * Common image file extensions for detection. | |
| */ | |
| const IMAGE_EXTENSIONS = new Set([ | |
| ".png", | |
| ".jpg", | |
| ".jpeg", | |
| ".gif", | |
| ".webp", | |
| ".bmp", | |
| ".tiff", | |
| ".tif", | |
| ".heic", | |
| ".heif", | |
| ]); | |
/**
 * Result of detecting an image reference in text.
 *
 * Produced by detectImageReferences / detectImagesFromHistory and consumed
 * by loadImageFromRef.
 */
export interface DetectedImageRef {
	/** The raw matched string from the prompt */
	raw: string;
	/**
	 * The type of reference (path or url).
	 * Note: detection in this file only ever emits "path" (file:// URLs are
	 * converted to paths); "url" refs are rejected by the loader (local-only).
	 */
	type: "path" | "url";
	/** The resolved/normalized path or URL (e.g. after ~ expansion) */
	resolved: string;
	/** Index of the message this ref was found in (for history images) */
	messageIndex?: number;
}
| /** | |
| * Checks if a file extension indicates an image file. | |
| */ | |
| function isImageExtension(filePath: string): boolean { | |
| const ext = path.extname(filePath).toLowerCase(); | |
| return IMAGE_EXTENSIONS.has(ext); | |
| } | |
| async function sanitizeImagesWithLog( | |
| images: ImageContent[], | |
| label: string, | |
| ): Promise<ImageContent[]> { | |
| const { images: sanitized, dropped } = await sanitizeImageBlocks(images, label); | |
| if (dropped > 0) { | |
| log.warn(`Native image: dropped ${dropped} image(s) after sanitization (${label}).`); | |
| } | |
| return sanitized; | |
| } | |
/**
 * Detects image references in a user prompt.
 *
 * Patterns detected:
 * - Absolute paths: /path/to/image.png
 * - Relative paths: ./image.png, ../images/photo.jpg
 * - Home paths: ~/Pictures/screenshot.png
 * - file:// URLs: file:///path/to/image.png
 * - Message attachments: [Image: source: /path/to/image.jpg]
 *
 * Remote http(s) URLs are intentionally NOT detected (local-only injection).
 * Results are deduplicated case-insensitively on the raw matched string.
 *
 * @param prompt The user prompt text to scan
 * @returns Array of detected image references (messageIndex is left unset here)
 */
export function detectImageReferences(prompt: string): DetectedImageRef[] {
	const refs: DetectedImageRef[] = [];
	// Case-insensitive dedup keyed on the trimmed raw match (or raw file:// URL).
	const seen = new Set<string>();
	// Helper to add a path ref; silently skips empty, duplicate, remote-URL,
	// and non-image strings.
	const addPathRef = (raw: string) => {
		const trimmed = raw.trim();
		if (!trimmed || seen.has(trimmed.toLowerCase())) {
			return;
		}
		// Remote URLs are rejected by design (local-only injection).
		if (trimmed.startsWith("http://") || trimmed.startsWith("https://")) {
			return;
		}
		if (!isImageExtension(trimmed)) {
			return;
		}
		seen.add(trimmed.toLowerCase());
		// Expand ~ to the user's home dir now; other relative paths are
		// resolved later by the loader against the workspace/sandbox root.
		const resolved = trimmed.startsWith("~") ? resolveUserPath(trimmed) : trimmed;
		refs.push({ raw: trimmed, type: "path", resolved });
	};
	// Pattern for [media attached: path (type) | url] or [media attached N/M: path (type) | url] format
	// Each bracket = ONE file. The | separates path from URL, not multiple files.
	// Multi-file format uses separate brackets on separate lines.
	const mediaAttachedPattern = /\[media attached(?:\s+\d+\/\d+)?:\s*([^\]]+)\]/gi;
	let match: RegExpExecArray | null;
	while ((match = mediaAttachedPattern.exec(prompt)) !== null) {
		const content = match[1];
		// Skip "[media attached: N files]" header lines
		if (/^\d+\s+files?$/i.test(content.trim())) {
			continue;
		}
		// Extract path before the (mime/type) or | delimiter
		// Format is: path (type) | url OR just: path (type)
		// Path may contain spaces (e.g., "ChatGPT Image Apr 21.png")
		// Use non-greedy .+? to stop at first image extension
		const pathMatch = content.match(
			/^\s*(.+?\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif))\s*(?:\(|$|\|)/i,
		);
		if (pathMatch?.[1]) {
			addPathRef(pathMatch[1].trim());
		}
	}
	// Pattern for [Image: source: /path/...] format from messaging systems
	const messageImagePattern =
		/\[Image:\s*source:\s*([^\]]+\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif))\]/gi;
	while ((match = messageImagePattern.exec(prompt)) !== null) {
		const raw = match[1]?.trim();
		if (raw) {
			addPathRef(raw);
		}
	}
	// Remote HTTP(S) URLs are intentionally ignored. Native image injection is local-only.
	// Pattern for file:// URLs - treat as paths since loadWebMedia handles them
	const fileUrlPattern = /file:\/\/[^\s<>"'`\]]+\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif)/gi;
	while ((match = fileUrlPattern.exec(prompt)) !== null) {
		const raw = match[0];
		// Dedup on the raw URL; note this will NOT collapse a file:// URL with
		// its equivalent bare path — the loader's resolved-path dedup handles that.
		if (seen.has(raw.toLowerCase())) {
			continue;
		}
		seen.add(raw.toLowerCase());
		// Use fileURLToPath for proper handling (e.g., file://localhost/path)
		try {
			const resolved = fileURLToPath(raw);
			refs.push({ raw, type: "path", resolved });
		} catch {
			// Skip malformed file:// URLs
		}
	}
	// Pattern for file paths (absolute, relative, or home)
	// Matches:
	// - /absolute/path/to/file.ext (including paths with special chars like Messages/Attachments)
	// - ./relative/path.ext
	// - ../parent/path.ext
	// - ~/home/path.ext
	const pathPattern =
		/(?:^|\s|["'`(])((\.\.?\/|[~/])[^\s"'`()[\]]*\.(?:png|jpe?g|gif|webp|bmp|tiff?|heic|heif))/gi;
	while ((match = pathPattern.exec(prompt)) !== null) {
		// Use capture group 1 (the path without delimiter prefix); skip if undefined
		if (match[1]) {
			addPathRef(match[1]);
		}
	}
	return refs;
}
| /** | |
| * Loads an image from a file path or URL and returns it as ImageContent. | |
| * | |
| * @param ref The detected image reference | |
| * @param workspaceDir The current workspace directory for resolving relative paths | |
| * @param options Optional settings for sandbox and size limits | |
| * @returns The loaded image content, or null if loading failed | |
| */ | |
| export async function loadImageFromRef( | |
| ref: DetectedImageRef, | |
| workspaceDir: string, | |
| options?: { | |
| maxBytes?: number; | |
| /** If set, enforce that file paths are within this sandbox root */ | |
| sandboxRoot?: string; | |
| }, | |
| ): Promise<ImageContent | null> { | |
| try { | |
| let targetPath = ref.resolved; | |
| // Remote URL loading is disabled (local-only). | |
| if (ref.type === "url") { | |
| log.debug(`Native image: rejecting remote URL (local-only): ${ref.resolved}`); | |
| return null; | |
| } | |
| // For file paths, resolve relative to the appropriate root: | |
| // - When sandbox is enabled, resolve relative to sandboxRoot for security | |
| // - Otherwise, resolve relative to workspaceDir | |
| // Note: ref.resolved may already be absolute (e.g., after ~ expansion in detectImageReferences), | |
| // in which case we skip relative resolution. | |
| if (ref.type === "path" && !path.isAbsolute(targetPath)) { | |
| const resolveRoot = options?.sandboxRoot ?? workspaceDir; | |
| targetPath = path.resolve(resolveRoot, targetPath); | |
| } | |
| // Enforce sandbox restrictions if sandboxRoot is set | |
| if (ref.type === "path" && options?.sandboxRoot) { | |
| try { | |
| const validated = await assertSandboxPath({ | |
| filePath: targetPath, | |
| cwd: options.sandboxRoot, | |
| root: options.sandboxRoot, | |
| }); | |
| targetPath = validated.resolved; | |
| } catch (err) { | |
| // Log the actual error for debugging (sandbox violation or other path error) | |
| log.debug( | |
| `Native image: sandbox validation failed for ${ref.resolved}: ${err instanceof Error ? err.message : String(err)}`, | |
| ); | |
| return null; | |
| } | |
| } | |
| // Check file exists for local paths | |
| if (ref.type === "path") { | |
| try { | |
| await fs.stat(targetPath); | |
| } catch { | |
| log.debug(`Native image: file not found: ${targetPath}`); | |
| return null; | |
| } | |
| } | |
| // loadWebMedia handles local file paths (including file:// URLs) | |
| const media = await loadWebMedia(targetPath, options?.maxBytes); | |
| if (media.kind !== "image") { | |
| log.debug(`Native image: not an image file: ${targetPath} (got ${media.kind})`); | |
| return null; | |
| } | |
| // EXIF orientation is already normalized by loadWebMedia -> resizeToJpeg | |
| // Default to JPEG since optimization converts images to JPEG format | |
| const mimeType = media.contentType ?? "image/jpeg"; | |
| const data = media.buffer.toString("base64"); | |
| return { type: "image", data, mimeType }; | |
| } catch (err) { | |
| // Log the actual error for debugging (size limits, network failures, etc.) | |
| log.debug( | |
| `Native image: failed to load ${ref.resolved}: ${err instanceof Error ? err.message : String(err)}`, | |
| ); | |
| return null; | |
| } | |
| } | |
| /** | |
| * Checks if a model supports image input based on its input capabilities. | |
| * | |
| * @param model The model object with input capability array | |
| * @returns True if the model supports image input | |
| */ | |
| export function modelSupportsImages(model: { input?: string[] }): boolean { | |
| return model.input?.includes("image") ?? false; | |
| } | |
| /** | |
| * Extracts image references from conversation history messages. | |
| * Scans user messages for image paths/URLs that can be loaded. | |
| * Each ref includes the messageIndex so images can be injected at their original location. | |
| * | |
| * Note: Global deduplication is intentional - if the same image appears in multiple | |
| * messages, we only inject it at the FIRST occurrence. This is sufficient because: | |
| * 1. The model sees all message content including the image | |
| * 2. Later references to "the image" or "that picture" will work since it's in context | |
| * 3. Injecting duplicates would waste tokens and potentially hit size limits | |
| */ | |
| function detectImagesFromHistory(messages: unknown[]): DetectedImageRef[] { | |
| const allRefs: DetectedImageRef[] = []; | |
| const seen = new Set<string>(); | |
| const messageHasImageContent = (msg: unknown): boolean => { | |
| if (!msg || typeof msg !== "object") { | |
| return false; | |
| } | |
| const content = (msg as { content?: unknown }).content; | |
| if (!Array.isArray(content)) { | |
| return false; | |
| } | |
| return content.some( | |
| (part) => | |
| part != null && typeof part === "object" && (part as { type?: string }).type === "image", | |
| ); | |
| }; | |
| for (let i = 0; i < messages.length; i++) { | |
| const msg = messages[i]; | |
| if (!msg || typeof msg !== "object") { | |
| continue; | |
| } | |
| const message = msg as { role?: string }; | |
| // Only scan user messages for image references | |
| if (message.role !== "user") { | |
| continue; | |
| } | |
| // Skip if message already has image content (prevents reloading each turn) | |
| if (messageHasImageContent(msg)) { | |
| continue; | |
| } | |
| const text = extractTextFromMessage(msg); | |
| if (!text) { | |
| continue; | |
| } | |
| const refs = detectImageReferences(text); | |
| for (const ref of refs) { | |
| const key = ref.resolved.toLowerCase(); | |
| if (seen.has(key)) { | |
| continue; | |
| } | |
| seen.add(key); | |
| allRefs.push({ ...ref, messageIndex: i }); | |
| } | |
| } | |
| return allRefs; | |
| } | |
/**
 * Detects and loads images referenced in a prompt for models with vision capability.
 *
 * This function scans the prompt for image references (file paths and URLs),
 * loads them, and returns them as ImageContent array ready to be passed to
 * the model's prompt method.
 *
 * Also scans conversation history for images from previous turns and returns
 * them mapped by message index so they can be injected at their original location.
 *
 * @param params Configuration for image detection and loading
 * @returns Object with loaded images for current prompt and history images by message index
 */
export async function detectAndLoadPromptImages(params: {
	/** Current user prompt text to scan for references */
	prompt: string;
	/** Workspace directory used when resolving relative paths */
	workspaceDir: string;
	/** Model descriptor; loading is skipped unless "image" is in `input` */
	model: { input?: string[] };
	/** Images already attached to the prompt (merged with detected ones) */
	existingImages?: ImageContent[];
	/** Prior conversation messages to scan for historical references */
	historyMessages?: unknown[];
	/** Per-image size limit forwarded to the loader */
	maxBytes?: number;
	/** If set, enforce that file paths are within this sandbox root */
	sandboxRoot?: string;
}): Promise<{
	/** Images for the current prompt (existingImages + detected in current prompt) */
	images: ImageContent[];
	/** Images from history messages, keyed by message index */
	historyImagesByIndex: Map<number, ImageContent[]>;
	/** All refs considered (prompt refs plus deduplicated history refs) */
	detectedRefs: DetectedImageRef[];
	/** Number of refs that loaded successfully */
	loadedCount: number;
	/** Number of refs that were rejected or failed to load */
	skippedCount: number;
}> {
	// If model doesn't support images, return empty results
	// NOTE(review): existingImages are dropped entirely on this branch —
	// presumably intentional for non-vision models, but worth confirming.
	if (!modelSupportsImages(params.model)) {
		return {
			images: [],
			historyImagesByIndex: new Map(),
			detectedRefs: [],
			loadedCount: 0,
			skippedCount: 0,
		};
	}
	// Detect images from current prompt
	const promptRefs = detectImageReferences(params.prompt);
	// Detect images from conversation history (with message indices)
	const historyRefs = params.historyMessages ? detectImagesFromHistory(params.historyMessages) : [];
	// Deduplicate: if an image is in the current prompt, don't also load it from history.
	// Current prompt images are passed via the `images` parameter to prompt(), while history
	// images are injected into their original message positions. We don't want the same
	// image loaded and sent twice (wasting tokens and potentially causing confusion).
	const seenPaths = new Set(promptRefs.map((r) => r.resolved.toLowerCase()));
	const uniqueHistoryRefs = historyRefs.filter((r) => !seenPaths.has(r.resolved.toLowerCase()));
	const allRefs = [...promptRefs, ...uniqueHistoryRefs];
	if (allRefs.length === 0) {
		// NOTE(review): existingImages bypass sanitization on this early-return path,
		// while the main path below runs them through sanitizeImagesWithLog —
		// confirm callers pre-sanitize, or that the asymmetry is intentional.
		return {
			images: params.existingImages ?? [],
			historyImagesByIndex: new Map(),
			detectedRefs: [],
			loadedCount: 0,
			skippedCount: 0,
		};
	}
	log.debug(
		`Native image: detected ${allRefs.length} image refs (${promptRefs.length} in prompt, ${uniqueHistoryRefs.length} in history)`,
	);
	// Load images for current prompt
	const promptImages: ImageContent[] = [...(params.existingImages ?? [])];
	// Load images for history, grouped by message index
	const historyImagesByIndex = new Map<number, ImageContent[]>();
	let loadedCount = 0;
	let skippedCount = 0;
	// Refs are loaded sequentially (one await per ref), not in parallel.
	for (const ref of allRefs) {
		const image = await loadImageFromRef(ref, params.workspaceDir, {
			maxBytes: params.maxBytes,
			sandboxRoot: params.sandboxRoot,
		});
		if (image) {
			if (ref.messageIndex !== undefined) {
				// History image - add to the appropriate message index
				const existing = historyImagesByIndex.get(ref.messageIndex);
				if (existing) {
					existing.push(image);
				} else {
					historyImagesByIndex.set(ref.messageIndex, [image]);
				}
			} else {
				// Current prompt image
				promptImages.push(image);
			}
			loadedCount++;
			log.debug(`Native image: loaded ${ref.type} ${ref.resolved}`);
		} else {
			skippedCount++;
		}
	}
	// Sanitize the prompt batch and each history batch; empty history batches
	// are omitted from the returned map.
	const sanitizedPromptImages = await sanitizeImagesWithLog(promptImages, "prompt:images");
	const sanitizedHistoryImagesByIndex = new Map<number, ImageContent[]>();
	for (const [index, images] of historyImagesByIndex) {
		const sanitized = await sanitizeImagesWithLog(images, `history:images:${index}`);
		if (sanitized.length > 0) {
			sanitizedHistoryImagesByIndex.set(index, sanitized);
		}
	}
	return {
		images: sanitizedPromptImages,
		historyImagesByIndex: sanitizedHistoryImagesByIndex,
		detectedRefs: allRefs,
		loadedCount,
		skippedCount,
	};
}