Spaces:

Ac66
/

W

Sleeping

File size: 8,600 Bytes

2b64d42

import https from 'node:https';
import http from 'node:http';
import { lookup as dnsLookup } from 'node:dns';
import { log } from './config.js';
import { tryExtractPdf } from './pdf.js';
import { isPrivateIp, resolvePublicAddresses } from './net-safety.js';

// Byte cap on a downloaded image body; also quoted in the data-URL size errors.
const MAX_SIZE = 5 * 1024 * 1024; // 5 MB
// Matching cap on base64 *text* length: base64 expands data by 4/3. The extra
// 100 characters look like slack for padding/rounding — TODO confirm intent.
const MAX_BASE64_LEN = Math.ceil(MAX_SIZE * 4 / 3) + 100;
// Redirect depth fetchImageUrl tolerates before rejecting.
const MAX_REDIRECTS = 3;
// Content-Type values fetchImageUrl accepts for a fetched image.
const MIME_OK = new Set(['image/png', 'image/jpeg', 'image/webp', 'image/gif']);
/**
 * Drop-in `lookup` hook for http/https requests: resolves the host with the
 * default DNS lookup, then vetoes the connection when any resolved address is
 * private. Because the check runs on the addresses actually returned by DNS,
 * a public-looking hostname that resolves to an internal address (DNS
 * rebinding) is refused before a socket is opened.
 *
 * @param {string} hostname - Host being resolved.
 * @param {object} options - Options forwarded unchanged to the DNS lookup.
 * @param {Function} callback - Node-style `(err, address, family)` callback.
 */
function safeLookup(hostname, options, callback) {
  const onResolved = (err, address, family) => {
    if (err) {
      callback(err);
      return;
    }
    // With `all: true` the lookup yields an array of { address, family };
    // otherwise a single address string. Normalise to the array shape.
    const candidates = Array.isArray(address) ? address : [{ address, family }];
    const offender = candidates.find((entry) => isPrivateIp(entry.address));
    if (offender) {
      callback(new Error(`Image URL resolves to private address: ${offender.address}`));
      return;
    }
    callback(null, address, family);
  };
  dnsLookup(hostname, options, onResolved);
}

/**
 * Parse an image URL and reject anything that is not a plain http(s) URL to a
 * non-internal host. This is a string-level check only; it does not resolve
 * DNS.
 *
 * @param {string} url - Candidate image URL.
 * @returns {URL} The parsed URL.
 * @throws {Error} When the URL is unparseable, uses another scheme, or names
 *   localhost / a private address.
 */
function validateImageUrl(url) {
  let parsed;
  try {
    parsed = new URL(url);
  } catch {
    throw new Error('Invalid image URL');
  }
  if (parsed.protocol !== 'https:' && parsed.protocol !== 'http:') {
    throw new Error('Image URL must be http or https');
  }

  // A fully-qualified name may carry a trailing dot ("localhost.").
  const host = String(parsed.hostname).toLowerCase().replace(/\.$/, '');
  // URL.hostname keeps the brackets on IPv6 literals ("[::1]"). IP-literal
  // hosts never go through a DNS lookup, so this string check is the only
  // guard for them: test the bare address as well as the bracketed form.
  const bare = host.startsWith('[') && host.endsWith(']') ? host.slice(1, -1) : host;
  const isLocalName = host === 'localhost' || host.endsWith('.localhost');
  const candidates = [...new Set([parsed.hostname, bare])];
  if (isLocalName || candidates.some((candidate) => isPrivateIp(candidate))) {
    throw new Error('Image URL targets a private/internal address');
  }
  return parsed;
}

/**
 * Split a base64 `data:image/...` URL into its mime type and payload.
 * All whitespace is stripped from the input before matching.
 *
 * @param {string} url - Candidate data URL.
 * @returns {{base64_data: string, mime_type: string} | null} The payload and
 *   lower-cased mime type, or null when the input is not a base64 image data URL.
 * @throws {Error} When the base64 payload is longer than MAX_BASE64_LEN.
 */
export function parseDataUrl(url) {
  const compact = url.replace(/\s/g, '');
  const match = /^data:(image\/[a-z+]+);base64,(.+)$/i.exec(compact);
  if (match === null) {
    return null;
  }
  const [, mimeType, payload] = match;
  if (payload.length > MAX_BASE64_LEN) {
    throw new Error(`Image data URL exceeds ${MAX_SIZE} byte limit`);
  }
  return { base64_data: payload, mime_type: mimeType.toLowerCase() };
}

/**
 * Split a base64 data URL of any mime type into mime type and payload.
 * Exists for payloads such as PDFs, which the image-only pattern in
 * parseDataUrl rejects. All whitespace is stripped before matching.
 *
 * @param {string} url - Candidate data URL.
 * @returns {{base64_data: string, mime_type: string} | null} The payload and
 *   lower-cased mime type, or null when the input is not a base64 data URL.
 * @throws {Error} When the base64 payload is longer than MAX_BASE64_LEN.
 */
export function parseGenericDataUrl(url) {
  const pattern = /^data:([a-z0-9][a-z0-9.+/-]+);base64,(.+)$/i;
  const hit = pattern.exec(url.replace(/\s/g, ''));
  if (hit === null) {
    return null;
  }
  const mimeType = hit[1];
  const payload = hit[2];
  if (payload.length > MAX_BASE64_LEN) {
    throw new Error(`Data URL exceeds ${MAX_SIZE} byte limit`);
  }
  return { base64_data: payload, mime_type: mimeType.toLowerCase() };
}

// Pull the hostname out of a URL, a bare "host:port", or a bare host.
function hostnameOf(urlOrHost) {
  let parsed;
  try {
    parsed = new URL(urlOrHost);
  } catch {
    // Not an absolute URL: treat the input as a bare host name.
    return urlOrHost;
  }
  if (parsed.hostname) return parsed.hostname;

  // "localhost:3000" parses as scheme "localhost:" with an EMPTY hostname.
  // Passing '' on would mean the real host is never checked, so re-read the
  // input as an http authority instead.
  try {
    const retried = new URL(`http://${urlOrHost}`);
    if (retried.hostname) return retried.hostname;
  } catch {
    // Still not host-like; fall through and hand the raw input to the resolver.
  }
  return urlOrHost;
}

/**
 * Resolve the host named by a URL (or bare host / host:port string) through
 * resolvePublicAddresses.
 *
 * @param {string} urlOrHost - Absolute URL, "host:port", or bare host name.
 * @param {Function} [lookupFn] - DNS lookup implementation passed through to
 *   resolvePublicAddresses; defaults to node:dns `lookup`.
 * @returns {Promise<*>} Whatever resolvePublicAddresses yields for that host
 *   (presumably the vetted public addresses — confirm in net-safety.js).
 */
export async function assertPublicUrlHost(urlOrHost, lookupFn = dnsLookup) {
  return resolvePublicAddresses(hostnameOf(urlOrHost), lookupFn);
}

/**
 * Download an image over http(s) and return it base64-encoded.
 *
 * The URL is vetted with validateImageUrl and the connection uses safeLookup
 * as its DNS hook. Redirects are followed up to MAX_REDIRECTS deep, each hop
 * being validated again. Only 200 responses whose Content-Type is in MIME_OK
 * and whose body stays within MAX_SIZE bytes are accepted.
 *
 * Every failure — including an invalid or internal URL — is reported as a
 * rejected promise; the function never throws synchronously.
 *
 * @param {string} url - Absolute http(s) image URL.
 * @param {number} [timeoutMs=8000] - Socket timeout passed to the request.
 * @param {number} [_depth=0] - Internal redirect counter; callers omit it.
 * @returns {Promise<{base64_data: string, mime_type: string}>}
 */
export function fetchImageUrl(url, timeoutMs = 8000, _depth = 0) {
  if (_depth > MAX_REDIRECTS) return Promise.reject(new Error('Too many image redirects'));

  // Validation errors must become rejections: on a redirect this function is
  // re-entered from inside the response callback, where a synchronous throw
  // would surface as an uncaught exception instead of reaching the caller.
  let target;
  try {
    target = validateImageUrl(url);
  } catch (err) {
    return Promise.reject(err);
  }

  return new Promise((resolve, reject) => {
    let settled = false;
    // Settle at most once; later events (end after destroy, late errors) are ignored.
    const done = (fn, val) => { if (!settled) { settled = true; fn(val); } };

    // Choose the transport from the parsed protocol so "HTTPS://..." works too.
    const mod = target.protocol === 'https:' ? https : http;
    const options = { timeout: timeoutMs, headers: { Accept: 'image/*' }, lookup: safeLookup };

    const req = mod.get(target, options, (res) => {
      const { statusCode, headers } = res;

      if (statusCode >= 300 && statusCode < 400 && headers.location) {
        res.resume();
        let next;
        try {
          // Location may be relative; resolve it against the current URL.
          next = new URL(headers.location, target).href;
        } catch {
          return done(reject, new Error('Invalid image URL'));
        }
        return fetchImageUrl(next, timeoutMs, _depth + 1).then(
          (v) => done(resolve, v),
          (e) => done(reject, e),
        );
      }
      if (statusCode !== 200) {
        res.resume();
        return done(reject, new Error(`Image fetch HTTP ${statusCode}`));
      }

      const mime = (headers['content-type'] || '').split(';')[0].trim().toLowerCase();
      if (!MIME_OK.has(mime)) {
        res.resume();
        return done(reject, new Error(`Unsupported image type: ${mime}`));
      }

      const chunks = [];
      let size = 0;
      res.on('data', (d) => {
        if (settled) return;
        size += d.length;
        if (size > MAX_SIZE) {
          // Stop reading as soon as the cap is crossed.
          res.destroy();
          done(reject, new Error(`Image exceeds ${MAX_SIZE} bytes`));
        } else {
          chunks.push(d);
        }
      });
      res.on('end', () => done(resolve, {
        base64_data: Buffer.concat(chunks).toString('base64'),
        mime_type: mime,
      }));
      res.on('error', (e) => done(reject, e));
    });

    req.on('error', (e) => done(reject, e));
    req.on('timeout', () => { req.destroy(); done(reject, new Error('Image fetch timeout')); });
  });
}

// Turn a base64 PDF into the text snippet appended to the prompt.
// `label` is spliced into the header, `logSuffix` (when not null) triggers an
// info log, and `placeholder` controls whether a text-less PDF leaves a note.
function renderPdfText(base64Data, { label = '', logSuffix = null, placeholder = true } = {}) {
  const pdf = tryExtractPdf(base64Data);
  if (pdf?.text) {
    if (logSuffix !== null) {
      log.info(`PDF extracted${logSuffix}: ${pdf.pageCount} pages, ${pdf.text.length} chars`);
    }
    return `\n[PDF Document${label} — ${pdf.pageCount} page(s)]\n${pdf.text}\n`;
  }
  return placeholder ? '\n[PDF Document — no extractable text (scanned/image-only PDF)]\n' : '';
}

// Handle a `file` / `input_file` block; returns the text to append ('' if none).
function extractFileBlockText(block) {
  // Chat-completions style nests the fields under `file`; `input_file` blocks
  // carry filename / file_data / file_id directly on the block.
  const file = block.file || block;
  const rawUrl = file.file_data || file.url || '';
  // Anything that is not a string cannot be a data URL; ignore it instead of crashing.
  const dataUrl = typeof rawUrl === 'string' ? rawUrl : '';

  if (dataUrl.slice(0, 40).toLowerCase().startsWith('data:application/pdf')) {
    let generic;
    try {
      generic = parseGenericDataUrl(dataUrl);
    } catch (e) {
      // Oversized payload: skip this block rather than failing the whole request.
      log.warn(`File block skipped: ${e.message}`);
      return '';
    }
    if (!generic?.base64_data) return '';
    const label = file.filename ? ` "${file.filename}"` : '';
    return renderPdfText(generic.base64_data, { label, logSuffix: ' (OpenAI file block)' });
  }

  if (dataUrl && !file.file_id) {
    log.warn(`Unsupported file block data URL: ${dataUrl.slice(0, 40)}...`);
  } else if (file.file_id) {
    // Uploaded-file references cannot be fetched from here.
    log.warn(`File block references file_id=${file.file_id} — upload API not supported, skipping`);
  }
  return '';
}

/**
 * Flatten a list of message content blocks into plain text plus a list of
 * base64 images.
 *
 * Handled block shapes: raw strings, `text`, `document` (PDF), `image`
 * (base64 or URL source, or a PDF mislabelled as an image), `image_url`
 * (data URL, PDF data URL, or http(s) URL) and `file` / `input_file`
 * (PDF data URL). PDFs are converted to text via tryExtractPdf. Image
 * failures are logged and skipped; they never reject the call.
 *
 * @param {Array|*} contentBlocks - Content blocks; any non-array value is
 *   stringified and returned as the text.
 * @returns {Promise<{text: string, images: Array<{base64_data: string, mime_type: string}>}>}
 */
export async function extractImages(contentBlocks) {
  if (!Array.isArray(contentBlocks)) return { text: String(contentBlocks ?? ''), images: [] };

  let text = '';
  const images = [];

  for (const block of contentBlocks) {
    if (!block || typeof block === 'string') { text += block || ''; continue; }

    if (block.type === 'text') {
      text += block.text || '';
    } else if (block.type === 'document') {
      const src = block.source || {};
      const mime = (src.media_type || '').toLowerCase();
      if (mime === 'application/pdf' && src.data) {
        text += renderPdfText(src.data, { logSuffix: '' });
      }
    } else if (block.type === 'image') {
      const src = block.source || {};
      const mime = (src.media_type || '').toLowerCase();
      if (mime === 'application/pdf' && src.data) {
        // PDF sent in an image block: keep its text, emit nothing if it has none.
        text += renderPdfText(src.data, { placeholder: false });
        continue;
      }
      try {
        if ((src.type === 'base64' || !src.type) && src.data) {
          if (src.data.length > MAX_BASE64_LEN) { log.warn('Image base64 exceeds size limit, skipping'); continue; }
          images.push({ base64_data: src.data, mime_type: src.media_type || 'image/png' });
        } else if (src.type === 'url' && src.url) {
          images.push(await fetchImageUrl(src.url));
        }
      } catch (e) { log.warn(`Image extraction failed: ${e.message}`); }
    } else if (block.type === 'image_url') {
      const url = block.image_url?.url || '';
      try {
        if (url.startsWith('data:')) {
          // PDF-as-data-URL: let the model "see" it via text extraction
          // rather than treating it as an unsupported image type.
          const lower = url.slice(0, 40).toLowerCase();
          if (lower.startsWith('data:application/pdf')) {
            const generic = parseGenericDataUrl(url);
            if (generic?.base64_data) {
              text += renderPdfText(generic.base64_data, { logSuffix: ' (image_url data URL)' });
            }
            continue;
          }
          const parsed = parseDataUrl(url);
          if (parsed) images.push(parsed);
        } else if (url.startsWith('https://') || url.startsWith('http://')) {
          images.push(await fetchImageUrl(url));
        }
      } catch (e) { log.warn(`Image fetch failed: ${e.message}`); }
    } else if (block.type === 'file' || block.type === 'input_file') {
      text += extractFileBlockText(block);
    }
  }

  return { text, images };
}