Spaces:

comfyuiman
/

loracaptionertaz

Running

File size: 13,980 Bytes

5191cb5


/**
 * Client-side service for the OpenRouter chat-completions API: generates,
 * refines, and scores captions for image and video files.
 */

/**
 * Reads a file into a data URL string using `FileReader.readAsDataURL`.
 *
 * @param file - File to encode.
 * @returns The reader's result, a data URL containing the file contents.
 * @throws Error when the read fails or the reader produces a non-string result.
 */
const fileToBase64 = (file: File): Promise<string> => {
  return new Promise((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () => {
      if (typeof reader.result === 'string') {
        resolve(reader.result);
      } else {
        reject(new Error('Failed to convert file to base64'));
      }
    };
    // The error event object is not an Error and carries no message; reject
    // with the reader's own error so callers can rely on `.message`.
    reader.onerror = () => reject(reader.error ?? new Error('Failed to read file'));
    // Start the read only after both handlers are attached.
    reader.readAsDataURL(file);
  });
};

/**
 * Captures evenly spaced still frames from a video file as JPEG data URLs.
 *
 * The file is loaded into a `<video>` element that is never attached to the
 * document. Its duration is split into `numberOfFrames` equal segments and the
 * midpoint of each segment is drawn onto a canvas sized to the video.
 *
 * @param videoFile - Video to sample.
 * @param numberOfFrames - Number of frames to capture; zero or less yields an empty array.
 * @param signal - Optional abort signal. Aborting rejects with `Error("AbortError")`.
 * @returns JPEG data URLs (quality 0.8) in playback order.
 * @throws Error "AbortError" when aborted, "Video processing timed out" after
 *   60 seconds, "Failed to load video file" when the element reports an error,
 *   "Could not create canvas context" when no 2D context is available, or any
 *   error raised while seeking or drawing.
 */
const extractFramesFromVideo = async (videoFile: File, numberOfFrames: number, signal?: AbortSignal): Promise<string[]> => {
  return new Promise((resolve, reject) => {
    // An already-aborted signal never fires another 'abort' event, so bail out
    // before allocating the object URL, the video element, or the timer.
    if (signal?.aborted) {
      reject(new Error("AbortError"));
      return;
    }

    const video = document.createElement('video');
    video.preload = 'metadata';
    video.muted = true;
    video.playsInline = true;
    const url = URL.createObjectURL(videoFile);
    const frames: string[] = [];
    let settled = false;

    // Single teardown used by every exit path so nothing is leaked or skipped.
    const cleanup = () => {
      settled = true;
      clearTimeout(timeout);
      if (signal) signal.removeEventListener('abort', onAbort);
      video.onloadeddata = null;
      video.onerror = null;
      URL.revokeObjectURL(url);
      video.src = "";
    };
    const fail = (error: unknown) => {
      if (settled) return;
      cleanup();
      reject(error);
    };
    const succeed = () => {
      if (settled) return;
      cleanup();
      resolve(frames);
    };
    const onAbort = () => fail(new Error("AbortError"));

    const timeout = setTimeout(() => fail(new Error("Video processing timed out")), 60000);
    if (signal) signal.addEventListener('abort', onAbort);

    video.onloadeddata = async () => {
      const duration = video.duration;
      const canvas = document.createElement('canvas');
      const ctx = canvas.getContext('2d');
      if (!ctx) {
        fail(new Error("Could not create canvas context"));
        return;
      }
      canvas.width = video.videoWidth;
      canvas.height = video.videoHeight;
      const step = duration / numberOfFrames;
      try {
        for (let i = 0; i < numberOfFrames; i++) {
          // Stop working once the promise was settled by abort or timeout.
          if (settled) return;
          // Sample the midpoint of each segment, kept just short of the end.
          const time = (step * i) + (step / 2);
          await new Promise<void>((frameResolve) => {
            const onSeeked = () => {
              video.removeEventListener('seeked', onSeeked);
              frameResolve();
            };
            video.addEventListener('seeked', onSeeked);
            video.currentTime = Math.min(time, duration - 0.1);
          });
          if (settled) return;
          ctx.drawImage(video, 0, 0);
          frames.push(canvas.toDataURL('image/jpeg', 0.8));
        }
        succeed();
      } catch (e) {
        fail(e);
      }
    };
    video.onerror = () => fail(new Error("Failed to load video file"));
    video.src = url;
  });
};

/**
 * Assembles the captioning instructions sent alongside the media.
 *
 * @param triggerWord - Word the caption is told to start with.
 * @param customInstructions - Extra free-form instructions appended after the rules when non-empty.
 * @param isCharacterTaggingEnabled - Adds rule 6 (character tags) when true and a show name is given.
 * @param characterShowName - Show/series name used in rule 6; ignored when blank.
 * @returns The complete prompt text.
 */
const constructPrompt = (
    triggerWord: string, 
    customInstructions?: string,
    isCharacterTaggingEnabled?: boolean,
    characterShowName?: string
): string => {
  const showName = characterShowName ?? '';
  const wantsCharacterTags = !!isCharacterTaggingEnabled && showName.trim() !== '';

  // Header plus one entry per numbered rule; joined with newlines at the end.
  const lines: string[] = [
    'You are an expert captioner for AI model training data. Your task is to describe the provided image/video in detail for a style LoRA. Follow these rules strictly:',
    `1. Start the caption with the trigger word: "${triggerWord}".`,
    '2. Describe EVERYTHING visible: characters, clothing, actions, background, objects, lighting, and camera angle.',
    '3. Be objective and factual.',
    '4. DO NOT mention art styles or generic animation terms like "anime" or "cartoon".',
    '5. Write as a single, continuous paragraph.',
  ];

  if (wantsCharacterTags) {
    lines.push(
      `6. Identify characters from the show/series "${showName}" and append tags at the end of the caption, separated by commas. The format for each tag must be "char_[charactername]" (e.g., ", char_simon, char_kamina"). If no characters are recognized, do not add tags.`
    );
  }

  const rules = lines.join('\n');
  return customInstructions
    ? `${rules}\n\nAdditional instructions: ${customInstructions}`
    : rules;
};

/** OpenRouter chat-completions endpoint used by every request below. */
const OPENROUTER_CHAT_ENDPOINT = 'https://openrouter.ai/api/v1/chat/completions';

/** Model sent when the caller-supplied model string normalizes to an empty ID. */
const OPENROUTER_FALLBACK_MODEL = 'openai/gpt-4o-mini';

/** One entry of the multimodal `content` array sent in the user message. */
type OpenRouterContentPart =
  | { type: 'text'; text: string }
  | { type: 'image_url'; image_url: { url: string } };

/** The fields of a response message that this module reads. */
interface OpenRouterMessage {
  content?: string | Array<{ type?: string; text?: string } | null> | null;
  refusal?: string | null;
  reasoning?: string | null;
}

/** The fields of a chat-completions response that this module reads. */
interface OpenRouterChatResponse {
  choices?: Array<{ message?: OpenRouterMessage; finish_reason?: string | null }>;
}

/**
 * Reduces a model string to a bare model ID. Accepts either an ID or a URL
 * containing `openrouter.ai/`; strips a leading `models/`, any query string,
 * and trailing slashes.
 */
const normalizeOpenRouterModelId = (model: string): string => {
  const trimmed = model.trim();
  let modelId = trimmed.includes('openrouter.ai/') ? trimmed.split('openrouter.ai/').pop() || '' : trimmed;
  if (modelId.startsWith('models/')) {
    modelId = modelId.replace('models/', '');
  }
  return (modelId.split('?')[0] ?? '').replace(/\/+$/, '');
};

/**
 * Builds the user message content: the prompt text followed by the media.
 * Videos are sent as sampled frames unless `useFullVideo` is set; everything
 * else is sent as a single data URL.
 */
const buildOpenRouterContent = async (
  prompt: string,
  file: File,
  videoFrameCount: number,
  useFullVideo: boolean,
  signal?: AbortSignal
): Promise<OpenRouterContentPart[]> => {
  const sampleFrames = file.type.startsWith('video/') && !useFullVideo;
  // NOTE(review): the full-video path sends the video data URL as an
  // `image_url` part, exactly as before — confirm against OpenRouter's
  // video-input documentation whether a dedicated part type is expected.
  const mediaUrls = sampleFrames
    ? await extractFramesFromVideo(file, videoFrameCount, signal)
    : [await fileToBase64(file)];
  return [
    { type: 'text', text: prompt },
    ...mediaUrls.map((url): OpenRouterContentPart => ({ type: 'image_url', image_url: { url } })),
  ];
};

/**
 * Produces the most specific error text available for a failed response.
 * The body is read exactly once as text and then parsed, because a Response
 * body cannot be read a second time after a failed `json()` call.
 */
const describeOpenRouterError = async (response: Response): Promise<string> => {
  const bodyText = await response.text().catch(() => '');
  if (!bodyText) return response.statusText;
  try {
    const errData = JSON.parse(bodyText) as { error?: { message?: string }; message?: string } | null;
    return errData?.error?.message || errData?.message || JSON.stringify(errData) || response.statusText;
  } catch {
    // Not JSON (e.g. an HTML or plain-text gateway error): show it verbatim.
    return bodyText;
  }
};

/**
 * Sends one user message to OpenRouter and returns the parsed response.
 * Throws `OpenRouter API Error (<status>): <detail>` on a non-2xx status.
 */
const requestOpenRouterCompletion = async (
  apiKey: string,
  model: string,
  contentParts: OpenRouterContentPart[],
  maxTokens: number,
  temperature: number,
  signal?: AbortSignal
): Promise<OpenRouterChatResponse> => {
  const payload = {
    model: normalizeOpenRouterModelId(model) || OPENROUTER_FALLBACK_MODEL,
    messages: [{ role: "user", content: contentParts }],
    max_tokens: maxTokens,
    temperature: temperature
  };

  const response = await fetch(OPENROUTER_CHAT_ENDPOINT, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "Authorization": `Bearer ${apiKey}`,
      "HTTP-Referer": window.location.origin,
      "X-Title": "LoRA Caption Assistant"
    },
    body: JSON.stringify(payload),
    signal
  });

  if (!response.ok) {
    const errorMessage = await describeOpenRouterError(response);
    throw new Error(`OpenRouter API Error (${response.status}): ${errorMessage}`);
  }
  return (await response.json()) as OpenRouterChatResponse;
};

/**
 * Returns the trimmed text of a response message. Handles both a plain string
 * and an array of parts (text parts are joined with newlines); anything else
 * yields an empty string.
 */
const extractOpenRouterText = (message: OpenRouterMessage | undefined): string => {
  const content = message?.content;
  if (typeof content === 'string') return content.trim();
  if (Array.isArray(content)) {
    return content
      .filter((part) => part?.type === 'text')
      .map((part) => part?.text)
      .join('\n')
      .trim();
  }
  return "";
};

/**
 * Generates a training caption for an image or video via OpenRouter.
 *
 * @param apiKey - OpenRouter API key; required.
 * @param model - Model ID or openrouter.ai model URL; falls back to `openai/gpt-4o-mini` when empty.
 * @param file - Image or video to caption.
 * @param triggerWord - Word the caption is instructed to start with.
 * @param customInstructions - Optional extra instructions appended to the prompt.
 * @param isCharacterTaggingEnabled - Enables the character-tag rule in the prompt.
 * @param characterShowName - Show/series name used by the character-tag rule.
 * @param videoFrameCount - Frames sampled from a video when `useFullVideo` is false.
 * @param maxTokens - `max_tokens` sent with the request.
 * @param temperature - Sampling temperature sent with the request.
 * @param useFullVideo - Send the whole video as one data URL instead of sampled frames.
 * @param signal - Optional abort signal forwarded to frame extraction and `fetch`.
 * @returns The caption text, `[Reasoning Only]: …` when only reasoning came back, or an empty string.
 * @throws Error when the key is missing, the API responds with an error status,
 *   or an empty reply is explained by a refusal, the token limit, or a content filter.
 */
export const generateCaptionOpenRouter = async (
  apiKey: string,
  model: string,
  file: File,
  triggerWord: string,
  customInstructions?: string,
  isCharacterTaggingEnabled?: boolean,
  characterShowName?: string,
  videoFrameCount: number = 8,
  maxTokens: number = 4096,
  temperature: number = 0.7,
  useFullVideo: boolean = false,
  signal?: AbortSignal
): Promise<string> => {
  if (!apiKey) throw new Error("OpenRouter API Key is required.");
  const prompt = constructPrompt(triggerWord, customInstructions, isCharacterTaggingEnabled, characterShowName);
  const contentParts = await buildOpenRouterContent(prompt, file, videoFrameCount, useFullVideo, signal);
  const data = await requestOpenRouterCompletion(apiKey, model, contentParts, maxTokens, temperature, signal);
  console.log('OpenRouter Generate Response:', data);

  const choice = data.choices?.[0];
  const message = choice?.message;
  const content = extractOpenRouterText(message);
  const refusal = message?.refusal;
  const reasoning = message?.reasoning;
  const finishReason = choice?.finish_reason;

  if (!content && refusal) {
    throw new Error(`OpenRouter Refusal: ${refusal}`);
  }

  if (!content && finishReason === 'length') {
    if (reasoning) {
      // Only reasoning came back and the limit was hit: the model likely spent
      // every token "thinking" and never reached the answer.
      throw new Error("OpenRouter model hit token limit during reasoning. Try increasing max tokens or using a non-reasoning model.");
    }
    throw new Error("OpenRouter response was cut off (hit token limit).");
  }

  if (!content && finishReason === 'content_filter') {
    throw new Error("OpenRouter response was blocked by content filter.");
  }

  // Some models put the result in reasoning when content is empty.
  return content || (reasoning ? `[Reasoning Only]: ${reasoning}` : "");
};

/**
 * Asks the model to rewrite an existing caption according to instructions,
 * using the media as visual context.
 *
 * @param apiKey - OpenRouter API key; required.
 * @param model - Model ID or openrouter.ai model URL; falls back to `openai/gpt-4o-mini` when empty.
 * @param file - Image or video the caption describes.
 * @param currentCaption - Caption to refine.
 * @param refinementInstructions - What to change.
 * @param videoFrameCount - Frames sampled from a video when `useFullVideo` is false.
 * @param maxTokens - `max_tokens` sent with the request.
 * @param temperature - Sampling temperature sent with the request.
 * @param useFullVideo - Send the whole video as one data URL instead of sampled frames.
 * @param signal - Optional abort signal forwarded to frame extraction and `fetch`.
 * @returns The refined caption, or an empty string when the model returned no text.
 * @throws Error when the key is missing, the API responds with an error status,
 *   or the model refused without producing text.
 */
export const refineCaptionOpenRouter = async (
  apiKey: string,
  model: string,
  file: File,
  currentCaption: string,
  refinementInstructions: string,
  videoFrameCount: number = 4,
  maxTokens: number = 4096,
  temperature: number = 0.7,
  useFullVideo: boolean = false,
  signal?: AbortSignal
): Promise<string> => {
  if (!apiKey) throw new Error("OpenRouter API Key is required.");
  const prompt = `Refine the following caption based on the visual information and the instructions. Output ONLY the refined text.
CURRENT CAPTION: "${currentCaption}"
INSTRUCTIONS: "${refinementInstructions}"`;

  const contentParts = await buildOpenRouterContent(prompt, file, videoFrameCount, useFullVideo, signal);
  const data = await requestOpenRouterCompletion(apiKey, model, contentParts, maxTokens, temperature, signal);
  console.log('OpenRouter Refine Response:', data);

  const message = data.choices?.[0]?.message;
  const content = extractOpenRouterText(message);
  const refusal = message?.refusal;
  if (!content && refusal) throw new Error(`OpenRouter Refusal: ${refusal}`);
  return content || "";
};

/**
 * Asks the model to rate a caption against the media and parses the score.
 *
 * @param apiKey - OpenRouter API key; required.
 * @param model - Model ID or openrouter.ai model URL; falls back to `openai/gpt-4o-mini` when empty.
 * @param file - Image or video the caption describes.
 * @param caption - Caption to evaluate.
 * @param videoFrameCount - Frames sampled from a video when `useFullVideo` is false.
 * @param temperature - Sampling temperature sent with the request.
 * @param useFullVideo - Send the whole video as one data URL instead of sampled frames.
 * @param signal - Optional abort signal forwarded to frame extraction and `fetch`.
 * @returns The first integer found in the reply, or 0 when the reply contains none.
 * @throws Error when the key is missing, the API responds with an error status,
 *   or the model refused without producing text.
 */
export const checkQualityOpenRouter = async (
  apiKey: string,
  model: string,
  file: File,
  caption: string,
  videoFrameCount: number = 4,
  temperature: number = 0.7,
  useFullVideo: boolean = false,
  signal?: AbortSignal
): Promise<number> => {
  if (!apiKey) throw new Error("OpenRouter API Key is required.");
  const prompt = `Evaluate the caption quality. Respond with ONLY an integer from 1 to 5.\nCaption: "${caption}"`;

  const contentParts = await buildOpenRouterContent(prompt, file, videoFrameCount, useFullVideo, signal);
  // The reply is expected to be a single digit, so the request caps output at 10 tokens.
  const data = await requestOpenRouterCompletion(apiKey, model, contentParts, 10, temperature, signal);
  console.log('OpenRouter Quality Response:', data);

  const message = data.choices?.[0]?.message;
  const text = extractOpenRouterText(message);
  const refusal = message?.refusal;
  if (!text && refusal) throw new Error(`OpenRouter Refusal: ${refusal}`);
  return parseInt(text.match(/\d+/)?.[0] || '0', 10);
};