loracaptionertaz_v2 / services /grokService.ts
comfyuiman's picture
Upload 18 files
7946263 verified
/**
* Service for interacting with xAI Grok via OpenAI-compatible vision endpoints.
*/
const fileToBase64 = (file: File): Promise<string> => {
return new Promise((resolve, reject) => {
const reader = new FileReader();
reader.readAsDataURL(file);
reader.onload = () => {
if (typeof reader.result === 'string') {
resolve(reader.result);
} else {
reject(new Error('Failed to convert file to base64'));
}
};
reader.onerror = error => reject(error);
});
};
const extractFramesFromVideo = async (videoFile: File, numberOfFrames: number): Promise<string[]> => {
return new Promise((resolve, reject) => {
const video = document.createElement('video');
video.preload = 'metadata';
video.muted = true;
video.playsInline = true;
const url = URL.createObjectURL(videoFile);
const frames: string[] = [];
const timeout = setTimeout(() => {
URL.revokeObjectURL(url);
video.src = "";
reject(new Error("Video processing timed out"));
}, 60000);
video.onloadeddata = async () => {
const duration = video.duration;
const canvas = document.createElement('canvas');
const ctx = canvas.getContext('2d');
if (!ctx) {
clearTimeout(timeout);
URL.revokeObjectURL(url);
reject(new Error("Could not create canvas context"));
return;
}
canvas.width = video.videoWidth;
canvas.height = video.videoHeight;
const step = duration / numberOfFrames;
try {
for (let i = 0; i < numberOfFrames; i++) {
const time = (step * i) + (step / 2);
await new Promise<void>((frameResolve) => {
const onSeeked = () => {
video.removeEventListener('seeked', onSeeked);
frameResolve();
};
video.addEventListener('seeked', onSeeked);
video.currentTime = Math.min(time, duration - 0.1);
});
ctx.drawImage(video, 0, 0);
frames.push(canvas.toDataURL('image/jpeg', 0.8));
}
clearTimeout(timeout);
URL.revokeObjectURL(url);
video.src = "";
resolve(frames);
} catch (e) {
clearTimeout(timeout);
URL.revokeObjectURL(url);
reject(e);
}
};
video.onerror = () => {
clearTimeout(timeout);
URL.revokeObjectURL(url);
reject(new Error("Failed to load video file"));
};
video.src = url;
});
};
const constructPrompt = (
triggerWord: string,
customInstructions?: string,
isCharacterTaggingEnabled?: boolean,
characterShowName?: string
): string => {
let basePrompt = `You are an expert captioner for AI model training data. Your task is to describe the provided image/video in detail for a style LoRA. Follow these rules strictly:
1. Start the caption with the trigger word: "${triggerWord}".
2. Describe EVERYTHING visible: characters, clothing, actions, background, objects, lighting, and camera angle.
3. Be objective and factual.
4. DO NOT mention the art style, "anime", "cartoon", "illustration", "2d", or "animation".
5. Write the description as a single, continuous paragraph.`;
if (isCharacterTaggingEnabled && characterShowName && characterShowName.trim() !== '') {
basePrompt += `\n6. After the description, identify any characters from the show "${characterShowName}" and append their tags to the very end of the caption, separated by commas. The format for each tag must be "char_[charactername]" (e.g., ", char_simon, char_kamina"). If no characters are recognized, add no tags.`;
}
if (customInstructions) {
return `${basePrompt}\n\nIMPORTANT USER INSTRUCTIONS:\n${customInstructions}`;
}
return basePrompt;
};
export const generateCaptionGrok = async (
apiKey: string,
model: string,
file: File,
triggerWord: string,
customInstructions?: string,
isCharacterTaggingEnabled?: boolean,
characterShowName?: string,
videoFrameCount: number = 8,
signal?: AbortSignal
): Promise<string> => {
if (!apiKey) throw new Error("xAI API Key is required for Grok.");
const endpoint = 'https://api.x.ai/v1/chat/completions';
const prompt = constructPrompt(triggerWord, customInstructions, isCharacterTaggingEnabled, characterShowName);
let contentParts: any[] = [{ type: "text", text: prompt }];
if (file.type.startsWith('video/')) {
const frames = await extractFramesFromVideo(file, videoFrameCount);
frames.forEach(frame => contentParts.push({ type: "image_url", image_url: { url: frame } }));
} else {
const base64Image = await fileToBase64(file);
contentParts.push({ type: "image_url", image_url: { url: base64Image } });
}
const payload = {
model: model || 'grok-2-vision-1212',
messages: [{ role: "user", content: contentParts }],
max_tokens: 1000,
temperature: 0.2
};
const response = await fetch(endpoint, {
method: "POST",
headers: {
"Content-Type": "application/json",
"Authorization": `Bearer ${apiKey}`
},
body: JSON.stringify(payload),
signal
});
if (!response.ok) {
const errData = await response.json().catch(() => ({}));
throw new Error(`Grok API Error (${response.status}): ${errData.error?.message || response.statusText}`);
}
const data = await response.json();
return data.choices?.[0]?.message?.content?.trim() || "";
};
export const refineCaptionGrok = async (
apiKey: string,
model: string,
file: File,
currentCaption: string,
refinementInstructions: string,
videoFrameCount: number = 4,
signal?: AbortSignal
): Promise<string> => {
if (!apiKey) throw new Error("xAI API Key is required for Grok.");
const endpoint = 'https://api.x.ai/v1/chat/completions';
const prompt = `Refine the following caption based on the visual information and the instructions. Output ONLY the refined text.
CURRENT CAPTION: "${currentCaption}"
INSTRUCTIONS: "${refinementInstructions}"`;
let contentParts: any[] = [{ type: "text", text: prompt }];
if (file.type.startsWith('video/')) {
const frames = await extractFramesFromVideo(file, videoFrameCount);
frames.forEach(frame => contentParts.push({ type: "image_url", image_url: { url: frame } }));
} else {
const base64Image = await fileToBase64(file);
contentParts.push({ type: "image_url", image_url: { url: base64Image } });
}
const payload = {
model: model || 'grok-2-vision-1212',
messages: [{ role: "user", content: contentParts }],
max_tokens: 1000,
temperature: 0.2
};
const response = await fetch(endpoint, {
method: "POST",
headers: {
"Content-Type": "application/json",
"Authorization": `Bearer ${apiKey}`
},
body: JSON.stringify(payload),
signal
});
if (!response.ok) throw new Error(`Grok API Error: ${response.status}`);
const data = await response.json();
return data.choices?.[0]?.message?.content?.trim() || "";
};
export const checkQualityGrok = async (
apiKey: string,
model: string,
file: File,
caption: string,
videoFrameCount: number = 4,
signal?: AbortSignal
): Promise<number> => {
if (!apiKey) throw new Error("xAI API Key is required for Grok.");
const endpoint = 'https://api.x.ai/v1/chat/completions';
const prompt = `Evaluate the caption quality. Respond with ONLY an integer from 1 to 5.\nCaption: "${caption}"`;
let contentParts: any[] = [{ type: "text", text: prompt }];
if (file.type.startsWith('video/')) {
const frames = await extractFramesFromVideo(file, videoFrameCount);
frames.forEach(frame => contentParts.push({ type: "image_url", image_url: { url: frame } }));
} else {
const base64Image = await fileToBase64(file);
contentParts.push({ type: "image_url", image_url: { url: base64Image } });
}
const payload = {
model: model || 'grok-2-vision-1212',
messages: [{ role: "user", content: contentParts }],
max_tokens: 10,
temperature: 0.1
};
const response = await fetch(endpoint, {
method: "POST",
headers: {
"Content-Type": "application/json",
"Authorization": `Bearer ${apiKey}`
},
body: JSON.stringify(payload),
signal
});
if (!response.ok) throw new Error(`Grok API Error: ${response.status}`);
const data = await response.json();
const text = data.choices?.[0]?.message?.content?.trim();
return parseInt(text?.match(/\d+/)?.[0] || '0', 10);
};