/** * Service for interacting with xAI Grok via OpenAI-compatible vision endpoints. */ const fileToBase64 = (file: File): Promise => { return new Promise((resolve, reject) => { const reader = new FileReader(); reader.readAsDataURL(file); reader.onload = () => { if (typeof reader.result === 'string') { resolve(reader.result); } else { reject(new Error('Failed to convert file to base64')); } }; reader.onerror = error => reject(error); }); }; const extractFramesFromVideo = async (videoFile: File, numberOfFrames: number): Promise => { return new Promise((resolve, reject) => { const video = document.createElement('video'); video.preload = 'metadata'; video.muted = true; video.playsInline = true; const url = URL.createObjectURL(videoFile); const frames: string[] = []; const timeout = setTimeout(() => { URL.revokeObjectURL(url); video.src = ""; reject(new Error("Video processing timed out")); }, 60000); video.onloadeddata = async () => { const duration = video.duration; const canvas = document.createElement('canvas'); const ctx = canvas.getContext('2d'); if (!ctx) { clearTimeout(timeout); URL.revokeObjectURL(url); reject(new Error("Could not create canvas context")); return; } canvas.width = video.videoWidth; canvas.height = video.videoHeight; const step = duration / numberOfFrames; try { for (let i = 0; i < numberOfFrames; i++) { const time = (step * i) + (step / 2); await new Promise((frameResolve) => { const onSeeked = () => { video.removeEventListener('seeked', onSeeked); frameResolve(); }; video.addEventListener('seeked', onSeeked); video.currentTime = Math.min(time, duration - 0.1); }); ctx.drawImage(video, 0, 0); frames.push(canvas.toDataURL('image/jpeg', 0.8)); } clearTimeout(timeout); URL.revokeObjectURL(url); video.src = ""; resolve(frames); } catch (e) { clearTimeout(timeout); URL.revokeObjectURL(url); reject(e); } }; video.onerror = () => { clearTimeout(timeout); URL.revokeObjectURL(url); reject(new Error("Failed to load video file")); }; video.src = url; }); }; const constructPrompt = ( triggerWord: string, customInstructions?: string, isCharacterTaggingEnabled?: boolean, characterShowName?: string ): string => { let basePrompt = `You are an expert captioner for AI model training data. Your task is to describe the provided image/video in detail for a style LoRA. Follow these rules strictly: 1. Start the caption with the trigger word: "${triggerWord}". 2. Describe EVERYTHING visible: characters, clothing, actions, background, objects, lighting, and camera angle. 3. Be objective and factual. 4. DO NOT mention the art style, "anime", "cartoon", "illustration", "2d", or "animation". 5. Write the description as a single, continuous paragraph.`; if (isCharacterTaggingEnabled && characterShowName && characterShowName.trim() !== '') { basePrompt += `\n6. After the description, identify any characters from the show "${characterShowName}" and append their tags to the very end of the caption, separated by commas. The format for each tag must be "char_[charactername]" (e.g., ", char_simon, char_kamina"). If no characters are recognized, add no tags.`; } if (customInstructions) { return `${basePrompt}\n\nIMPORTANT USER INSTRUCTIONS:\n${customInstructions}`; } return basePrompt; }; export const generateCaptionGrok = async ( apiKey: string, model: string, file: File, triggerWord: string, customInstructions?: string, isCharacterTaggingEnabled?: boolean, characterShowName?: string, videoFrameCount: number = 8, signal?: AbortSignal ): Promise => { if (!apiKey) throw new Error("xAI API Key is required for Grok."); const endpoint = 'https://api.x.ai/v1/chat/completions'; const prompt = constructPrompt(triggerWord, customInstructions, isCharacterTaggingEnabled, characterShowName); let contentParts: any[] = [{ type: "text", text: prompt }]; if (file.type.startsWith('video/')) { const frames = await extractFramesFromVideo(file, videoFrameCount); frames.forEach(frame => contentParts.push({ type: "image_url", image_url: { url: frame } })); } else { const base64Image = await fileToBase64(file); contentParts.push({ type: "image_url", image_url: { url: base64Image } }); } const payload = { model: model || 'grok-2-vision-1212', messages: [{ role: "user", content: contentParts }], max_tokens: 1000, temperature: 0.2 }; const response = await fetch(endpoint, { method: "POST", headers: { "Content-Type": "application/json", "Authorization": `Bearer ${apiKey}` }, body: JSON.stringify(payload), signal }); if (!response.ok) { const errData = await response.json().catch(() => ({})); throw new Error(`Grok API Error (${response.status}): ${errData.error?.message || response.statusText}`); } const data = await response.json(); return data.choices?.[0]?.message?.content?.trim() || ""; }; export const refineCaptionGrok = async ( apiKey: string, model: string, file: File, currentCaption: string, refinementInstructions: string, videoFrameCount: number = 4, signal?: AbortSignal ): Promise => { if (!apiKey) throw new Error("xAI API Key is required for Grok."); const endpoint = 'https://api.x.ai/v1/chat/completions'; const prompt = `Refine the following caption based on the visual information and the instructions. Output ONLY the refined text. CURRENT CAPTION: "${currentCaption}" INSTRUCTIONS: "${refinementInstructions}"`; let contentParts: any[] = [{ type: "text", text: prompt }]; if (file.type.startsWith('video/')) { const frames = await extractFramesFromVideo(file, videoFrameCount); frames.forEach(frame => contentParts.push({ type: "image_url", image_url: { url: frame } })); } else { const base64Image = await fileToBase64(file); contentParts.push({ type: "image_url", image_url: { url: base64Image } }); } const payload = { model: model || 'grok-2-vision-1212', messages: [{ role: "user", content: contentParts }], max_tokens: 1000, temperature: 0.2 }; const response = await fetch(endpoint, { method: "POST", headers: { "Content-Type": "application/json", "Authorization": `Bearer ${apiKey}` }, body: JSON.stringify(payload), signal }); if (!response.ok) throw new Error(`Grok API Error: ${response.status}`); const data = await response.json(); return data.choices?.[0]?.message?.content?.trim() || ""; }; export const checkQualityGrok = async ( apiKey: string, model: string, file: File, caption: string, videoFrameCount: number = 4, signal?: AbortSignal ): Promise => { if (!apiKey) throw new Error("xAI API Key is required for Grok."); const endpoint = 'https://api.x.ai/v1/chat/completions'; const prompt = `Evaluate the caption quality. Respond with ONLY an integer from 1 to 5.\nCaption: "${caption}"`; let contentParts: any[] = [{ type: "text", text: prompt }]; if (file.type.startsWith('video/')) { const frames = await extractFramesFromVideo(file, videoFrameCount); frames.forEach(frame => contentParts.push({ type: "image_url", image_url: { url: frame } })); } else { const base64Image = await fileToBase64(file); contentParts.push({ type: "image_url", image_url: { url: base64Image } }); } const payload = { model: model || 'grok-2-vision-1212', messages: [{ role: "user", content: contentParts }], max_tokens: 10, temperature: 0.1 }; const response = await fetch(endpoint, { method: "POST", headers: { "Content-Type": "application/json", "Authorization": `Bearer ${apiKey}` }, body: JSON.stringify(payload), signal }); if (!response.ok) throw new Error(`Grok API Error: ${response.status}`); const data = await response.json(); const text = data.choices?.[0]?.message?.content?.trim(); return parseInt(text?.match(/\d+/)?.[0] || '0', 10); };