Spaces:

comfyuiman
/

loracaptionertaz_v2

Running

App Files Files Community

loracaptionertaz_v2 / services /grokService.ts

comfyuiman

Upload 18 files

7946263 verified about 1 month ago

raw

history blame contribute delete

8.58 kB


	/**
	* Service for interacting with xAI Grok via OpenAI-compatible vision endpoints.
	*/

	/**
	 * Reads a file and resolves with its contents as a `data:` URL string
	 * (the result of `FileReader.readAsDataURL`).
	 *
	 * @param file - The file to read.
	 * @returns A promise that resolves to the data URL.
	 * @throws Rejects with an `Error` when the read fails or the reader yields a
	 *   non-string result.
	 */
	const fileToBase64 = (file: File): Promise<string> => {
	  return new Promise((resolve, reject) => {
	    const reader = new FileReader();
	    reader.onload = () => {
	      if (typeof reader.result === 'string') {
	        resolve(reader.result);
	      } else {
	        reject(new Error('Failed to convert file to base64'));
	      }
	    };
	    // The error event is a ProgressEvent with no `message`; reject with the
	    // reader's underlying error so callers can rely on `error.message`.
	    reader.onerror = () => reject(reader.error ?? new Error('Failed to read file'));
	    // Start the read only after the handlers are attached.
	    reader.readAsDataURL(file);
	  });
	};

	/**
	 * Samples evenly spaced still frames from a video file.
	 *
	 * The video is loaded into a detached `<video>` element and its duration is
	 * split into `numberOfFrames` equal segments. The midpoint of each segment is
	 * drawn to a canvas and encoded as a JPEG data URL (quality 0.8).
	 *
	 * @param videoFile - Video file to sample.
	 * @param numberOfFrames - How many frames to capture.
	 * @returns Data URLs of the captured frames, in playback order.
	 * @throws Rejects when the video fails to load, a 2D canvas context cannot be
	 *   created, the duration is not finite, or processing exceeds 60 seconds.
	 */
	const extractFramesFromVideo = async (videoFile: File, numberOfFrames: number): Promise<string[]> => {
	  return new Promise((resolve, reject) => {
	    const video = document.createElement('video');
	    video.preload = 'metadata';
	    video.muted = true;
	    video.playsInline = true;
	    const url = URL.createObjectURL(videoFile);
	    const frames: string[] = [];
	    let settled = false;

	    // Single teardown path shared by every exit (success, failure, timeout):
	    // stop the timer, detach handlers so clearing `src` cannot re-enter them,
	    // release the object URL and the media element, then settle exactly once.
	    const finish = (settle: () => void) => {
	      if (settled) return;
	      settled = true;
	      clearTimeout(timeout);
	      video.onloadeddata = null;
	      video.onerror = null;
	      URL.revokeObjectURL(url);
	      video.src = "";
	      settle();
	    };
	    const fail = (reason: unknown) => finish(() => reject(reason));

	    const timeout = setTimeout(() => fail(new Error("Video processing timed out")), 60000);

	    video.onloadeddata = async () => {
	      const canvas = document.createElement('canvas');
	      const ctx = canvas.getContext('2d');
	      if (!ctx) {
	        fail(new Error("Could not create canvas context"));
	        return;
	      }
	      const duration = video.duration;
	      // A NaN/Infinity duration would yield non-finite seek targets; report it
	      // explicitly instead of letting the `currentTime` setter throw.
	      if (numberOfFrames > 0 && !Number.isFinite(duration)) {
	        fail(new Error("Video duration is unavailable; cannot seek to frames"));
	        return;
	      }
	      canvas.width = video.videoWidth;
	      canvas.height = video.videoHeight;
	      const step = duration / numberOfFrames;
	      // Stay 0.1s short of the end, but never request a negative position
	      // for clips shorter than that margin.
	      const lastSeekable = Math.max(0, duration - 0.1);
	      try {
	        for (let i = 0; i < numberOfFrames; i++) {
	          // Midpoint of the i-th segment.
	          const time = (step * i) + (step / 2);
	          await new Promise<void>((frameResolve) => {
	            video.addEventListener('seeked', () => frameResolve(), { once: true });
	            video.currentTime = Math.min(time, lastSeekable);
	          });
	          // The timeout or a load error may have settled the promise while
	          // this seek was pending; stop instead of drawing further frames.
	          if (settled) return;
	          ctx.drawImage(video, 0, 0);
	          frames.push(canvas.toDataURL('image/jpeg', 0.8));
	        }
	        finish(() => resolve(frames));
	      } catch (e) {
	        fail(e);
	      }
	    };
	    video.onerror = () => fail(new Error("Failed to load video file"));
	    video.src = url;
	  });
	};

	/**
	 * Assembles the captioning instructions sent to the model.
	 *
	 * The prompt always contains five fixed rules. A sixth rule asking for
	 * `char_[charactername]` tags is appended only when character tagging is
	 * enabled and a non-blank show name is given. Any custom instructions are
	 * appended last under an "IMPORTANT USER INSTRUCTIONS" heading.
	 *
	 * @param triggerWord - Word the caption must start with.
	 * @param customInstructions - Optional extra instructions from the user.
	 * @param isCharacterTaggingEnabled - Whether to request character tags.
	 * @param characterShowName - Show whose characters should be tagged.
	 * @returns The complete prompt text.
	 */
	const constructPrompt = (
	  triggerWord: string,
	  customInstructions?: string,
	  isCharacterTaggingEnabled?: boolean,
	  characterShowName?: string
	): string => {
	  const coreRules = `You are an expert captioner for AI model training data. Your task is to describe the provided image/video in detail for a style LoRA. Follow these rules strictly:
	1. Start the caption with the trigger word: "${triggerWord}".
	2. Describe EVERYTHING visible: characters, clothing, actions, background, objects, lighting, and camera angle.
	3. Be objective and factual.
	4. DO NOT mention the art style, "anime", "cartoon", "illustration", "2d", or "animation".
	5. Write the description as a single, continuous paragraph.`;

	  // Rule 6 applies only when tagging is on and the show name is not blank.
	  const hasShowName = (characterShowName ?? '').trim() !== '';
	  const taggingRule = isCharacterTaggingEnabled && hasShowName
	    ? `\n6. After the description, identify any characters from the show "${characterShowName}" and append their tags to the very end of the caption, separated by commas. The format for each tag must be "char_[charactername]" (e.g., ", char_simon, char_kamina"). If no characters are recognized, add no tags.`
	    : '';

	  const rules = coreRules + taggingRule;
	  return customInstructions
	    ? `${rules}\n\nIMPORTANT USER INSTRUCTIONS:\n${customInstructions}`
	    : rules;
	};

	/**
	 * Generates a caption for an image or video by sending it, together with the
	 * captioning prompt, to the xAI chat completions endpoint.
	 *
	 * Videos (MIME type starting with `video/`) are sent as `videoFrameCount`
	 * sampled frames; any other file is sent as a single data-URL image.
	 *
	 * @param apiKey - xAI API key, sent as a Bearer token.
	 * @param model - Model id; an empty value falls back to `grok-2-vision-1212`.
	 * @param file - Image or video to caption.
	 * @param triggerWord - Word the caption must start with.
	 * @param customInstructions - Optional extra instructions appended to the prompt.
	 * @param isCharacterTaggingEnabled - Whether to request character tags.
	 * @param characterShowName - Show whose characters should be tagged.
	 * @param videoFrameCount - Number of frames to sample from a video (default 8).
	 * @param signal - Optional abort signal passed to `fetch`.
	 * @returns The trimmed caption text, or `""` when the response carries none.
	 * @throws Error when the API key is missing or the API responds with a non-OK status.
	 */
	export const generateCaptionGrok = async (
	  apiKey: string,
	  model: string,
	  file: File,
	  triggerWord: string,
	  customInstructions?: string,
	  isCharacterTaggingEnabled?: boolean,
	  characterShowName?: string,
	  videoFrameCount: number = 8,
	  signal?: AbortSignal
	): Promise<string> => {
	  if (!apiKey) throw new Error("xAI API Key is required for Grok.");
	  const endpoint = 'https://api.x.ai/v1/chat/completions';
	  const prompt = constructPrompt(triggerWord, customInstructions, isCharacterTaggingEnabled, characterShowName);

	  type ContentPart =
	    | { type: "text"; text: string }
	    | { type: "image_url"; image_url: { url: string } };

	  const contentParts: ContentPart[] = [{ type: "text", text: prompt }];
	  if (file.type.startsWith('video/')) {
	    const frames = await extractFramesFromVideo(file, videoFrameCount);
	    for (const frame of frames) {
	      contentParts.push({ type: "image_url", image_url: { url: frame } });
	    }
	  } else {
	    const base64Image = await fileToBase64(file);
	    contentParts.push({ type: "image_url", image_url: { url: base64Image } });
	  }

	  const payload = {
	    // `||` rather than `??` on purpose: an empty model string also falls back.
	    model: model || 'grok-2-vision-1212',
	    messages: [{ role: "user", content: contentParts }],
	    max_tokens: 1000,
	    temperature: 0.2
	  };

	  const response = await fetch(endpoint, {
	    method: "POST",
	    headers: {
	      "Content-Type": "application/json",
	      "Authorization": `Bearer ${apiKey}`
	    },
	    body: JSON.stringify(payload),
	    signal
	  });

	  if (!response.ok) {
	    // The error body may be missing, non-JSON, or literally `null`.
	    const errData = await response.json().catch(() => ({}));
	    throw new Error(`Grok API Error (${response.status}): ${errData?.error?.message || response.statusText}`);
	  }

	  const data = await response.json();
	  const content: unknown = data?.choices?.[0]?.message?.content;
	  return typeof content === 'string' ? content.trim() : "";
	};

	/**
	 * Asks the model to rewrite an existing caption according to the given
	 * instructions, using the image or sampled video frames as visual context.
	 *
	 * Videos (MIME type starting with `video/`) are sent as `videoFrameCount`
	 * sampled frames; any other file is sent as a single data-URL image.
	 *
	 * @param apiKey - xAI API key, sent as a Bearer token.
	 * @param model - Model id; an empty value falls back to `grok-2-vision-1212`.
	 * @param file - Image or video the caption describes.
	 * @param currentCaption - Caption to refine.
	 * @param refinementInstructions - How the caption should be changed.
	 * @param videoFrameCount - Number of frames to sample from a video (default 4).
	 * @param signal - Optional abort signal passed to `fetch`.
	 * @returns The trimmed refined caption, or `""` when the response carries none.
	 * @throws Error when the API key is missing or the API responds with a non-OK status.
	 */
	export const refineCaptionGrok = async (
	  apiKey: string,
	  model: string,
	  file: File,
	  currentCaption: string,
	  refinementInstructions: string,
	  videoFrameCount: number = 4,
	  signal?: AbortSignal
	): Promise<string> => {
	  if (!apiKey) throw new Error("xAI API Key is required for Grok.");
	  const endpoint = 'https://api.x.ai/v1/chat/completions';
	  const prompt = `Refine the following caption based on the visual information and the instructions. Output ONLY the refined text.
	CURRENT CAPTION: "${currentCaption}"
	INSTRUCTIONS: "${refinementInstructions}"`;

	  type ContentPart =
	    | { type: "text"; text: string }
	    | { type: "image_url"; image_url: { url: string } };

	  const contentParts: ContentPart[] = [{ type: "text", text: prompt }];
	  if (file.type.startsWith('video/')) {
	    const frames = await extractFramesFromVideo(file, videoFrameCount);
	    for (const frame of frames) {
	      contentParts.push({ type: "image_url", image_url: { url: frame } });
	    }
	  } else {
	    const base64Image = await fileToBase64(file);
	    contentParts.push({ type: "image_url", image_url: { url: base64Image } });
	  }

	  const payload = {
	    // `||` rather than `??` on purpose: an empty model string also falls back.
	    model: model || 'grok-2-vision-1212',
	    messages: [{ role: "user", content: contentParts }],
	    max_tokens: 1000,
	    temperature: 0.2
	  };

	  const response = await fetch(endpoint, {
	    method: "POST",
	    headers: {
	      "Content-Type": "application/json",
	      "Authorization": `Bearer ${apiKey}`
	    },
	    body: JSON.stringify(payload),
	    signal
	  });

	  if (!response.ok) {
	    // Include the API's own message (when the body provides one) so failures
	    // are reported in the same form as generateCaptionGrok.
	    const errData = await response.json().catch(() => ({}));
	    throw new Error(`Grok API Error (${response.status}): ${errData?.error?.message || response.statusText}`);
	  }

	  const data = await response.json();
	  const content: unknown = data?.choices?.[0]?.message?.content;
	  return typeof content === 'string' ? content.trim() : "";
	};

	/**
	 * Asks the model to rate how well a caption matches the image or sampled
	 * video frames, requesting a single integer from 1 to 5.
	 *
	 * Videos (MIME type starting with `video/`) are sent as `videoFrameCount`
	 * sampled frames; any other file is sent as a single data-URL image.
	 *
	 * @param apiKey - xAI API key, sent as a Bearer token.
	 * @param model - Model id; an empty value falls back to `grok-2-vision-1212`.
	 * @param file - Image or video the caption describes.
	 * @param caption - Caption to evaluate.
	 * @param videoFrameCount - Number of frames to sample from a video (default 4).
	 * @param signal - Optional abort signal passed to `fetch`.
	 * @returns The first run of digits in the reply parsed as an integer, or `0`
	 *   when the reply contains none. The value is not clamped to 1-5.
	 * @throws Error when the API key is missing or the API responds with a non-OK status.
	 */
	export const checkQualityGrok = async (
	  apiKey: string,
	  model: string,
	  file: File,
	  caption: string,
	  videoFrameCount: number = 4,
	  signal?: AbortSignal
	): Promise<number> => {
	  if (!apiKey) throw new Error("xAI API Key is required for Grok.");
	  const endpoint = 'https://api.x.ai/v1/chat/completions';
	  const prompt = `Evaluate the caption quality. Respond with ONLY an integer from 1 to 5.\nCaption: "${caption}"`;

	  type ContentPart =
	    | { type: "text"; text: string }
	    | { type: "image_url"; image_url: { url: string } };

	  const contentParts: ContentPart[] = [{ type: "text", text: prompt }];
	  if (file.type.startsWith('video/')) {
	    const frames = await extractFramesFromVideo(file, videoFrameCount);
	    for (const frame of frames) {
	      contentParts.push({ type: "image_url", image_url: { url: frame } });
	    }
	  } else {
	    const base64Image = await fileToBase64(file);
	    contentParts.push({ type: "image_url", image_url: { url: base64Image } });
	  }

	  const payload = {
	    // `||` rather than `??` on purpose: an empty model string also falls back.
	    model: model || 'grok-2-vision-1212',
	    messages: [{ role: "user", content: contentParts }],
	    max_tokens: 10,
	    temperature: 0.1
	  };

	  const response = await fetch(endpoint, {
	    method: "POST",
	    headers: {
	      "Content-Type": "application/json",
	      "Authorization": `Bearer ${apiKey}`
	    },
	    body: JSON.stringify(payload),
	    signal
	  });

	  if (!response.ok) {
	    // Include the API's own message (when the body provides one) so failures
	    // are reported in the same form as generateCaptionGrok.
	    const errData = await response.json().catch(() => ({}));
	    throw new Error(`Grok API Error (${response.status}): ${errData?.error?.message || response.statusText}`);
	  }

	  const data = await response.json();
	  const content: unknown = data?.choices?.[0]?.message?.content;
	  const text = typeof content === 'string' ? content.trim() : '';
	  // Take the first run of digits anywhere in the reply; '0' signals "no score".
	  return parseInt(text.match(/\d+/)?.[0] || '0', 10);
	};