Spaces:

aukaru
/

344

Paused

App Files Files Community

344 / src /shared /tokenization /openai.ts

aukaru

Upload 236 files

5c5b371 verified 7 months ago

raw

history blame contribute delete

7.36 kB

	import { Tiktoken } from "tiktoken/lite";
	import cl100k_base from "tiktoken/encoders/cl100k_base.json";
	import { logger } from "../../logger";
	import { libSharp } from "../file-storage";
	import { GoogleAIChatMessage, OpenAIChatMessage } from "../api-schemas";

	// Child logger created with module/service bindings for this tokenizer.
	const log = logger.child({ module: "tokenizer", service: "openai" });
	// Starting token count for chat prompts whose model name contains "vision".
	const GPT4_VISION_SYSTEM_PROMPT_SIZE = 170;

	// Shared tiktoken encoder; it is only assigned inside init().
	let encoder: Tiktoken;

	/**
	 * Builds the module-level encoder from the bundled cl100k_base data.
	 * The counting functions in this module read that encoder, so call this
	 * before any of them.
	 *
	 * @returns Always `true`.
	 */
	export function init() {
		const { bpe_ranks, special_tokens, pat_str } = cl100k_base;
		encoder = new Tiktoken(bpe_ranks, special_tokens, pat_str);
		return true;
	}

	// Tested against:
	// https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb

	/**
	 * Counts the prompt tokens of an OpenAI text or chat request.
	 *
	 * String prompts are delegated to `getTextTokenCount`. Chat prompts are
	 * counted message by message: a fixed per-message overhead, the encoded
	 * length of every truthy field except `function_call`, the cost of each
	 * attached image, and finally 3 tokens that prime the assistant reply.
	 *
	 * @param prompt Raw prompt text, or the list of chat messages.
	 * @param model Requested model name; it selects the legacy `turbo-0301`
	 *   overheads and the flat starting amount for vision models.
	 * @returns The tokenizer label and the total token count.
	 * @throws Error when a field is longer than 800000 characters or the
	 *   running total exceeds 200000 tokens, and when an image is rejected.
	 */
	export async function getTokenCount(
		prompt: string | OpenAIChatMessage[],
		model: string
	) {
		if (typeof prompt === "string") {
			return getTextTokenCount(prompt);
		}

		// Only gpt-3.5-turbo-0301 uses the legacy overheads. The model id starts
		// with "gpt-3.5-", so a prefix match on "turbo-0301" could never succeed.
		const oldFormatting = model.includes("turbo-0301");
		const vision = model.includes("vision");

		const tokensPerMessage = oldFormatting ? 4 : 3;
		// Older formatting replaces the role with the name when a name is present.
		const tokensPerName = oldFormatting ? -1 : 1;

		let numTokens = vision ? GPT4_VISION_SYSTEM_PROMPT_SIZE : 0;

		for (const message of prompt) {
			numTokens += tokensPerMessage;
			for (const key of Object.keys(message)) {
				const value = message[key as keyof OpenAIChatMessage];

				if (!value) continue;
				if (key === "function_call") continue;

				let textContent = "";
				if (Array.isArray(value)) {
					// Multi-part content: text parts are concatenated, image parts are
					// priced separately, any other part type contributes nothing.
					for (const item of value) {
						if (item.type === "text") {
							textContent += item.text;
						} else if (["image", "image_url"].includes(item.type)) {
							const { url, detail } = item.image_url;
							const cost = await getGpt4VisionTokenCost(url, detail);
							numTokens += cost ?? 0;
						}
					}
				} else {
					textContent = value as string;
				}

				// Refuse oversized input before handing it to the encoder.
				if (textContent.length > 800000 || numTokens > 200000) {
					throw new Error("Content is too large to tokenize.");
				}

				numTokens += encoder.encode(textContent).length;
				if (key === "name") {
					numTokens += tokensPerName;
				}
			}
		}
		numTokens += 3; // every reply is primed with <|start|>assistant<|message|>
		return { tokenizer: "tiktoken", token_count: numTokens };
	}

	/**
	 * Estimates the prompt-token cost of one image attached to a vision request.
	 *
	 * Low detail is a flat 85 tokens. High detail fits the image inside
	 * 2048x2048, rescales it so the shorter side is 768, and charges 170 tokens
	 * per 512-pixel tile plus 85. With `detail` set to "auto", images covering
	 * more than 512*512 pixels are treated as high detail, all others as low.
	 *
	 * @param url Image as a `data:image/...` base64 data URL.
	 * @param detail Requested detail level; defaults to "auto".
	 * @returns The token cost of the image.
	 * @throws Error when `url` is not an image data URL, carries no base64
	 *   payload, or the image's width/height cannot be read.
	 */
	async function getGpt4VisionTokenCost(
		url: string,
		detail: "auto" | "low" | "high" = "auto"
	) {
		// For now we do not allow remote images as the proxy would have to download
		// them, which is a potential DoS vector.
		if (!url.startsWith("data:image/")) {
			throw new Error(
				"Remote images are not supported. Add the image to your prompt as a base64 data URL."
			);
		}

		// A data URL with no comma, or with nothing after it, carries no payload.
		// Reject it here rather than letting Buffer.from fail on `undefined`.
		const base64Data = url.split(",")[1];
		if (!base64Data) {
			throw new Error("Prompt includes an image that could not be parsed");
		}

		const buffer = Buffer.from(base64Data, "base64");
		const image = libSharp(buffer);
		const metadata = await image.metadata();

		if (!metadata || !metadata.width || !metadata.height) {
			throw new Error("Prompt includes an image that could not be parsed");
		}

		const { width, height } = metadata;

		let selectedDetail: "low" | "high";
		if (detail === "auto") {
			const threshold = 512 * 512;
			const imageSize = width * height;
			selectedDetail = imageSize > threshold ? "high" : "low";
		} else {
			selectedDetail = detail;
		}

		// https://platform.openai.com/docs/guides/vision/calculating-costs
		if (selectedDetail === "low") {
			log.info(
				{ width, height, tokens: 85 },
				"Using fixed GPT-4-Vision token cost for low detail image"
			);
			return 85;
		}

		// Step 1: shrink to fit within 2048x2048, preserving the aspect ratio.
		let newWidth = width;
		let newHeight = height;
		if (width > 2048 || height > 2048) {
			const aspectRatio = width / height;
			if (width > height) {
				newWidth = 2048;
				newHeight = Math.round(2048 / aspectRatio);
			} else {
				newHeight = 2048;
				newWidth = Math.round(2048 * aspectRatio);
			}
		}

		// Step 2: rescale so that the shorter side is exactly 768.
		if (newWidth < newHeight) {
			newHeight = Math.round((newHeight / newWidth) * 768);
			newWidth = 768;
		} else {
			newWidth = Math.round((newWidth / newHeight) * 768);
			newHeight = 768;
		}

		// Step 3: count the 512-pixel tiles needed to cover the rescaled image.
		const tiles = Math.ceil(newWidth / 512) * Math.ceil(newHeight / 512);
		const tokens = 170 * tiles + 85;

		log.info(
			{ width, height, newWidth, newHeight, tiles, tokens },
			"Calculated GPT-4-Vision token cost for high detail image"
		);
		return tokens;
	}

	/**
	 * Counts the tokens in a plain-text prompt.
	 *
	 * Prompts longer than 500000 characters are not encoded at all; they are
	 * reported as a fixed 100000 tokens under the "length fallback" label.
	 *
	 * @param prompt Text to measure.
	 * @returns The tokenizer label and the token count.
	 */
	function getTextTokenCount(prompt: string) {
		const tooLongToEncode = prompt.length > 500000;
		return tooLongToEncode
			? { tokenizer: "length fallback", token_count: 100000 }
			: { tokenizer: "tiktoken", token_count: encoder.encode(prompt).length };
	}

	// Per-image prices used by getOpenAIImageCost:
	//
	// Model        Resolution             Price
	// DALL·E 3     1024×1024              $0.040 / image
	//              1024×1792, 1792×1024   $0.080 / image
	// DALL·E 3 HD  1024×1024              $0.080 / image
	//              1024×1792, 1792×1024   $0.120 / image
	// DALL·E 2     1024×1024              $0.020 / image
	//              512×512                $0.018 / image
	//              256×256                $0.016 / image

	export const DALLE_TOKENS_PER_DOLLAR = 100000;

	/**
	 * OpenAI image generation doesn't use tokens but everything else in the
	 * application does. There is a fixed cost for each image generation request
	 * depending on the model and selected quality/resolution parameters, which
	 * we convert to tokens at a rate of 100000 tokens per dollar.
	 *
	 * For dall-e-3 any quality other than "standard" is billed at the HD price.
	 * gpt-image-1 is billed at a flat $0.04 regardless of quality or resolution.
	 *
	 * @param params Model, quality, resolution and image count of the request;
	 *   a `null` count is treated as one image.
	 * @returns The tokenizer label and the cost in tokens, rounded up.
	 * @throws Error when the resolution is not offered by the selected DALL-E
	 *   model, or when the model is not recognised.
	 */
	export function getOpenAIImageCost(params: {
		model: "dall-e-2" | "dall-e-3" | "gpt-image-1";
		quality: "standard" | "hd" | "high" | "medium" | "low" | "auto";
		resolution:
			| "512x512"
			| "256x256"
			| "1024x1024"
			| "1024x1792"
			| "1792x1024"
			| "1536x1024"
			| "1024x1536"
			| "auto";
		n: number | null;
	}) {
		const { model, quality, resolution, n } = params;
		const usd = (() => {
			switch (model) {
				case "dall-e-2":
					switch (resolution) {
						case "512x512":
							return 0.018;
						case "256x256":
							return 0.016;
						case "1024x1024":
							return 0.02;
						default:
							throw new Error("Invalid resolution");
					}
				case "dall-e-3":
					switch (resolution) {
						case "1024x1024":
							return quality === "standard" ? 0.04 : 0.08;
						case "1024x1792":
						case "1792x1024":
							return quality === "standard" ? 0.08 : 0.12;
						default:
							throw new Error("Invalid resolution");
					}
				case "gpt-image-1":
					// gpt-image-1 pricing is approximately $0.04 per image
					// This is a simplified pricing model, adjust as needed based on official pricing
					return 0.04;
				default:
					throw new Error("Invalid image generation model");
			}
		})();

		const tokens = (n ?? 1) * (usd * DALLE_TOKENS_PER_DOLLAR);

		return {
			tokenizer: `openai-image cost`,
			token_count: Math.ceil(tokens),
		};
	}

	/**
	 * Estimates the prompt tokens of a Google AI request using the tiktoken
	 * encoder of this module.
	 *
	 * String prompts are delegated to `getTextTokenCount`. For chat prompts each
	 * message costs 3 tokens plus the encoded length of all of its text parts;
	 * parts without text contribute nothing. A further 3 tokens are added once
	 * at the end.
	 *
	 * @param prompt Raw prompt text, or the list of chat messages.
	 * @returns The tokenizer label and the estimated token count.
	 */
	export function estimateGoogleAITokenCount(
		prompt: string | GoogleAIChatMessage[]
	) {
		if (typeof prompt === "string") {
			return getTextTokenCount(prompt);
		}

		const tokensPerMessage = 3;

		let numTokens = 0;
		for (const message of prompt) {
			numTokens += tokensPerMessage;
			// A message may hold several text parts; count every one of them rather
			// than only the first, which would under-report multi-part messages.
			for (const part of message.parts) {
				if ("text" in part && typeof part.text === "string") {
					numTokens += encoder.encode(part.text).length;
				}
			}
		}

		numTokens += 3;

		return {
			tokenizer: "tiktoken (google-ai estimate)",
			token_count: numTokens,
		};
	}