Spaces:
Sleeping
Sleeping
| import { ClapProject, ClapSegmentCategory, getClapAssetSourceType, getValidNumber, newEntity } from "@aitube/clap" | |
| import { ClapCompletionMode, ClapEntityPrompt } from "@aitube/client" | |
| import { generateImageID } from "./generateImageID" | |
| import { generateAudioID } from "./generateAudioID" | |
| import { generateEntityPrompts } from "./generateEntityPrompts" | |
| import { clapToLatentStory } from "./clapToLatentStory" | |
/**
 * Subset of an entity prompt that is required to build a brand new entity.
 */
type EntityPromptFields = Pick<
  ClapEntityPrompt,
  "name" | "category" | "age" | "variant" | "region" | "identityImage" | "identityVoice"
>

// Placeholder used whenever an entity has no image or audio prompt yet.
// TODO: derive a real prompt from the entity itself instead of this constant.
const FALLBACK_ENTITY_PROMPT = "a man with a beard"

/**
 * Builds a new entity from the fields of an entity prompt.
 *
 * Image and audio prompts are intentionally left empty here: they are filled in
 * later by the repair pass of `editEntities`.
 */
function createEntityFromPrompt({
  name,
  category,
  age,
  variant,
  region,
  identityImage,
  identityVoice,
}: EntityPromptFields) {
  return newEntity({
    category,
    triggerName: name,
    label: name,
    description: name,
    author: "auto",
    thumbnailUrl: "",

    imagePrompt: "",
    imageSourceType: getClapAssetSourceType(identityImage),
    imageEngine: "SD Lightning",
    imageId: identityImage,

    audioPrompt: "",
    audioSourceType: getClapAssetSourceType(identityVoice),
    audioEngine: "Parler-TTS", // <- TODO: use OpenVoice 2, that way it can be personalized
    audioId: identityVoice,

    // note: using a numeric age should be deprecated,
    // instead we should be able to specify things using text,
    // eg. "8 months", "25 years old", "12th century"
    age: getValidNumber(age, 0, 120, 25),

    // TODO: delete gender and appearance, replace by a single concept of "variant"
    gender: "",
    appearance: variant,
    region: region,
  })
}

/**
 * Creates missing entities and completes the image/audio fields of every entity
 * of `existingClap`.
 *
 * Steps, in order:
 * 1. If `existingClap` has no entity and no `entityPrompts` were given, entity
 *    prompts are generated from the project description and its latent story;
 *    each generated entity is also assigned to the camera shots it was given.
 * 2. One entity is created for each item of `entityPrompts`.
 * 3. Every entity of `existingClap` gets an image prompt, image id, audio prompt
 *    and audio id if any of them is missing.
 *
 * Side effects: `existingClap.entities` (and the `entityId` of some of its camera
 * segments) are mutated in place. In any mode other than FULL, each entity that
 * was modified in step 3 and is not yet in `newerClap.entityIndex` is also added
 * to `newerClap`.
 *
 * NOTE(review): in FULL mode this function never writes to `newerClap`;
 * presumably the caller passes the same project for both arguments in that
 * case - confirm against the caller.
 *
 * @param existingClap - project whose entities are created/repaired (mutated)
 * @param newerClap - project receiving the modified entities in non-FULL modes
 * @param entityPrompts - optional descriptions of entities to add
 * @param mode - completion mode; anything other than FULL is handled as PARTIAL
 * @param turbo - forwarded to the entity prompt and image generators
 * @returns `newerClap`
 * @throws Error when there is still no entity after steps 1 and 2
 */
export async function editEntities({
  existingClap,
  newerClap,
  entityPrompts = [],
  mode = ClapCompletionMode.PARTIAL,
  turbo = false,
}: {
  existingClap: ClapProject
  newerClap: ClapProject
  entityPrompts?: ClapEntityPrompt[]
  mode?: ClapCompletionMode
  turbo?: boolean
}): Promise<ClapProject> {
  // note that we can only handle either FULL or PARTIAL
  // other modes such as MERGE, REPLACE.. are irrelevant since those are client-side modes
  // so from a server point of view those correspond to PARTIAL
  //
  // it is also worth noting that the use of FULL should be discouraged
  const isPartial = mode !== ClapCompletionMode.FULL

  // if we don't have existing entities, and user passed none,
  // then we need to hallucinate them
  if (existingClap.entities.length === 0 && entityPrompts.length === 0) {
    const entityPromptsWithShots = await generateEntityPrompts({
      prompt: existingClap.meta.description,
      latentStory: await clapToLatentStory(existingClap),
      turbo,
    })

    // shot ids returned by the generator are used as indices into this list
    const allShots = existingClap.segments.filter(s => s.category === ClapSegmentCategory.CAMERA)

    for (const { entityPrompt, shots: entityShots } of entityPromptsWithShots) {
      const newEnt = createEntityFromPrompt(entityPrompt)
      existingClap.entities.push(newEnt)

      // now let's assign our entity to shots!
      //
      // warning: the shot assignment is the responsibility of the LLM.
      // if the LLM hallucinates non-existing shot ids, it will cause trouble!
      for (const shotId of entityShots) {
        const shot = allShots[shotId]
        if (shot) {
          shot.entityId = newEnt.id
        } else {
          console.log(`[api/v1/edit/entities] warning: the LLM generated a non-existing shot (shot "${shotId}", but we only have ${allShots.length} shots)`)
        }
      }
    }
  }

  // otherwise try to add what's new
  for (const entityPrompt of entityPrompts) {
    existingClap.entities.push(createEntityFromPrompt(entityPrompt))
  }

  if (!existingClap.entities.length) { throw new Error(`please provide at least one entity`) }

  // then we try to automatically repair, edit, complete.. all the existing entities
  // (entities are processed one after the other, each generation call is awaited in turn)
  for (const entity of existingClap.entities) {
    let entityHasBeenModified = false

    // TASK 1: GENERATE THE IMAGE PROMPT IF MISSING
    if (!entity.imagePrompt) {
      entity.imagePrompt = FALLBACK_ENTITY_PROMPT
      entityHasBeenModified = true
    }

    // TASK 2: GENERATE THE IMAGE ID IF MISSING
    if (!entity.imageId) {
      entity.imageId = await generateImageID({
        prompt: entity.imagePrompt,
        seed: entity.seed,
        turbo,
      })
      entity.imageSourceType = getClapAssetSourceType(entity.imageId)
      entityHasBeenModified = true
    }

    // TASK 3: GENERATE THE AUDIO PROMPT IF MISSING
    if (!entity.audioPrompt) {
      entity.audioPrompt = FALLBACK_ENTITY_PROMPT
      entityHasBeenModified = true
    }

    // TASK 4: GENERATE THE AUDIO ID IF MISSING
    // TODO here: call Parler-TTS or a generic audio generator
    if (!entity.audioId) {
      entity.audioId = await generateAudioID({
        prompt: entity.audioPrompt,
        seed: entity.seed
      })
      entity.audioSourceType = getClapAssetSourceType(entity.audioId)
      entityHasBeenModified = true
    }

    // in case we are doing a partial update: only report the entities that changed,
    // and never register the same entity twice in the newer clap
    if (isPartial && entityHasBeenModified && !newerClap.entityIndex[entity.id]) {
      newerClap.entities.push(entity)
      newerClap.entityIndex[entity.id] = entity
    }
  }

  // console.log(`api/edit/entities(): returning the newerClap`)
  return newerClap
}