File size: 2,604 Bytes
f8b5d42 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
/**
 * Registry of the embedding models that can be run natively, keyed by model id.
 *
 * Every entry has the shape:
 *   {
 *     maxConcurrentChunks,      // number
 *     embeddingMaxChunkLength,  // number - measured in CHARACTERS, not tokens
 *     chunkPrefix,              // string
 *     queryPrefix,              // string
 *     apiInfo: { id, name, description, lang, size, modelCard },
 *   }
 *
 * `embeddingMaxChunkLength` is currently a character budget for a single pass,
 * not the model's token limit. Each value is derived from the token limit on
 * the model card assuming roughly 2 characters per token, which deliberately
 * undershoots.
 *
 * NOTE(review): `chunkPrefix` / `queryPrefix` are presumably prepended to
 * document chunks and search queries before embedding - confirm against the
 * embedder that consumes this table.
 *
 * The table is written as a flat list of descriptors; the mapper below moves
 * the display-oriented fields under `apiInfo` and keys each entry by its `id`.
 */
const SUPPORTED_NATIVE_EMBEDDING_MODELS = Object.fromEntries(
  [
    {
      id: "Xenova/all-MiniLM-L6-v2",
      name: "all-MiniLM-L6-v2",
      description:
        "A lightweight and fast model for embedding text. The default model for AnythingLLM.",
      lang: "English",
      size: "23MB",
      modelCard: "https://huggingface.co/Xenova/all-MiniLM-L6-v2",
      maxConcurrentChunks: 25,
      // Model card limit is 512 tokens; expressed here as ~1,000 characters.
      embeddingMaxChunkLength: 1_000,
      chunkPrefix: "",
      queryPrefix: "",
    },
    {
      id: "Xenova/nomic-embed-text-v1",
      name: "nomic-embed-text-v1",
      description:
        "A high-performing open embedding model with a large token context window. Requires more processing power and memory.",
      lang: "English",
      size: "139MB",
      modelCard: "https://huggingface.co/Xenova/nomic-embed-text-v1",
      maxConcurrentChunks: 5,
      // Model card limit is 8192 tokens; expressed here as ~16,000 characters.
      embeddingMaxChunkLength: 16_000,
      chunkPrefix: "search_document: ",
      queryPrefix: "search_query: ",
    },
    {
      id: "MintplexLabs/multilingual-e5-small",
      name: "multilingual-e5-small",
      description:
        "A larger multilingual embedding model that supports 100+ languages. Requires more processing power and memory.",
      lang: "100+ languages",
      size: "487MB",
      // The model card link points at the intfloat repository, not the id above.
      modelCard: "https://huggingface.co/intfloat/multilingual-e5-small",
      maxConcurrentChunks: 5,
      // Model card limit is 512 tokens; expressed here as ~1,000 characters.
      embeddingMaxChunkLength: 1_000,
      chunkPrefix: "passage: ",
      queryPrefix: "query: ",
    },
  ].map(
    ({
      maxConcurrentChunks,
      embeddingMaxChunkLength,
      chunkPrefix,
      queryPrefix,
      ...apiInfo
    }) => [
      apiInfo.id,
      {
        maxConcurrentChunks,
        embeddingMaxChunkLength,
        chunkPrefix,
        queryPrefix,
        apiInfo,
      },
    ]
  )
);
module.exports = {
SUPPORTED_NATIVE_EMBEDDING_MODELS,
};
|