/**
* LlamaCppLLM - node-llama-cpp wrapper as a Runnable
*
* @module llm/llama-cpp-llm
*/
import { Runnable } from '../core/runnable.js';
import { AIMessage, HumanMessage } from '../core/message.js';
import { getLlama, LlamaChatSession } from 'node-llama-cpp';
/**
* LlamaCppLLM - A Runnable wrapper for node-llama-cpp
*
* Wraps your LLM calls from agent fundamentals into a reusable,
* composable Runnable component.
*
* Key benefits over raw node-llama-cpp:
* - Composable with other Runnables via .pipe()
* - Supports batch processing multiple inputs
* - Built-in streaming support
* - Consistent interface across all LLMs
* - Easy to swap with other LLM providers
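*
* @example Composing in a pipeline (sketch; assumes promptFormatter and outputParser are Runnables defined elsewhere in your app)
* ```javascript
* const pipeline = promptFormatter.pipe(llm).pipe(outputParser);
* const result = await pipeline.invoke("Summarize the release notes");
* ```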
*/
export class LlamaCppLLM extends Runnable {
/**
* Create a new LlamaCppLLM instance
*
* @param {Object} options - Configuration options
* @param {string} options.modelPath - Path to your GGUF model file (REQUIRED)
* @param {number} [options.temperature=0.7] - Sampling temperature (0-1)
* - Lower (0.1): More focused, deterministic
* - Higher (0.9): More creative, random
* @param {number} [options.topP=0.9] - Nucleus sampling threshold
* @param {number} [options.topK=40] - Top-K sampling parameter
* @param {number} [options.maxTokens=2048] - Maximum tokens to generate
* @param {number} [options.repeatPenalty=1.1] - Penalty for repeating tokens
* @param {number} [options.contextSize=4096] - Context window size
* @param {number} [options.batchSize=512] - Batch processing size
* @param {boolean} [options.verbose=false] - Enable debug logging
* @param {string[]} [options.stopStrings] - Strings that stop generation
* @param {Object} [options.chatWrapper] - Custom chat wrapper instance (e.g., QwenChatWrapper)
* - If not provided, the library will automatically select the best wrapper for your model
*
* @example Basic Setup
* ```javascript
* const llm = new LlamaCppLLM({
* modelPath: './models/Meta-Llama-3.1-8B-Instruct-Q5_K_S.gguf',
* temperature: 0.7
* });
* ```
*
* @example With Qwen Chat Wrapper (Discourage Thoughts)
* ```javascript
* import { QwenChatWrapper } from 'node-llama-cpp';
*
* const llm = new LlamaCppLLM({
* modelPath: './models/Qwen3-1.7B-Q6_K.gguf',
* temperature: 0.7,
* chatWrapper: new QwenChatWrapper({
* thoughts: 'discourage'
* })
* });
* ```
*
* @example Different Configurations for Different Tasks
* ```javascript
* // Creative writing (higher temperature)
* const creative = new LlamaCppLLM({
* modelPath: './model.gguf',
* temperature: 0.9,
* maxTokens: 1000
* });
*
* // Factual responses (lower temperature)
* const factual = new LlamaCppLLM({
* modelPath: './model.gguf',
* temperature: 0.1,
* maxTokens: 500
* });
* ```
*/
constructor(options = {}) {
super();
// Validate required options
this.modelPath = options.modelPath;
if (!this.modelPath) {
throw new Error(
'modelPath is required. Example: new LlamaCppLLM({ modelPath: "./model.gguf" })'
);
}
// Generation parameters
// These control how the LLM generates text - same as in your fundamentals!
this.temperature = options.temperature ?? 0.7;
this.topP = options.topP ?? 0.9;
this.topK = options.topK ?? 40;
this.maxTokens = options.maxTokens ?? 2048;
this.repeatPenalty = options.repeatPenalty ?? 1.1;
// Context configuration
this.contextSize = options.contextSize ?? 4096;
this.batchSize = options.batchSize ?? 512;
// Behavior
this.verbose = options.verbose ?? false;
// Chat wrapper configuration
// If not provided, LlamaChatSession will auto-select the best wrapper
this.chatWrapper = options.chatWrapper ?? 'auto';
// Stop strings - when the model sees these, it stops generating
// Default includes common chat separators
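// Example: without these, a model can run past its answer and start writing
// the user's next turn itself, e.g. "...is 42\n\nHuman: what else?"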
this.stopStrings = options.stopStrings ?? [
'Human:',
'User:',
'\n\nHuman:',
'\n\nUser:'
];
// Internal state (lazy initialized)
this._llama = null;
this._model = null;
this._context = null;
this._chatSession = null;
this._initialized = false;
}
/**
* Initialize model (lazy loading)
*
* This loads the model only when first needed, not at construction.
* This pattern is useful because model loading is slow - we only
* want to do it once and only when we actually need it.
*
* @private
* @throws {Error} If model loading fails
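*
* @example Lazy loading in practice
* ```javascript
* const llm = new LlamaCppLLM({ modelPath: './model.gguf' }); // fast: nothing loaded yet
* await llm.invoke("Hi");    // slow: the model is loaded on this first call
* await llm.invoke("Again"); // fast: the model is already in memory
* ```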
*/
async _initialize() {
// Skip if already initialized
if (this._initialized) return;
if (this.verbose) {
console.log(`Loading model: ${this.modelPath}`);
}
try {
// Step 1: Get the llama instance
this._llama = await getLlama();
// Step 2: Load the model file
this._model = await this._llama.loadModel({
modelPath: this.modelPath
});
// Step 3: Create a context for generation
this._context = await this._model.createContext({
contextSize: this.contextSize,
batchSize: this.batchSize
});
// Step 4: Create a chat session
// This manages conversation history for us
const contextSequence = this._context.getSequence();
const sessionConfig = {
contextSequence
};
// Add chatWrapper if specified (otherwise LlamaChatSession uses "auto")
if (this.chatWrapper !== 'auto') {
sessionConfig.chatWrapper = this.chatWrapper;
}
this._chatSession = new LlamaChatSession(sessionConfig);
this._initialized = true;
if (this.verbose) {
console.log('✓ Model loaded successfully');
if (this.chatWrapper !== 'auto') {
console.log(`✓ Using custom chat wrapper: ${this.chatWrapper.constructor.name}`);
} else {
console.log('✓ Using auto-detected chat wrapper');
}
}
} catch (error) {
throw new Error(
`Failed to initialize model at ${this.modelPath}: ${error.message}`
);
}
}
/**
* Convert our Message objects to node-llama-cpp chat history format
*
* This bridges between our standardized Message types and what
* node-llama-cpp expects. Think of it as a translator.
*
* @private
* @param {Array<Message>} messages - Array of Message objects
* @returns {Array<Object>} Chat history in llama.cpp format
*
* @example
* ```javascript
* // Input: Our messages
* [
* new SystemMessage("You are helpful"),
* new HumanMessage("Hi"),
* new AIMessage("Hello!")
* ]
*
* // Output: llama.cpp format
* [
* { type: 'system', text: 'You are helpful' },
* { type: 'user', text: 'Hi' },
* { type: 'model', response: 'Hello!' }
* ]
* ```
*/
_messagesToChatHistory(messages) {
return messages.map(msg => {
// System messages: instructions for the AI
if (msg._type === 'system') {
return { type: 'system', text: msg.content };
}
// Human messages: user input
else if (msg._type === 'human') {
return { type: 'user', text: msg.content };
}
// AI messages: previous AI responses
else if (msg._type === 'ai') {
return { type: 'model', response: msg.content };
}
// Tool messages: results from tool execution
else if (msg._type === 'tool') {
// Convert tool results to system messages
return { type: 'system', text: `Tool Result: ${msg.content}` };
}
// Fallback: treat unknown types as user messages
return { type: 'user', text: msg.content };
});
}
/**
* Clean up model response
*
* Sometimes models include extra prefixes or suffixes.
* This cleans them up for a better user experience.
*
* @private
* @param {string} response - Raw model response
* @returns {string} Cleaned response
*
* @example
* ```javascript
* // Before: "Assistant: The answer is 42\n\nHuman: "
* // After: "The answer is 42"
* ```
*/
_cleanResponse(response) {
let cleaned = response.trim();
// Remove "Assistant:" or "AI:" prefixes
cleaned = cleaned.replace(/^(Assistant|AI):\s*/i, '');
// Remove any conversation continuations
cleaned = cleaned.replace(/\n\n(Human|User):.*$/s, '');
return cleaned.trim();
}
/**
* Main generation method - this is where your LLM calls happen!
*
* This is the same as calling `llm.chat(messages)` in your fundamentals,
* but wrapped to work with the Runnable interface.
*
* @async
* @param {string|Array<Message>} input - User input or message array
* @param {Object} [config={}] - Runtime configuration
* @param {number} [config.temperature] - Override temperature for this call
* @param {number} [config.maxTokens] - Override max tokens for this call
* @param {boolean} [config.clearHistory=false] - Clear chat history before this call
* @returns {Promise<AIMessage>} Generated response as AIMessage
*
* @example String Input (Simplest)
* ```javascript
* const response = await llm.invoke("What is AI?");
* console.log(response.content); // "AI is..."
* ```
*
* @example Message Array Input (Full Control)
* ```javascript
* const messages = [
* new SystemMessage("You are a helpful assistant"),
* new HumanMessage("What is AI?")
* ];
* const response = await llm.invoke(messages);
* ```
*
* @example Runtime Configuration
* ```javascript
* // Override temperature for this specific call
* const response = await llm.invoke(
* "Write a creative story",
* { temperature: 0.9, maxTokens: 500 }
* );
* ```
*
* @example Clear History Before Call
* ```javascript
* // Ensure fresh context with no prior conversation
* const response = await llm.invoke(
* "What is AI?",
* { clearHistory: true }
* );
* ```
*
* @example In a Pipeline (Composition)
* ```javascript
* const pipeline = promptFormatter
* .pipe(llm)
* .pipe(outputParser);
*
* const result = await pipeline.invoke("user input");
* ```
*/
async _call(input, config = {}) {
// Ensure model is loaded (only happens once)
await this._initialize();
// Clear history if requested (important for batch processing)
if (config.clearHistory) {
this._chatSession.setChatHistory([]);
}
// Handle different input types
let messages;
if (typeof input === 'string') {
messages = [new HumanMessage(input)];
} else if (Array.isArray(input)) {
messages = input;
} else {
throw new Error(
'Input must be a string or array of messages. ' +
'Example: "Hello" or [new HumanMessage("Hello")]'
);
}
// Extract system message if present
const systemMessages = messages.filter(msg => msg._type === 'system');
const systemPrompt = systemMessages.length > 0
? systemMessages[0].content
: '';
// Convert our Message objects to llama.cpp format
const chatHistory = this._messagesToChatHistory(messages);
this._chatSession.setChatHistory(chatHistory);
// ALWAYS set system prompt (either new value or empty string to clear)
this._chatSession.systemPrompt = systemPrompt;
try {
// Build prompt options
const promptOptions = {
temperature: config.temperature ?? this.temperature,
topP: config.topP ?? this.topP,
topK: config.topK ?? this.topK,
maxTokens: config.maxTokens ?? this.maxTokens,
repeatPenalty: config.repeatPenalty ?? this.repeatPenalty,
customStopTriggers: config.stopStrings ?? this.stopStrings
};
// Pick a random seed when sampling (temperature > 0) and no seed was given,
// so repeated calls with the same input don't return identical output
if (promptOptions.temperature > 0 && config.seed === undefined) {
promptOptions.seed = Math.floor(Math.random() * 1000000);
} else if (config.seed !== undefined) {
promptOptions.seed = config.seed;
}
// Generate the response. The full conversation (including the latest user
// message) was already loaded via setChatHistory, so the prompt text is
// empty; prompt() is simpler than promptWithMeta for the non-streaming case.
const response = await this._chatSession.prompt('', promptOptions);
// Strip stray prefixes/continuations and return as AIMessage for consistency
return new AIMessage(this._cleanResponse(response));
} catch (error) {
throw new Error(`Generation failed: ${error.message}`);
}
}
/**
* Batch processing with history isolation
*
* Processes multiple inputs one after another, giving each a clean chat history.
* Note: a local model handles one request at a time, so batch() offers
* convenience and history isolation rather than a speed-up over calling
* invoke() in a loop.
*
* @async
* @param {Array<string|Array<Message>>} inputs - Array of inputs to process
* @param {Object} [config={}] - Runtime configuration
* @returns {Promise<Array<AIMessage>>} Array of generated responses
*
* @example
* ```javascript
* const questions = ["What is AI?", "What is ML?", "What is DL?"];
* const answers = await llm.batch(questions);
* ```
*/
async batch(inputs, config = {}) {
const results = [];
for (const input of inputs) {
// Clear history before each batch item to prevent contamination
const result = await this._call(input, { ...config, clearHistory: true });
results.push(result);
}
return results;
}
/**
* Streaming generation - show results as they're generated!
*
* This is the same as _call() but yields chunks as they arrive,
* like the typing effect you see in ChatGPT.
*
* @async
* @generator
* @param {string|Array<Message>} input - User input or message array
* @param {Object} [config={}] - Runtime configuration
* @yields {AIMessage} Chunks of generated text
*
* @example Basic Streaming
* ```javascript
* console.log("Response: ");
* for await (const chunk of llm.stream("Tell me a story")) {
* process.stdout.write(chunk.content); // Print without newline
* }
* console.log("\nDone!");
* ```
*
* @example Streaming in a Pipeline
* ```javascript
* const pipeline = promptFormatter
* .pipe(llm)
* .pipe(parser);
*
* // Only the last step (parser) gets streamed chunks
* for await (const chunk of pipeline.stream(input)) {
* console.log(chunk);
* }
* ```
*
* @example Building a Chat UI
* ```javascript
* async function streamToUI(input) {
* let fullResponse = '';
*
* for await (const chunk of llm.stream(input)) {
* fullResponse += chunk.content;
* updateUI(fullResponse); // Update your UI in real-time
* }
* }
* ```
*/
async* stream(input, config = {}) {
await this._initialize();
// Clear history if requested
if (config.clearHistory) {
this._chatSession.setChatHistory([]);
}
// Handle different input types (same as _call)
let messages;
if (typeof input === 'string') {
messages = [new HumanMessage(input)];
} else if (Array.isArray(input)) {
messages = input;
} else {
throw new Error(
'Input must be a string or array of messages for streaming'
);
}
// Extract system message if present
const systemMessages = messages.filter(msg => msg._type === 'system');
const systemPrompt = systemMessages.length > 0
? systemMessages[0].content
: '';
// Set up chat history
const chatHistory = this._messagesToChatHistory(messages);
this._chatSession.setChatHistory(chatHistory);
// ALWAYS set system prompt (either new value or empty string to clear)
this._chatSession.systemPrompt = systemPrompt;
try {
// Build prompt options
const promptOptions = {
temperature: config.temperature ?? this.temperature,
topP: config.topP ?? this.topP,
topK: config.topK ?? this.topK,
maxTokens: config.maxTokens ?? this.maxTokens,
repeatPenalty: config.repeatPenalty ?? this.repeatPenalty,
customStopTriggers: config.stopStrings ?? this.stopStrings
};
// Add random seed if temperature > 0 and no seed specified
if (promptOptions.temperature > 0 && config.seed === undefined) {
promptOptions.seed = Math.floor(Math.random() * 1000000);
} else if (config.seed !== undefined) {
promptOptions.seed = config.seed;
}
// Use the onTextChunk callback to collect chunks as they arrive.
// The callback is synchronous, so we can't yield from it directly;
// we buffer chunks here and yield them from the polling loop below.
promptOptions.onTextChunk = (chunk) => {
this._currentStreamChunks = this._currentStreamChunks || [];
this._currentStreamChunks.push(chunk);
};
// Initialize chunk collection
this._currentStreamChunks = [];
// Start generation (this will call onTextChunk as it generates)
const responsePromise = this._chatSession.prompt('', promptOptions);
// Yield chunks as they become available
let lastYieldedIndex = 0;
// Poll for new chunks
while (true) {
// Yield any new chunks
while (lastYieldedIndex < this._currentStreamChunks.length) {
yield new AIMessage(this._currentStreamChunks[lastYieldedIndex], {
additionalKwargs: { chunk: true }
});
lastYieldedIndex++;
}
// Check if generation is complete
const isDone = await Promise.race([
responsePromise.then(() => true),
new Promise(resolve => setTimeout(() => resolve(false), 10))
]);
if (isDone) {
// Yield any remaining chunks
while (lastYieldedIndex < this._currentStreamChunks.length) {
yield new AIMessage(this._currentStreamChunks[lastYieldedIndex], {
additionalKwargs: { chunk: true }
});
lastYieldedIndex++;
}
break;
}
}
// Wait for the full response
await responsePromise;
// Clean up
delete this._currentStreamChunks;
} catch (error) {
throw new Error(`Streaming failed: ${error.message}`);
}
}
/**
* Cleanup resources
*
* LLMs hold resources in memory. Call this when you're done
* to free them up properly.
*
* @async
* @returns {Promise<void>}
*
* @example
* ```javascript
* const llm = new LlamaCppLLM({ modelPath: './model.gguf' });
*
* try {
* const response = await llm.invoke("Hello");
* console.log(response.content);
* } finally {
* await llm.dispose(); // Always clean up!
* }
* ```
*
* @example With Multiple Uses
* ```javascript
* const llm = new LlamaCppLLM({ modelPath: './model.gguf' });
*
* // Use it many times
* await llm.invoke("Question 1");
* await llm.invoke("Question 2");
* await llm.batch(["Q3", "Q4", "Q5"]);
*
* // Clean up when completely done
* await llm.dispose();
* ```
*/
async dispose() {
if (this._context) {
await this._context.dispose();
this._context = null;
}
if (this._model) {
await this._model.dispose();
this._model = null;
}
this._chatSession = null;
this._initialized = false;
if (this.verbose) {
console.log('✓ Model resources disposed');
}
}
/**
* String representation for debugging
*
* @returns {string} Human-readable representation
*
* @example
* ```javascript
* const llm = new LlamaCppLLM({ modelPath: './llama-2-7b.gguf' });
* console.log(llm.toString());
* // "LlamaCppLLM(model=./llama-2-7b.gguf)"
*
* // Useful in pipelines
* const pipeline = formatter.pipe(llm).pipe(parser);
* console.log(pipeline.toString());
* // "PromptFormatter() | LlamaCppLLM(model=./llama-2-7b.gguf) | OutputParser()"
* ```
*/
toString() {
return `LlamaCppLLM(model=${this.modelPath})`;
}
}
export default LlamaCppLLM;