/**
* LlamaCppLLM - node-llama-cpp wrapper as a Runnable
*
* @module llm/llama-cpp-llm
*/
import { Runnable } from '../core/runnable.js';
import { AIMessage, HumanMessage } from '../core/message.js';
import { getLlama, LlamaChatSession } from 'node-llama-cpp';
/**
* LlamaCppLLM - A Runnable wrapper for node-llama-cpp
*
* Wraps your LLM calls from agent fundamentals into a reusable,
* composable Runnable component.
*
* Key benefits over raw node-llama-cpp:
* - Composable with other Runnables via .pipe()
* - Supports batch processing multiple inputs
* - Built-in streaming support
* - Consistent interface across all LLMs
* - Easy to swap with other LLM providers
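*
* @example Composing in a pipeline (sketch; assumes promptFormatter and outputParser are Runnables defined elsewhere in your app)
* ```javascript
* const pipeline = promptFormatter.pipe(llm).pipe(outputParser);
* const result = await pipeline.invoke("Summarize the release notes");
* ```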
*/
export class LlamaCppLLM extends Runnable {
/**
* Create a new LlamaCppLLM instance
*
* @param {Object} options - Configuration options
* @param {string} options.modelPath - Path to your GGUF model file (REQUIRED)
* @param {number} [options.temperature=0.7] - Sampling temperature (0-1)
* - Lower (0.1): More focused, deterministic
* - Higher (0.9): More creative, random
* @param {number} [options.topP=0.9] - Nucleus sampling threshold
* @param {number} [options.topK=40] - Top-K sampling parameter
* @param {number} [options.maxTokens=2048] - Maximum tokens to generate
* @param {number} [options.repeatPenalty=1.1] - Penalty for repeating tokens
* @param {number} [options.contextSize=4096] - Context window size
* @param {number} [options.batchSize=512] - Batch processing size
* @param {boolean} [options.verbose=false] - Enable debug logging
* @param {string[]} [options.stopStrings] - Strings that stop generation
* @param {Object} [options.chatWrapper] - Custom chat wrapper instance (e.g., QwenChatWrapper)
* - If not provided, the library will automatically select the best wrapper for your model
*
* @example Basic Setup
* ```javascript
* const llm = new LlamaCppLLM({
* modelPath: './models/Meta-Llama-3.1-8B-Instruct-Q5_K_S.gguf',
* temperature: 0.7
* });
* ```
*
* @example With Qwen Chat Wrapper (Discourage Thoughts)
* ```javascript
* import { QwenChatWrapper } from 'node-llama-cpp';
*
* const llm = new LlamaCppLLM({
* modelPath: './models/Qwen3-1.7B-Q6_K.gguf',
* temperature: 0.7,
* chatWrapper: new QwenChatWrapper({
* thoughts: 'discourage'
* })
* });
* ```
*
* @example Different Configurations for Different Tasks
* ```javascript
* // Creative writing (higher temperature)
* const creative = new LlamaCppLLM({
* modelPath: './model.gguf',
* temperature: 0.9,
* maxTokens: 1000
* });
*
* // Factual responses (lower temperature)
* const factual = new LlamaCppLLM({
* modelPath: './model.gguf',
* temperature: 0.1,
* maxTokens: 500
* });
* ```
*/
constructor(options = {}) {
super();
// Validate required options
this.modelPath = options.modelPath;
if (!this.modelPath) {
throw new Error(
'modelPath is required. Example: new LlamaCppLLM({ modelPath: "./model.gguf" })'
);
}
// Generation parameters
// These control how the LLM generates text - same as in your fundamentals!
this.temperature = options.temperature ?? 0.7;
this.topP = options.topP ?? 0.9;
this.topK = options.topK ?? 40;
this.maxTokens = options.maxTokens ?? 2048;
this.repeatPenalty = options.repeatPenalty ?? 1.1;
// Context configuration
this.contextSize = options.contextSize ?? 4096;
this.batchSize = options.batchSize ?? 512;
// Behavior
this.verbose = options.verbose ?? false;
// Chat wrapper configuration
// If not provided, LlamaChatSession will auto-select the best wrapper
this.chatWrapper = options.chatWrapper ?? 'auto';
// Stop strings - when the model sees these, it stops generating
// Default includes common chat separators
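// Example: without these, a model can run past its answer and start writing
// the user's next turn itself, e.g. "...is 42\n\nHuman: what else?"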
this.stopStrings = options.stopStrings ?? [
'Human:',
'User:',
'\n\nHuman:',
'\n\nUser:'
];
// Internal state (lazy initialized)
this._llama = null;
this._model = null;
this._context = null;
this._chatSession = null;
this._initialized = false;
}
/**
* Initialize model (lazy loading)
*
* This loads the model only when first needed, not at construction.
* This pattern is useful because model loading is slow - we only
* want to do it once and only when we actually need it.
*
* @private
* @throws {Error} If model loading fails
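*
* @example Lazy loading in practice
* ```javascript
* const llm = new LlamaCppLLM({ modelPath: './model.gguf' }); // fast: nothing loaded yet
* await llm.invoke("Hi");    // slow: the model is loaded on this first call
* await llm.invoke("Again"); // fast: the model is already in memory
* ```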
*/
async _initialize() {
// Skip if already initialized
if (this._initialized) return;
if (this.verbose) {
console.log(`Loading model: ${this.modelPath}`);
}
try {
// Step 1: Get the llama instance
this._llama = await getLlama();
// Step 2: Load the model file
this._model = await this._llama.loadModel({
modelPath: this.modelPath
});
// Step 3: Create a context for generation
this._context = await this._model.createContext({
contextSize: this.contextSize,
batchSize: this.batchSize
});
// Step 4: Create a chat session
// This manages conversation history for us
const contextSequence = this._context.getSequence();
const sessionConfig = {
contextSequence
};
// Add chatWrapper if specified (otherwise LlamaChatSession uses "auto")
if (this.chatWrapper !== 'auto') {
sessionConfig.chatWrapper = this.chatWrapper;
}
this._chatSession = new LlamaChatSession(sessionConfig);
this._initialized = true;
if (this.verbose) {
console.log('✓ Model loaded successfully');
if (this.chatWrapper !== 'auto') {
console.log(`✓ Using custom chat wrapper: ${this.chatWrapper.constructor.name}`);
} else {
console.log('✓ Using auto-detected chat wrapper');
}
}
} catch (error) {
throw new Error(
`Failed to initialize model at ${this.modelPath}: ${error.message}`
);
}
}
/**
* Convert our Message objects to node-llama-cpp chat history format
*
* This bridges between our standardized Message types and what
* node-llama-cpp expects. Think of it as a translator.
*
* @private
* @param {Array<Message>} messages - Array of Message objects
* @returns {Array<Object>} Chat history in llama.cpp format
*
* @example
* ```javascript
* // Input: Our messages
* [
* new SystemMessage("You are helpful"),
* new HumanMessage("Hi"),
* new AIMessage("Hello!")
* ]
*
* // Output: llama.cpp format
* [
* { type: 'system', text: 'You are helpful' },
* { type: 'user', text: 'Hi' },
* { type: 'model', response: 'Hello!' }
* ]
* ```
*/
_messagesToChatHistory(messages) {
return messages.map(msg => {
// System messages: instructions for the AI
if (msg._type === 'system') {
return { type: 'system', text: msg.content };
}
// Human messages: user input
else if (msg._type === 'human') {
return { type: 'user', text: msg.content };
}
// AI messages: previous AI responses
else if (msg._type === 'ai') {
return { type: 'model', response: msg.content };
}
// Tool messages: results from tool execution
else if (msg._type === 'tool') {
// Convert tool results to system messages
return { type: 'system', text: `Tool Result: ${msg.content}` };
}
// Fallback: treat unknown types as user messages
return { type: 'user', text: msg.content };
});
}
/**
* Clean up model response
*
* Sometimes models include extra prefixes or suffixes.
* This cleans them up for a better user experience.
*
* @private
* @param {string} response - Raw model response
* @returns {string} Cleaned response
*
* @example
* ```javascript
* // Before: "Assistant: The answer is 42\n\nHuman: "
* // After: "The answer is 42"
* ```
*/
_cleanResponse(response) {
let cleaned = response.trim();
// Remove "Assistant:" or "AI:" prefixes
cleaned = cleaned.replace(/^(Assistant|AI):\s*/i, '');
// Remove any conversation continuations
cleaned = cleaned.replace(/\n\n(Human|User):.*$/s, '');
return cleaned.trim();
}
/**
* Main generation method - this is where your LLM calls happen!
*
* This is the same as calling `llm.chat(messages)` in your fundamentals,
* but wrapped to work with the Runnable interface.
*
* @async
* @param {string|Array<Message>} input - User input or message array
* @param {Object} [config={}] - Runtime configuration
* @param {number} [config.temperature] - Override temperature for this call
* @param {number} [config.maxTokens] - Override max tokens for this call
* @param {boolean} [config.clearHistory=false] - Clear chat history before this call
* @returns {Promise<AIMessage>} Generated response as AIMessage
*
* @example String Input (Simplest)
* ```javascript
* const response = await llm.invoke("What is AI?");
* console.log(response.content); // "AI is..."
* ```
*
* @example Message Array Input (Full Control)
* ```javascript
* const messages = [
* new SystemMessage("You are a helpful assistant"),
* new HumanMessage("What is AI?")
* ];
* const response = await llm.invoke(messages);
* ```
*
* @example Runtime Configuration
* ```javascript
* // Override temperature for this specific call
* const response = await llm.invoke(
* "Write a creative story",
* { temperature: 0.9, maxTokens: 500 }
* );
* ```
*
* @example Clear History Before Call
* ```javascript
* // Ensure fresh context with no prior conversation
* const response = await llm.invoke(
* "What is AI?",
* { clearHistory: true }
* );
* ```
*
* @example In a Pipeline (Composition)
* ```javascript
* const pipeline = promptFormatter
* .pipe(llm)
* .pipe(outputParser);
*
* const result = await pipeline.invoke("user input");
* ```
*/
async _call(input, config = {}) {
// Ensure model is loaded (only happens once)
await this._initialize();
// Clear history if requested (important for batch processing)
if (config.clearHistory) {
this._chatSession.setChatHistory([]);
}
// Handle different input types
let messages;
if (typeof input === 'string') {
messages = [new HumanMessage(input)];
} else if (Array.isArray(input)) {
messages = input;
} else {
throw new Error(
'Input must be a string or array of messages. ' +
'Example: "Hello" or [new HumanMessage("Hello")]'
);
}
// Extract system message if present
const systemMessages = messages.filter(msg => msg._type === 'system');
const systemPrompt = systemMessages.length > 0
? systemMessages[0].content
: '';
// Convert our Message objects to llama.cpp format
const chatHistory = this._messagesToChatHistory(messages);
this._chatSession.setChatHistory(chatHistory);
// ALWAYS set system prompt (either new value or empty string to clear)
this._chatSession.systemPrompt = systemPrompt;
try {
// Build prompt options
const promptOptions = {
temperature: config.temperature ?? this.temperature,
topP: config.topP ?? this.topP,
topK: config.topK ?? this.topK,
maxTokens: config.maxTokens ?? this.maxTokens,
repeatPenalty: config.repeatPenalty ?? this.repeatPenalty,
customStopTriggers: config.stopStrings ?? this.stopStrings
};
// Pick a random seed when sampling (temperature > 0) and no seed was given,
// so repeated calls with the same input don't return identical output
if (promptOptions.temperature > 0 && config.seed === undefined) {
promptOptions.seed = Math.floor(Math.random() * 1000000);
} else if (config.seed !== undefined) {
promptOptions.seed = config.seed;
}
// Generate the response. The full conversation (including the latest user
// message) was already loaded via setChatHistory, so the prompt text is
// empty; prompt() is simpler than promptWithMeta for the non-streaming case.
const response = await this._chatSession.prompt('', promptOptions);
// Strip stray prefixes/continuations and return as AIMessage for consistency
return new AIMessage(this._cleanResponse(response));
} catch (error) {
throw new Error(`Generation failed: ${error.message}`);
}
}
/**
* Batch processing with history isolation
*
* Processes multiple inputs one after another, giving each a clean chat history.
* Note: a local model handles one request at a time, so batch() offers
* convenience and history isolation rather than a speed-up over calling
* invoke() in a loop.
*
* @async
* @param {Array<string|Array<Message>>} inputs - Array of inputs to process
* @param {Object} [config={}] - Runtime configuration
* @returns {Promise<Array<AIMessage>>} Array of generated responses
*
* @example
* ```javascript
* const questions = ["What is AI?", "What is ML?", "What is DL?"];
* const answers = await llm.batch(questions);
* ```
*/
async batch(inputs, config = {}) {
const results = [];
for (const input of inputs) {
// Clear history before each batch item to prevent contamination
const result = await this._call(input, { ...config, clearHistory: true });
results.push(result);
}
return results;
}
/**
* Streaming generation - show results as they're generated!
*
* This is the same as _call() but yields chunks as they arrive,
* like the typing effect you see in ChatGPT.
*
* @async
* @generator
* @param {string|Array<Message>} input - User input or message array
* @param {Object} [config={}] - Runtime configuration
* @yields {AIMessage} Chunks of generated text
*
* @example Basic Streaming
* ```javascript
* console.log("Response: ");
* for await (const chunk of llm.stream("Tell me a story")) {
* process.stdout.write(chunk.content); // Print without newline
* }
* console.log("\nDone!");
* ```
*
* @example Streaming in a Pipeline
* ```javascript
* const pipeline = promptFormatter
* .pipe(llm)
* .pipe(parser);
*
* // Only the last step (parser) gets streamed chunks
* for await (const chunk of pipeline.stream(input)) {
* console.log(chunk);
* }
* ```
*
* @example Building a Chat UI
* ```javascript
* async function streamToUI(input) {
* let fullResponse = '';
*
* for await (const chunk of llm.stream(input)) {
* fullResponse += chunk.content;
* updateUI(fullResponse); // Update your UI in real-time
* }
* }
* ```
*/
async* stream(input, config = {}) {
await this._initialize();
// Clear history if requested
if (config.clearHistory) {
this._chatSession.setChatHistory([]);
}
// Handle different input types (same as _call)
let messages;
if (typeof input === 'string') {
messages = [new HumanMessage(input)];
} else if (Array.isArray(input)) {
messages = input;
} else {
throw new Error(
'Input must be a string or array of messages for streaming'
);
}
// Extract system message if present
const systemMessages = messages.filter(msg => msg._type === 'system');
const systemPrompt = systemMessages.length > 0
? systemMessages[0].content
: '';
// Set up chat history
const chatHistory = this._messagesToChatHistory(messages);
this._chatSession.setChatHistory(chatHistory);
// ALWAYS set system prompt (either new value or empty string to clear)
this._chatSession.systemPrompt = systemPrompt;
try {
// Build prompt options
const promptOptions = {
temperature: config.temperature ?? this.temperature,
topP: config.topP ?? this.topP,
topK: config.topK ?? this.topK,
maxTokens: config.maxTokens ?? this.maxTokens,
repeatPenalty: config.repeatPenalty ?? this.repeatPenalty,
customStopTriggers: config.stopStrings ?? this.stopStrings
};
// Add random seed if temperature > 0 and no seed specified
if (promptOptions.temperature > 0 && config.seed === undefined) {
promptOptions.seed = Math.floor(Math.random() * 1000000);
} else if (config.seed !== undefined) {
promptOptions.seed = config.seed;
}
// Use the onTextChunk callback to collect chunks as they arrive.
// The callback is synchronous, so we can't yield from it directly;
// we buffer chunks here and yield them from the polling loop below.
promptOptions.onTextChunk = (chunk) => {
this._currentStreamChunks = this._currentStreamChunks || [];
this._currentStreamChunks.push(chunk);
};
// Initialize chunk collection
this._currentStreamChunks = [];
// Start generation (this will call onTextChunk as it generates)
const responsePromise = this._chatSession.prompt('', promptOptions);
// Yield chunks as they become available
let lastYieldedIndex = 0;
// Poll for new chunks
while (true) {
// Yield any new chunks
while (lastYieldedIndex < this._currentStreamChunks.length) {
yield new AIMessage(this._currentStreamChunks[lastYieldedIndex], {
additionalKwargs: { chunk: true }
});
lastYieldedIndex++;
}
// Check if generation is complete
const isDone = await Promise.race([
responsePromise.then(() => true),
new Promise(resolve => setTimeout(() => resolve(false), 10))
]);
if (isDone) {
// Yield any remaining chunks
while (lastYieldedIndex < this._currentStreamChunks.length) {
yield new AIMessage(this._currentStreamChunks[lastYieldedIndex], {
additionalKwargs: { chunk: true }
});
lastYieldedIndex++;
}
break;
}
}
// Wait for the full response
await responsePromise;
// Clean up
delete this._currentStreamChunks;
} catch (error) {
throw new Error(`Streaming failed: ${error.message}`);
}
}
/**
* Cleanup resources
*
* LLMs hold resources in memory. Call this when you're done
* to free them up properly.
*
* @async
* @returns {Promise<void>}
*
* @example
* ```javascript
* const llm = new LlamaCppLLM({ modelPath: './model.gguf' });
*
* try {
* const response = await llm.invoke("Hello");
* console.log(response.content);
* } finally {
* await llm.dispose(); // Always clean up!
* }
* ```
*
* @example With Multiple Uses
* ```javascript
* const llm = new LlamaCppLLM({ modelPath: './model.gguf' });
*
* // Use it many times
* await llm.invoke("Question 1");
* await llm.invoke("Question 2");
* await llm.batch(["Q3", "Q4", "Q5"]);
*
* // Clean up when completely done
* await llm.dispose();
* ```
*/
async dispose() {
if (this._context) {
await this._context.dispose();
this._context = null;
}
if (this._model) {
await this._model.dispose();
this._model = null;
}
this._chatSession = null;
this._initialized = false;
if (this.verbose) {
console.log('✓ Model resources disposed');
}
}
/**
* String representation for debugging
*
* @returns {string} Human-readable representation
*
* @example
* ```javascript
* const llm = new LlamaCppLLM({ modelPath: './llama-2-7b.gguf' });
* console.log(llm.toString());
* // "LlamaCppLLM(model=./llama-2-7b.gguf)"
*
* // Useful in pipelines
* const pipeline = formatter.pipe(llm).pipe(parser);
* console.log(pipeline.toString());
* // "PromptFormatter() | LlamaCppLLM(model=./llama-2-7b.gguf) | OutputParser()"
* ```
*/
toString() {
return `LlamaCppLLM(model=${this.modelPath})`;
}
}
export default LlamaCppLLM;