/**
* LlamaCppLLM - node-llama-cpp wrapper as a Runnable
*
* @module llm/llama-cpp-llm
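*
* A minimal usage sketch (the model path below is a placeholder for your
* own GGUF file):
*
* @example Quickstart
* ```javascript
* import { LlamaCppLLM } from './llm/llama-cpp-llm.js';
*
* const llm = new LlamaCppLLM({ modelPath: './models/model.gguf' });
* const reply = await llm.invoke('Hello!');
* console.log(reply.content);
* await llm.dispose(); // free model memory when done
* ```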
*/
import { Runnable } from '../core/runnable.js';
import { AIMessage, HumanMessage } from '../core/message.js';
import { getLlama, LlamaChatSession } from 'node-llama-cpp';
/**
* LlamaCppLLM - A Runnable wrapper for node-llama-cpp
*
* Wraps your LLM calls from agent fundamentals into a reusable,
* composable Runnable component.
*
* Key benefits over raw node-llama-cpp:
* - Composable with other Runnables via .pipe()
* - Supports batch processing multiple inputs
* - Built-in streaming support
* - Consistent interface across all LLMs
* - Easy to swap with other LLM providers (see the sketch below)
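*
* Because every LLM shares the Runnable interface, swapping providers only
* changes the construction line. A sketch, where OtherProviderLLM is a
* hypothetical stand-in (promptFormatter and outputParser as in the
* pipeline examples further down):
* ```javascript
* const llm = new LlamaCppLLM({ modelPath: './model.gguf' });
* // const llm = new OtherProviderLLM({ apiKey: '...' }); // hypothetical swap
* const pipeline = promptFormatter.pipe(llm).pipe(outputParser);
* const result = await pipeline.invoke('user input');
* ```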
*/
export class LlamaCppLLM extends Runnable {
/**
* Create a new LlamaCppLLM instance
*
* @param {Object} options - Configuration options
* @param {string} options.modelPath - Path to your GGUF model file (REQUIRED)
* @param {number} [options.temperature=0.7] - Sampling temperature
* - Lower (e.g. 0.1): more focused, near-deterministic
* - Higher (e.g. 0.9): more creative, more random
* @param {number} [options.topP=0.9] - Nucleus sampling threshold
* @param {number} [options.topK=40] - Top-K sampling parameter
* @param {number} [options.maxTokens=2048] - Maximum tokens to generate
* @param {number} [options.repeatPenalty=1.1] - Penalty for repeating tokens
* @param {number} [options.contextSize=4096] - Context window size
* @param {number} [options.batchSize=512] - Batch processing size
* @param {boolean} [options.verbose=false] - Enable debug logging
* @param {string[]} [options.stopStrings] - Strings that stop generation
* @param {Object} [options.chatWrapper] - Custom chat wrapper instance (e.g., QwenChatWrapper)
* - If not provided, the library will automatically select the best wrapper for your model
*
* @example Basic Setup
* ```javascript
* const llm = new LlamaCppLLM({
* modelPath: './models/Meta-Llama-3.1-8B-Instruct-Q5_K_S.gguf',
* temperature: 0.7
* });
* ```
*
* @example With Qwen Chat Wrapper (Discourage Thoughts)
* ```javascript
* import { QwenChatWrapper } from 'node-llama-cpp';
*
* const llm = new LlamaCppLLM({
* modelPath: './models/Qwen3-1.7B-Q6_K.gguf',
* temperature: 0.7,
* chatWrapper: new QwenChatWrapper({
* thoughts: 'discourage'
* })
* });
* ```
*
* @example Different Configurations for Different Tasks
* ```javascript
* // Creative writing (higher temperature)
* const creative = new LlamaCppLLM({
* modelPath: './model.gguf',
* temperature: 0.9,
* maxTokens: 1000
* });
*
* // Factual responses (lower temperature)
* const factual = new LlamaCppLLM({
* modelPath: './model.gguf',
* temperature: 0.1,
* maxTokens: 500
* });
* ```
*/
constructor(options = {}) {
super();
// Validate required options
this.modelPath = options.modelPath;
if (!this.modelPath) {
throw new Error(
'modelPath is required. Example: new LlamaCppLLM({ modelPath: "./model.gguf" })'
);
}
// Generation parameters
// These control how the LLM generates text - same as in your fundamentals!
this.temperature = options.temperature ?? 0.7;
this.topP = options.topP ?? 0.9;
this.topK = options.topK ?? 40;
this.maxTokens = options.maxTokens ?? 2048;
this.repeatPenalty = options.repeatPenalty ?? 1.1;
// Context configuration
this.contextSize = options.contextSize ?? 4096;
this.batchSize = options.batchSize ?? 512;
// Behavior
this.verbose = options.verbose ?? false;
// Chat wrapper configuration
// If not provided, LlamaChatSession will auto-select the best wrapper
this.chatWrapper = options.chatWrapper ?? 'auto';
// Stop strings - when the model sees these, it stops generating
// Default includes common chat separators
this.stopStrings = options.stopStrings ?? [
'Human:',
'User:',
'\n\nHuman:',
'\n\nUser:'
];
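// e.g. with these defaults, generation halts as soon as the model starts
// writing a new "Human:" or "User:" turn instead of continuing its answer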
// Internal state (lazy initialized)
this._llama = null;
this._model = null;
this._context = null;
this._chatSession = null;
this._initialized = false;
}
/**
* Initialize model (lazy loading)
*
* This loads the model only when first needed, not at construction.
* This pattern is useful because model loading is slow - we only
* want to do it once and only when we actually need it.
*
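* @example Lazy loading in practice (illustrative)
* ```javascript
* const llm = new LlamaCppLLM({ modelPath: './model.gguf' }); // instant, nothing loaded
* await llm.invoke('Hi');       // slow: the model loads here, on first use
* await llm.invoke('Hi again'); // fast: already in memory
* ```
*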
* @private
* @throws {Error} If model loading fails
*/
async _initialize() {
// Skip if already initialized
if (this._initialized) return;
if (this.verbose) {
console.log(`Loading model: ${this.modelPath}`);
}
try {
// Step 1: Get the llama instance
this._llama = await getLlama();
// Step 2: Load the model file
this._model = await this._llama.loadModel({
modelPath: this.modelPath
});
// Step 3: Create a context for generation
this._context = await this._model.createContext({
contextSize: this.contextSize,
batchSize: this.batchSize
});
// Step 4: Create a chat session
// This manages conversation history for us
const contextSequence = this._context.getSequence();
const sessionConfig = {
contextSequence
};
// Add chatWrapper if specified (otherwise LlamaChatSession uses "auto")
if (this.chatWrapper !== 'auto') {
sessionConfig.chatWrapper = this.chatWrapper;
}
this._chatSession = new LlamaChatSession(sessionConfig);
this._initialized = true;
if (this.verbose) {
console.log('✓ Model loaded successfully');
if (this.chatWrapper !== 'auto') {
console.log(`✓ Using custom chat wrapper: ${this.chatWrapper.constructor.name}`);
} else {
console.log('✓ Using auto-detected chat wrapper');
}
}
} catch (error) {
throw new Error(
`Failed to initialize model at ${this.modelPath}: ${error.message}`
);
}
}
/**
* Convert our Message objects to node-llama-cpp chat history format
*
* This bridges between our standardized Message types and what
* node-llama-cpp expects. Think of it as a translator.
*
* @private
* @param {Array<Message>} messages - Array of Message objects
* @returns {Array<Object>} Chat history in llama.cpp format
*
* @example
* ```javascript
* // Input: Our messages
* [
* new SystemMessage("You are helpful"),
* new HumanMessage("Hi"),
* new AIMessage("Hello!")
* ]
*
* // Output: llama.cpp format
* [
* { type: 'system', text: 'You are helpful' },
* { type: 'user', text: 'Hi' },
* { type: 'model', response: 'Hello!' }
* ]
* ```
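*
* Tool results have no native role in this format, so they are folded into
* system messages (message shape assumed from the _type === 'tool' check):
* ```javascript
* // Input:  a tool message with content "42"
* // Output: { type: 'system', text: 'Tool Result: 42' }
* ```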
*/
_messagesToChatHistory(messages) {
return messages.map(msg => {
// System messages: instructions for the AI
if (msg._type === 'system') {
return { type: 'system', text: msg.content };
}
// Human messages: user input
else if (msg._type === 'human') {
return { type: 'user', text: msg.content };
}
// AI messages: previous AI responses
else if (msg._type === 'ai') {
return { type: 'model', response: msg.content };
}
// Tool messages: results from tool execution
else if (msg._type === 'tool') {
// Convert tool results to system messages
return { type: 'system', text: `Tool Result: ${msg.content}` };
}
// Fallback: treat unknown types as user messages
return { type: 'user', text: msg.content };
});
}
/**
* Clean up model response
*
* Sometimes models include extra prefixes or suffixes.
* This cleans them up for a better user experience.
*
* @private
* @param {string} response - Raw model response
* @returns {string} Cleaned response
*
* @example
* ```javascript
* // Before: "Assistant: The answer is 42\n\nHuman: "
* // After: "The answer is 42"
* ```
*/
_cleanResponse(response) {
let cleaned = response.trim();
// Remove "Assistant:" or "AI:" prefixes
cleaned = cleaned.replace(/^(Assistant|AI):\s*/i, '');
// Remove any conversation continuations
cleaned = cleaned.replace(/\n\n(Human|User):.*$/s, '');
return cleaned.trim();
}
/**
* Main generation method - this is where your LLM calls happen!
*
* This is the same as calling `llm.chat(messages)` in your fundamentals,
* but wrapped to work with the Runnable interface.
*
* @async
* @param {string|Array<Message>} input - User input or message array
* @param {Object} [config={}] - Runtime configuration
* @param {number} [config.temperature] - Override temperature for this call
* @param {number} [config.topP] - Override nucleus sampling for this call
* @param {number} [config.topK] - Override top-K sampling for this call
* @param {number} [config.maxTokens] - Override max tokens for this call
* @param {number} [config.repeatPenalty] - Override repeat penalty for this call
* @param {string[]} [config.stopStrings] - Override stop strings for this call
* @param {number} [config.seed] - Fixed seed for reproducible sampling (random per call by default when temperature > 0)
* @param {boolean} [config.clearHistory=false] - Clear chat history before this call
* @returns {Promise<AIMessage>} Generated response as AIMessage
*
* @example String Input (Simplest)
* ```javascript
* const response = await llm.invoke("What is AI?");
* console.log(response.content); // "AI is..."
* ```
*
* @example Message Array Input (Full Control)
* ```javascript
* const messages = [
* new SystemMessage("You are a helpful assistant"),
* new HumanMessage("What is AI?")
* ];
* const response = await llm.invoke(messages);
* ```
*
* @example Runtime Configuration
* ```javascript
* // Override temperature for this specific call
* const response = await llm.invoke(
* "Write a creative story",
* { temperature: 0.9, maxTokens: 500 }
* );
* ```
*
* @example Clear History Before Call
* ```javascript
* // Ensure fresh context with no prior conversation
* const response = await llm.invoke(
* "What is AI?",
* { clearHistory: true }
* );
* ```
*
* @example In a Pipeline (Composition)
* ```javascript
* const pipeline = promptFormatter
* .pipe(llm)
* .pipe(outputParser);
*
* const result = await pipeline.invoke("user input");
* ```
*/
async _call(input, config = {}) {
// Ensure model is loaded (only happens once)
await this._initialize();
// Clear history if requested (important for batch processing)
if (config.clearHistory) {
this._chatSession.setChatHistory([]);
}
// Handle different input types
let messages;
if (typeof input === 'string') {
messages = [new HumanMessage(input)];
} else if (Array.isArray(input)) {
messages = input;
} else {
throw new Error(
'Input must be a string or array of messages. ' +
'Example: "Hello" or [new HumanMessage("Hello")]'
);
}
// Separate the final user turn from the prior conversation.
// node-llama-cpp's prompt() appends a new user message before generating,
// so the trailing HumanMessage becomes the prompt text and everything
// before it becomes the chat history (falling back to an empty prompt if
// the conversation doesn't end with a user turn). System messages travel
// inside the chat history itself, so no separate systemPrompt handling
// is needed.
const lastMessage = messages[messages.length - 1];
const endsWithHuman = lastMessage && lastMessage._type === 'human';
const promptText = endsWithHuman ? lastMessage.content : '';
const historyMessages = endsWithHuman ? messages.slice(0, -1) : messages;
this._chatSession.setChatHistory(this._messagesToChatHistory(historyMessages));
try {
// Build prompt options
const promptOptions = {
temperature: config.temperature ?? this.temperature,
topP: config.topP ?? this.topP,
topK: config.topK ?? this.topK,
maxTokens: config.maxTokens ?? this.maxTokens,
repeatPenalty: config.repeatPenalty ?? this.repeatPenalty,
customStopTriggers: config.stopStrings ?? this.stopStrings
};
// Use the caller's seed if given; otherwise pick a random seed when
// sampling (temperature > 0) so repeated calls can vary their output
if (promptOptions.temperature > 0 && config.seed === undefined) {
promptOptions.seed = Math.floor(Math.random() * 1000000);
} else if (config.seed !== undefined) {
promptOptions.seed = config.seed;
}
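// e.g. invoke(input, { seed: 42 }) should reproduce the same sampled
// output across runs, given identical history and options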
// Generate the response; prompt() appends promptText as the new user turn
const response = await this._chatSession.prompt(promptText, promptOptions);
// Strip stray role prefixes/continuations, then return as AIMessage
return new AIMessage(this._cleanResponse(response));
} catch (error) {
throw new Error(`Generation failed: ${error.message}`);
}
}
/**
* Batch processing with history isolation
*
* Processes multiple inputs sequentially, ensuring each gets a clean chat history.
* Note: Local models process requests sequentially, so there's no performance
* benefit compared to calling invoke() multiple times.
*
* @async
* @param {Array<string|Array<Message>>} inputs - Array of inputs to process
* @param {Object} [config={}] - Runtime configuration
* @returns {Promise<Array<AIMessage>>} Array of generated responses
*
* @example
* ```javascript
* const questions = ["What is AI?", "What is ML?", "What is DL?"];
* const answers = await llm.batch(questions);
* ```
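*
* Config overrides apply to every item. A sketch:
* ```javascript
* // Low temperature and a token cap for all three answers
* const answers = await llm.batch(questions, { temperature: 0.2, maxTokens: 200 });
* ```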
*/
async batch(inputs, config = {}) {
const results = [];
for (const input of inputs) {
// Clear history before each batch item to prevent contamination
const result = await this._call(input, { ...config, clearHistory: true });
results.push(result);
}
return results;
}
/**
* Streaming generation - show results as they're generated!
*
* This is the same as _call() but yields chunks as they arrive,
* like the typing effect you see in ChatGPT.
*
* @async
* @generator
* @param {string|Array<Message>} input - User input or message array
* @param {Object} [config={}] - Runtime configuration
* @yields {AIMessage} Chunks of generated text
*
* @example Basic Streaming
* ```javascript
* console.log("Response: ");
* for await (const chunk of llm.stream("Tell me a story")) {
* process.stdout.write(chunk.content); // Print without newline
* }
* console.log("\nDone!");
* ```
*
* @example Streaming in a Pipeline
* ```javascript
* const pipeline = promptFormatter
* .pipe(llm)
* .pipe(parser);
*
* // Only the last step (parser) gets streamed chunks
* for await (const chunk of pipeline.stream(input)) {
* console.log(chunk);
* }
* ```
*
* @example Building a Chat UI
* ```javascript
* async function streamToUI(input) {
* let fullResponse = '';
*
* for await (const chunk of llm.stream(input)) {
* fullResponse += chunk.content;
* updateUI(fullResponse); // Update your UI in real-time
* }
* }
* ```
*/
async* stream(input, config = {}) {
await this._initialize();
// Clear history if requested
if (config.clearHistory) {
this._chatSession.setChatHistory([]);
}
// Handle different input types (same as _call)
let messages;
if (typeof input === 'string') {
messages = [new HumanMessage(input)];
} else if (Array.isArray(input)) {
messages = input;
} else {
throw new Error(
'Input must be a string or array of messages for streaming'
);
}
// Separate the final user turn from the prior conversation (same logic
// as _call): the trailing HumanMessage becomes the prompt text, and
// system messages travel inside the chat history, so no separate
// systemPrompt handling is needed.
const lastMessage = messages[messages.length - 1];
const endsWithHuman = lastMessage && lastMessage._type === 'human';
const promptText = endsWithHuman ? lastMessage.content : '';
const historyMessages = endsWithHuman ? messages.slice(0, -1) : messages;
this._chatSession.setChatHistory(this._messagesToChatHistory(historyMessages));
try {
// Build prompt options
const promptOptions = {
temperature: config.temperature ?? this.temperature,
topP: config.topP ?? this.topP,
topK: config.topK ?? this.topK,
maxTokens: config.maxTokens ?? this.maxTokens,
repeatPenalty: config.repeatPenalty ?? this.repeatPenalty,
customStopTriggers: config.stopStrings ?? this.stopStrings
};
// Use the caller's seed if given; otherwise pick a random seed when sampling
if (promptOptions.temperature > 0 && config.seed === undefined) {
promptOptions.seed = Math.floor(Math.random() * 1000000);
} else if (config.seed !== undefined) {
promptOptions.seed = config.seed;
}
// onTextChunk fires synchronously as text is produced, so we can't
// yield from inside it. Buffer chunks in a local array (local rather
// than instance state, so concurrent streams can't clobber each other)
// and yield them from the polling loop below.
const chunks = [];
promptOptions.onTextChunk = (chunk) => {
chunks.push(chunk);
};
// Start generation (this will call onTextChunk as it generates)
const responsePromise = this._chatSession.prompt(promptText, promptOptions);
// Yield buffered chunks until generation completes
let lastYieldedIndex = 0;
let done = false;
while (!done) {
// Yield any new chunks
while (lastYieldedIndex < chunks.length) {
yield new AIMessage(chunks[lastYieldedIndex], {
additionalKwargs: { chunk: true }
});
lastYieldedIndex++;
}
// Wait briefly for more output, or detect that generation finished
done = await Promise.race([
responsePromise.then(() => true),
new Promise(resolve => setTimeout(() => resolve(false), 10))
]);
}
// Yield any chunks that arrived after the final poll
while (lastYieldedIndex < chunks.length) {
yield new AIMessage(chunks[lastYieldedIndex], {
additionalKwargs: { chunk: true }
});
lastYieldedIndex++;
}
} catch (error) {
throw new Error(`Streaming failed: ${error.message}`);
}
}
/**
* Cleanup resources
*
* LLMs hold resources in memory. Call this when you're done
* to free them up properly.
*
* @async
* @returns {Promise<void>}
*
* @example
* ```javascript
* const llm = new LlamaCppLLM({ modelPath: './model.gguf' });
*
* try {
* const response = await llm.invoke("Hello");
* console.log(response.content);
* } finally {
* await llm.dispose(); // Always clean up!
* }
* ```
*
* @example With Multiple Uses
* ```javascript
* const llm = new LlamaCppLLM({ modelPath: './model.gguf' });
*
* // Use it many times
* await llm.invoke("Question 1");
* await llm.invoke("Question 2");
* await llm.batch(["Q3", "Q4", "Q5"]);
*
* // Clean up when completely done
* await llm.dispose();
* ```
*/
async dispose() {
if (this._context) {
await this._context.dispose();
this._context = null;
}
if (this._model) {
await this._model.dispose();
this._model = null;
}
this._chatSession = null;
this._llama = null;
this._initialized = false;
if (this.verbose) {
console.log('✓ Model resources disposed');
}
}
/**
* String representation for debugging
*
* @returns {string} Human-readable representation
*
* @example
* ```javascript
* const llm = new LlamaCppLLM({ modelPath: './llama-2-7b.gguf' });
* console.log(llm.toString());
* // "LlamaCppLLM(model=./llama-2-7b.gguf)"
*
* // Useful in pipelines
* const pipeline = formatter.pipe(llm).pipe(parser);
* console.log(pipeline.toString());
* // "PromptFormatter() | LlamaCppLLM(model=./llama-2-7b.gguf) | OutputParser()"
* ```
*/
toString() {
return `LlamaCppLLM(model=${this.modelPath})`;
}
}
export default LlamaCppLLM; |