"use strict"; /** * Embedding Service - Unified embedding generation and management * * This service provides a unified interface for generating, caching, and * managing embeddings from various sources (local models, APIs, etc.) */ Object.defineProperty(exports, "__esModule", { value: true }); exports.EmbeddingService = exports.LocalNGramProvider = exports.MockEmbeddingProvider = void 0; exports.createEmbeddingService = createEmbeddingService; exports.getDefaultEmbeddingService = getDefaultEmbeddingService; /** * Simple hash function for cache keys */ function hashText(text) { let hash = 0; for (let i = 0; i < text.length; i++) { const char = text.charCodeAt(i); hash = ((hash << 5) - hash) + char; hash = hash & hash; } return `h${hash.toString(36)}`; } /** * Mock embedding provider for testing */ class MockEmbeddingProvider { constructor(dimensions = 384) { this.name = 'mock'; this.dimensions = dimensions; } async embed(texts) { return texts.map(text => { // Generate deterministic pseudo-random embeddings based on text const embedding = []; let seed = 0; for (let i = 0; i < text.length; i++) { seed = ((seed << 5) - seed + text.charCodeAt(i)) | 0; } for (let i = 0; i < this.dimensions; i++) { seed = (seed * 1103515245 + 12345) | 0; embedding.push((seed % 1000) / 1000 - 0.5); } // Normalize const norm = Math.sqrt(embedding.reduce((s, v) => s + v * v, 0)); return embedding.map(v => v / (norm || 1)); }); } getDimensions() { return this.dimensions; } } exports.MockEmbeddingProvider = MockEmbeddingProvider; /** * Simple local embedding using character n-grams * This is a fallback when no external provider is available */ class LocalNGramProvider { constructor(dimensions = 256, ngramSize = 3) { this.name = 'local-ngram'; this.dimensions = dimensions; this.ngramSize = ngramSize; } async embed(texts) { return texts.map(text => this.embedSingle(text)); } embedSingle(text) { const embedding = new Array(this.dimensions).fill(0); const normalized = text.toLowerCase().replace(/[^a-z0-9]/g, ' '); // Generate n-grams and hash them into embedding dimensions for (let i = 0; i <= normalized.length - this.ngramSize; i++) { const ngram = normalized.slice(i, i + this.ngramSize); const hash = this.hashNgram(ngram); const idx = Math.abs(hash) % this.dimensions; embedding[idx] += hash > 0 ? 1 : -1; } // Normalize const norm = Math.sqrt(embedding.reduce((s, v) => s + v * v, 0)); return embedding.map(v => v / (norm || 1)); } hashNgram(ngram) { let hash = 0; for (let i = 0; i < ngram.length; i++) { hash = ((hash << 5) - hash + ngram.charCodeAt(i)) | 0; } return hash; } getDimensions() { return this.dimensions; } } exports.LocalNGramProvider = LocalNGramProvider; /** * Embedding service with caching and batching */ class EmbeddingService { constructor(config = {}) { this.providers = new Map(); this.cache = new Map(); this.config = { defaultProvider: config.defaultProvider ?? 'local-ngram', maxCacheSize: config.maxCacheSize ?? 10000, cacheTtl: config.cacheTtl ?? 3600000, // 1 hour batchSize: config.batchSize ?? 32, }; // Register default providers this.registerProvider(new LocalNGramProvider()); this.registerProvider(new MockEmbeddingProvider()); } /** * Register an embedding provider */ registerProvider(provider) { this.providers.set(provider.name, provider); } /** * Get a registered provider */ getProvider(name) { const providerName = name ?? this.config.defaultProvider; const provider = this.providers.get(providerName); if (!provider) { throw new Error(`Provider not found: ${providerName}`); } return provider; } /** * Generate embeddings for texts with caching * * @param texts - Texts to embed * @param provider - Provider name (uses default if not specified) * @returns Array of embeddings */ async embed(texts, provider) { const providerInstance = this.getProvider(provider); const providerName = providerInstance.name; const now = Date.now(); // Check cache and collect texts that need embedding const results = new Array(texts.length).fill(null); const uncachedIndices = []; const uncachedTexts = []; for (let i = 0; i < texts.length; i++) { const cacheKey = `${providerName}:${hashText(texts[i])}`; const cached = this.cache.get(cacheKey); if (cached && now - cached.timestamp < this.config.cacheTtl) { results[i] = cached.embedding; cached.hits++; } else { uncachedIndices.push(i); uncachedTexts.push(texts[i]); } } // Generate embeddings for uncached texts in batches if (uncachedTexts.length > 0) { const batches = []; for (let i = 0; i < uncachedTexts.length; i += this.config.batchSize) { batches.push(uncachedTexts.slice(i, i + this.config.batchSize)); } let batchOffset = 0; for (const batch of batches) { const embeddings = await providerInstance.embed(batch); for (let j = 0; j < embeddings.length; j++) { const originalIndex = uncachedIndices[batchOffset + j]; results[originalIndex] = embeddings[j]; // Cache the result const cacheKey = `${providerName}:${hashText(texts[originalIndex])}`; this.addToCache(cacheKey, embeddings[j], now); } batchOffset += batch.length; } } return results; } /** * Generate a single embedding */ async embedOne(text, provider) { const results = await this.embed([text], provider); return results[0]; } /** * Add entry to cache with LRU eviction */ addToCache(key, embedding, timestamp) { // Evict old entries if cache is full if (this.cache.size >= this.config.maxCacheSize) { // Find and remove least recently used entry let oldestKey = ''; let oldestTime = Infinity; let lowestHits = Infinity; for (const [k, v] of this.cache.entries()) { if (v.hits < lowestHits || (v.hits === lowestHits && v.timestamp < oldestTime)) { oldestKey = k; oldestTime = v.timestamp; lowestHits = v.hits; } } if (oldestKey) { this.cache.delete(oldestKey); } } this.cache.set(key, { embedding, timestamp, hits: 0 }); } /** * Compute cosine similarity between two embeddings */ cosineSimilarity(a, b) { if (a.length !== b.length) { throw new Error('Embeddings must have same dimensions'); } let dotProduct = 0; let normA = 0; let normB = 0; for (let i = 0; i < a.length; i++) { dotProduct += a[i] * b[i]; normA += a[i] * a[i]; normB += b[i] * b[i]; } const denom = Math.sqrt(normA) * Math.sqrt(normB); return denom === 0 ? 0 : dotProduct / denom; } /** * Find most similar texts from a corpus */ async findSimilar(query, corpus, k = 5, provider) { const [queryEmbed, ...corpusEmbeds] = await this.embed([query, ...corpus], provider); const results = corpusEmbeds.map((embed, i) => ({ text: corpus[i], similarity: this.cosineSimilarity(queryEmbed, embed), index: i, })); return results .sort((a, b) => b.similarity - a.similarity) .slice(0, k); } /** * Get cache statistics */ getCacheStats() { let totalHits = 0; for (const entry of this.cache.values()) { totalHits += entry.hits; } return { size: this.cache.size, maxSize: this.config.maxCacheSize, hitRate: this.cache.size > 0 ? totalHits / this.cache.size : 0, }; } /** * Clear the cache */ clearCache() { this.cache.clear(); } /** * Get embedding dimensions for a provider */ getDimensions(provider) { return this.getProvider(provider).getDimensions(); } /** * List available providers */ listProviders() { return Array.from(this.providers.keys()); } } exports.EmbeddingService = EmbeddingService; /** * Create an embedding service instance */ function createEmbeddingService(config) { return new EmbeddingService(config); } // Singleton instance let defaultService = null; /** * Get the default embedding service instance */ function getDefaultEmbeddingService() { if (!defaultService) { defaultService = new EmbeddingService(); } return defaultService; } exports.default = { EmbeddingService, LocalNGramProvider, MockEmbeddingProvider, createEmbeddingService, getDefaultEmbeddingService, };