| | "use strict"; |
| | |
| | |
| | |
| | |
| | |
| | |
| | Object.defineProperty(exports, "__esModule", { value: true }); |
| | exports.EmbeddingService = exports.LocalNGramProvider = exports.MockEmbeddingProvider = void 0; |
| | exports.createEmbeddingService = createEmbeddingService; |
| | exports.getDefaultEmbeddingService = getDefaultEmbeddingService; |
| | |
| | |
| | |
/**
 * Produce a short, stable cache-key fragment for a string using a signed
 * 32-bit Java-style (x31) rolling hash, rendered in base 36 with an "h"
 * prefix (negative hashes yield e.g. "h-1xyz").
 *
 * Collisions are possible (only 32 bits of state); callers use this purely
 * as a cache key, where a rare collision is tolerable.
 *
 * @param {string} text - input to hash
 * @returns {string} e.g. "h2p" for "a", "h0" for ""
 */
function hashText(text) {
    let hash = 0;
    for (let i = 0; i < text.length; i += 1) {
        // hash * 31 === (hash << 5) - hash; `| 0` truncates to int32 each step.
        hash = (hash * 31 + text.charCodeAt(i)) | 0;
    }
    return `h${hash.toString(36)}`;
}
| | |
| | |
| | |
/**
 * Deterministic pseudo-random embedding provider, intended for tests.
 * Seeds a linear congruential generator from the text's characters, so
 * identical inputs always map to identical unit-length vectors.
 */
class MockEmbeddingProvider {
    /** @param {number} [dimensions=384] vector length to produce */
    constructor(dimensions = 384) {
        this.name = 'mock';
        this.dimensions = dimensions;
    }
    /**
     * @param {string[]} texts
     * @returns {Promise<number[][]>} one L2-normalized vector per input text
     */
    async embed(texts) {
        return texts.map((text) => this.generate(text));
    }
    /** Build one deterministic unit vector for a single text. */
    generate(text) {
        // Fold the characters into a 32-bit seed (x31 rolling hash).
        let seed = 0;
        for (let i = 0; i < text.length; i += 1) {
            seed = (seed * 31 + text.charCodeAt(i)) | 0;
        }
        // LCG step per dimension. NOTE: `seed * 1103515245` deliberately
        // overflows 2^53 before `| 0`; keep this exact expression (no
        // Math.imul) so values stay bit-identical.
        const values = Array.from({ length: this.dimensions }, () => {
            seed = (seed * 1103515245 + 12345) | 0;
            return (seed % 1000) / 1000 - 0.5;
        });
        // L2-normalize; `norm || 1` guards the all-zero vector.
        const norm = Math.sqrt(values.reduce((acc, v) => acc + v * v, 0));
        return values.map((v) => v / (norm || 1));
    }
    /** @returns {number} configured vector length */
    getDimensions() {
        return this.dimensions;
    }
}
| | exports.MockEmbeddingProvider = MockEmbeddingProvider; |
| | |
| | |
| | |
| | |
/**
 * Lightweight local embedding provider based on hashed character n-grams.
 * Lowercases the input, collapses non-alphanumerics to spaces, hashes every
 * n-gram into one of `dimensions` buckets (signed by the hash), and
 * L2-normalizes the resulting vector.
 */
class LocalNGramProvider {
    /**
     * @param {number} [dimensions=256] vector length / bucket count
     * @param {number} [ngramSize=3] characters per n-gram
     */
    constructor(dimensions = 256, ngramSize = 3) {
        this.name = 'local-ngram';
        this.dimensions = dimensions;
        this.ngramSize = ngramSize;
    }
    /**
     * @param {string[]} texts
     * @returns {Promise<number[][]>} one vector per input text
     */
    async embed(texts) {
        return texts.map((text) => this.embedSingle(text));
    }
    /** Embed one string; texts shorter than ngramSize yield the zero vector. */
    embedSingle(text) {
        const vector = new Array(this.dimensions).fill(0);
        const cleaned = text.toLowerCase().replace(/[^a-z0-9]/g, ' ');
        const lastStart = cleaned.length - this.ngramSize;
        for (let start = 0; start <= lastStart; start += 1) {
            const code = this.hashNgram(cleaned.slice(start, start + this.ngramSize));
            const bucket = Math.abs(code) % this.dimensions;
            // Signed counting: positive hashes add, non-positive subtract.
            vector[bucket] += code > 0 ? 1 : -1;
        }
        // L2-normalize; `norm || 1` keeps the zero vector as-is.
        const norm = Math.sqrt(vector.reduce((acc, v) => acc + v * v, 0));
        return vector.map((v) => v / (norm || 1));
    }
    /** Signed 32-bit rolling hash (x31) over the n-gram's UTF-16 units. */
    hashNgram(ngram) {
        let code = 0;
        for (let i = 0; i < ngram.length; i += 1) {
            code = (code * 31 + ngram.charCodeAt(i)) | 0;
        }
        return code;
    }
    /** @returns {number} configured vector length */
    getDimensions() {
        return this.dimensions;
    }
}
| | exports.LocalNGramProvider = LocalNGramProvider; |
| | |
| | |
| | |
/**
 * Embedding facade that routes requests to registered providers and
 * memoizes results in a TTL-bounded, size-capped in-memory cache.
 *
 * Cache keys combine the provider name with hashText(text); hashText has
 * only 32 bits of state, so distinct texts can (rarely) collide and share
 * a cache slot.
 */
class EmbeddingService {
    /**
     * @param {{defaultProvider?: string, maxCacheSize?: number,
     *          cacheTtl?: number, batchSize?: number}} [config]
     */
    constructor(config = {}) {
        /** @type {Map<string, object>} provider name -> provider instance */
        this.providers = new Map();
        /** @type {Map<string, {embedding: number[], timestamp: number, hits: number}>} */
        this.cache = new Map();
        this.config = {
            defaultProvider: config.defaultProvider ?? 'local-ngram',
            maxCacheSize: config.maxCacheSize ?? 10000,
            cacheTtl: config.cacheTtl ?? 3600000, // 1 hour, in ms
            batchSize: config.batchSize ?? 32,
        };
        // Built-in providers are always available out of the box.
        this.registerProvider(new LocalNGramProvider());
        this.registerProvider(new MockEmbeddingProvider());
    }

    /** Register (or replace) a provider under its own `name` property. */
    registerProvider(provider) {
        this.providers.set(provider.name, provider);
    }

    /**
     * Look up a provider by name, falling back to the configured default.
     * @param {string} [name]
     * @throws {Error} when no provider is registered under that name
     */
    getProvider(name) {
        const wanted = name ?? this.config.defaultProvider;
        const found = this.providers.get(wanted);
        if (!found) {
            throw new Error(`Provider not found: ${wanted}`);
        }
        return found;
    }

    /**
     * Embed a list of texts, serving fresh cache entries where possible
     * and sending the remaining misses through the provider in
     * `batchSize`-sized chunks.
     *
     * @param {string[]} texts
     * @param {string} [provider] provider name (default: configured default)
     * @returns {Promise<number[][]>} embeddings in the same order as `texts`
     */
    async embed(texts, provider) {
        const active = this.getProvider(provider);
        const now = Date.now();
        const keyFor = (text) => `${active.name}:${hashText(text)}`;

        const results = new Array(texts.length).fill(null);
        const missIndices = [];
        for (let i = 0; i < texts.length; i += 1) {
            const entry = this.cache.get(keyFor(texts[i]));
            if (entry && now - entry.timestamp < this.config.cacheTtl) {
                entry.hits++;
                results[i] = entry.embedding;
            } else {
                missIndices.push(i);
            }
        }

        // Embed cache misses chunk by chunk, writing each vector back into
        // its original slot and into the cache (stamped with `now`).
        for (let start = 0; start < missIndices.length; start += this.config.batchSize) {
            const chunk = missIndices.slice(start, start + this.config.batchSize);
            const vectors = await active.embed(chunk.map((i) => texts[i]));
            chunk.forEach((originalIndex, j) => {
                results[originalIndex] = vectors[j];
                this.addToCache(keyFor(texts[originalIndex]), vectors[j], now);
            });
        }
        return results;
    }

    /** Convenience wrapper: embed a single text and return its vector. */
    async embedOne(text, provider) {
        const [embedding] = await this.embed([text], provider);
        return embedding;
    }

    /**
     * Insert a cache entry, evicting one victim first when the cache is at
     * capacity. The victim is the entry with the fewest hits, ties broken
     * by oldest timestamp (an LFU-then-LRU policy).
     */
    addToCache(key, embedding, timestamp) {
        if (this.cache.size >= this.config.maxCacheSize) {
            let victimKey = '';
            let victimTime = Infinity;
            let victimHits = Infinity;
            for (const [k, entry] of this.cache.entries()) {
                const lessUsed = entry.hits < victimHits ||
                    (entry.hits === victimHits && entry.timestamp < victimTime);
                if (lessUsed) {
                    victimKey = k;
                    victimTime = entry.timestamp;
                    victimHits = entry.hits;
                }
            }
            if (victimKey) {
                this.cache.delete(victimKey);
            }
        }
        this.cache.set(key, { embedding, timestamp, hits: 0 });
    }

    /**
     * Cosine similarity of two equal-length vectors; returns 0 when either
     * vector has zero magnitude.
     * @throws {Error} when the lengths differ
     */
    cosineSimilarity(a, b) {
        if (a.length !== b.length) {
            throw new Error('Embeddings must have same dimensions');
        }
        let dot = 0;
        let magSqA = 0;
        let magSqB = 0;
        for (let i = 0; i < a.length; i += 1) {
            dot += a[i] * b[i];
            magSqA += a[i] * a[i];
            magSqB += b[i] * b[i];
        }
        const denom = Math.sqrt(magSqA) * Math.sqrt(magSqB);
        return denom === 0 ? 0 : dot / denom;
    }

    /**
     * Rank `corpus` entries by cosine similarity to `query` and return the
     * top `k` as {text, similarity, index} records, most similar first.
     */
    async findSimilar(query, corpus, k = 5, provider) {
        const [queryVec, ...corpusVecs] = await this.embed([query, ...corpus], provider);
        const scored = corpusVecs.map((vec, index) => ({
            text: corpus[index],
            similarity: this.cosineSimilarity(queryVec, vec),
            index,
        }));
        scored.sort((x, y) => y.similarity - x.similarity);
        return scored.slice(0, k);
    }

    /**
     * Cache statistics. NOTE: `hitRate` here is total hits divided by entry
     * count (average hits per cached entry), not a hits/lookups ratio.
     */
    getCacheStats() {
        let totalHits = 0;
        for (const entry of this.cache.values()) {
            totalHits += entry.hits;
        }
        const size = this.cache.size;
        return {
            size,
            maxSize: this.config.maxCacheSize,
            hitRate: size > 0 ? totalHits / size : 0,
        };
    }

    /** Drop every cached embedding. */
    clearCache() {
        this.cache.clear();
    }

    /** Vector length produced by the named (or default) provider. */
    getDimensions(provider) {
        return this.getProvider(provider).getDimensions();
    }

    /** Names of all registered providers. */
    listProviders() {
        return Array.from(this.providers.keys());
    }
}
| | exports.EmbeddingService = EmbeddingService; |
| | |
| | |
| | |
/**
 * Factory helper: build a fresh EmbeddingService.
 * (Kept as a function declaration — it is hoisted, and the module assigns
 * it to `exports` before this point in the file.)
 * @param {object} [config] forwarded to the EmbeddingService constructor
 * @returns {EmbeddingService}
 */
function createEmbeddingService(config) {
    const service = new EmbeddingService(config);
    return service;
}
| | |
// Lazily-created module-wide singleton service.
let defaultService = null;

/**
 * Return the shared EmbeddingService, constructing it on first call.
 * (`??=` is equivalent to the original truthiness check here because the
 * variable only ever holds null or a truthy instance.)
 * @returns {EmbeddingService}
 */
function getDefaultEmbeddingService() {
    defaultService ??= new EmbeddingService();
    return defaultService;
}
| | exports.default = { |
| | EmbeddingService, |
| | LocalNGramProvider, |
| | MockEmbeddingProvider, |
| | createEmbeddingService, |
| | getDefaultEmbeddingService, |
| | }; |
| |
|