|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Tells whether a value is `null` or is not a number once coerced.
 *
 * The coercing global `isNaN` is used on purpose: numeric strings such as
 * "512" are reported as numbers, while `undefined` and non-numeric strings
 * are reported as not-a-number.
 *
 * @param {*} value - Candidate value to inspect.
 * @returns {boolean} `true` when `value` is `null` or coerces to `NaN`.
 */
function isNullOrNaN(value) {
  // `null` must be checked first: it coerces to 0, so isNaN alone misses it.
  return value === null || isNaN(value);
}
|
|
|
|
|
/**
 * Front-end for chunking document text. Builds a `RecursiveSplitter` from a
 * config object and can prepend a prefix and a `<document_metadata>` header
 * (derived from `config.chunkPrefix` / `config.chunkHeaderMeta`) to chunks.
 */
class TextSplitter {
  /** Splitter instance created once in the constructor. */
  #splitter;

  /**
   * @param {object} [config] - Splitter options. Keys read by this class:
   *   `chunkSize`, `chunkOverlap`, `chunkHeaderMeta` and `chunkPrefix`.
   */
  constructor(config = {}) {
    // A default parameter only covers `undefined`; an explicit `null` would
    // otherwise make every later `this.config.*` access throw.
    this.config = config ?? {};
    this.#splitter = this.#setSplitter(this.config);
  }

  /**
   * Writes a message to stdout tagged with the class name.
   *
   * @param {string} text - Message text.
   * @param {...*} args - Extra values forwarded to `console.log`.
   */
  log(text, ...args) {
    console.log(`\x1b[35m[TextSplitter]\x1b[0m ${text}`, ...args);
  }

  /**
   * Picks the chunk size to use: the preferred size, capped at the embedder
   * limit. Falls back to the embedder limit when no usable preference is
   * given. A warning is printed whenever the preference had to be capped.
   *
   * @param {number|string|null} [preferred=null] - Requested chunk size.
   * @param {number|string} [embedderLimit=1000] - Maximum the embedder accepts.
   * @returns {number} The chunk size to split with.
   */
  static determineMaxChunkSize(preferred = null, embedderLimit = 1000) {
    const limit = Number(embedderLimit);
    const prefValue = isNullOrNaN(preferred) ? limit : Number(preferred);

    // Written as a negated `>` so a NaN limit still returns the preference.
    if (!(prefValue > limit)) return prefValue;

    console.log(
      `\x1b[43m[WARN]\x1b[0m Text splitter chunk length of ${prefValue} exceeds embedder model max of ${embedderLimit}. Will use ${embedderLimit}.`
    );
    return limit;
  }

  /**
   * Extracts the subset of document metadata that is placed in the chunk
   * header, renaming keys on the way:
   * `title` -> `sourceDocument`, `published` -> `published`,
   * `chunkSource` -> `source` (only for `link://` / `youtube://` sources,
   * with that leading prefix removed).
   *
   * @param {object} [metadata] - Raw document metadata.
   * @returns {object|null} `null` when `metadata` is missing or has no keys;
   *   otherwise an object holding only the keys that produced a truthy value
   *   (possibly empty).
   */
  static buildHeaderMeta(metadata = {}) {
    if (!metadata || Object.keys(metadata).length === 0) return null;

    const PLUCK_MAP = {
      title: {
        as: "sourceDocument",
        pluck: (meta) => meta?.title || null,
      },
      published: {
        as: "published",
        pluck: (meta) => meta?.published || null,
      },
      chunkSource: {
        as: "source",
        pluck: (meta) => {
          const validPrefixes = ["link://", "youtube://"];
          const chunkSource = meta?.chunkSource;
          if (typeof chunkSource !== "string") return null;

          // Only the prefix the value actually starts with is removed, and
          // only once, so later occurrences of a prefix inside the source
          // (e.g. in a query string) are left intact.
          const leadingPrefix = validPrefixes.find((prefix) =>
            chunkSource.startsWith(prefix)
          );
          if (!leadingPrefix) return null;

          return chunkSource.slice(leadingPrefix.length) || null;
        },
      },
    };

    const pluckedData = {};
    for (const [key, { as, pluck }] of Object.entries(PLUCK_MAP)) {
      if (!(key in metadata)) continue;
      const pluckedValue = pluck(metadata);
      if (!pluckedValue) continue;
      pluckedData[as] = pluckedValue;
    }
    return pluckedData;
  }

  /**
   * Prepends `config.chunkPrefix` to the given text when a prefix is set.
   *
   * @param {string} [text=""] - Text to prefix.
   * @returns {string} The text, prefixed if a prefix is configured.
   */
  #applyPrefix(text = "") {
    if (!this.config.chunkPrefix) return text;
    return `${this.config.chunkPrefix}${text}`;
  }

  /**
   * Renders `config.chunkHeaderMeta` as a `<document_metadata>` block with
   * one `key: value` line per entry, then applies the configured prefix.
   * Entries with a falsy key or value are skipped. When nothing is left to
   * render, only the prefix (or an empty string) is returned.
   *
   * @returns {string} The header string for chunks.
   */
  stringifyHeader() {
    const headerMeta = this.config.chunkHeaderMeta;
    if (!headerMeta) return this.#applyPrefix("");

    let content = "";
    for (const [key, value] of Object.entries(headerMeta)) {
      if (!key || !value) continue;
      content += `${key}: ${value}\n`;
    }

    if (!content) return this.#applyPrefix(content);
    return this.#applyPrefix(
      `<document_metadata>\n${content}</document_metadata>\n\n`
    );
  }

  /**
   * Creates the underlying splitter from the config.
   *
   * `chunkSize` defaults to 1000 and `chunkOverlap` to 20 when the configured
   * value is `null`, `undefined` or not numeric. `isNullOrNaN` is used rather
   * than bare `isNaN` because `isNaN(null)` is `false` and `Number(null)` is
   * 0, which would silently yield a zero chunk size.
   *
   * @param {object} [config] - Same object as passed to the constructor.
   * @returns {RecursiveSplitter} The configured splitter.
   */
  #setSplitter(config = {}) {
    const chunkSize = isNullOrNaN(config?.chunkSize)
      ? 1_000
      : Number(config.chunkSize);
    const chunkOverlap = isNullOrNaN(config?.chunkOverlap)
      ? 20
      : Number(config.chunkOverlap);

    return new RecursiveSplitter({
      chunkSize,
      chunkOverlap,
      chunkHeader: this.stringifyHeader(),
    });
  }

  /**
   * Splits text using the splitter built at construction time.
   *
   * @param {string} documentText - Full text to split.
   * @returns {Promise<string[]>} Resolves with the splitter's output.
   */
  async splitText(documentText) {
    return this.#splitter._splitText(documentText);
  }
}
|
|
|
|
|
|
|
|
/**
 * Thin wrapper around `RecursiveCharacterTextSplitter` from
 * `@langchain/textsplitters` that can attach a header to the chunks.
 */
class RecursiveSplitter {
  /**
   * @param {object} options
   * @param {number} options.chunkSize - Forwarded to the langchain splitter.
   * @param {number} options.chunkOverlap - Forwarded to the langchain splitter.
   * @param {string|null} [options.chunkHeader=null] - Header passed to
   *   `createDocuments` when splitting; no header handling when falsy.
   */
  constructor({ chunkSize, chunkOverlap, chunkHeader = null }) {
    // Resolved here rather than at module top, so the package is only loaded
    // when a splitter is actually constructed.
    const langchainSplitters = require("@langchain/textsplitters");
    const EngineClass = langchainSplitters.RecursiveCharacterTextSplitter;

    // Only a 50-character preview of the header is logged.
    const headerPreview = chunkHeader
      ? `${chunkHeader.slice(0, 50)}...`
      : null;
    this.log(`Will split with`, {
      chunkSize,
      chunkOverlap,
      chunkHeader: headerPreview,
    });

    this.chunkHeader = chunkHeader;
    this.engine = new EngineClass({ chunkSize, chunkOverlap });
  }

  /**
   * Writes a message to stdout tagged with the class name.
   *
   * @param {string} text - Message text.
   * @param {...*} args - Extra values forwarded to `console.log`.
   */
  log(text, ...args) {
    console.log(`\x1b[35m[RecursiveSplitter]\x1b[0m ${text}`, ...args);
  }

  /**
   * Splits text with the engine. Without a header the engine's chunks are
   * returned directly. With a header, the chunks are passed through
   * `createDocuments` together with the header and the non-empty
   * `pageContent` of each resulting document is returned.
   *
   * @param {string} documentText - Full text to split.
   * @returns {Promise<string[]>} The resulting chunk strings.
   */
  async _splitText(documentText) {
    const pieces = await this.engine.splitText(documentText);
    if (!this.chunkHeader) return pieces;

    // NOTE(review): presumably createDocuments prepends `chunkHeader` to each
    // chunk's pageContent — confirm against the langchain documentation.
    const docs = await this.engine.createDocuments(pieces, [], {
      chunkHeader: this.chunkHeader,
    });

    const contents = [];
    for (const doc of docs) {
      const text = doc.pageContent;
      if (text) contents.push(text);
    }
    return contents;
  }
}
|
|
|
|
|
module.exports.TextSplitter = TextSplitter; |
|
|
|