Spaces:

leowuming
/

rag

Runtime error

App Files Files Community

rag / server /utils /TextSplitter /index.js

gaojintao01

Add files using Git LFS

f8b5d42 4 months ago

history blame contribute delete

7.55 kB

	/**
	* @typedef {object} DocumentMetadata
	* @property {string} id - eg; "123e4567-e89b-12d3-a456-426614174000"
	* @property {string} url - eg; "file://example.com/index.html"
	* @property {string} title - eg; "example.com/index.html"
	* @property {string} docAuthor - eg; "no author found"
	* @property {string} description - eg; "No description found."
	* @property {string} docSource - eg; "URL link uploaded by the user."
	* @property {string} chunkSource - eg; "link://https://example.com"
	* @property {string} published - ISO 8601 date string
	* @property {number} wordCount - Number of words in the document
	* @property {string} pageContent - The raw text content of the document
	* @property {number} token_count_estimate - Number of tokens in the document
	*/

	/**
	 * Tells whether a value is unusable as a number.
	 * `null` is special-cased because the coercing global `isNaN` turns it
	 * into 0 and would otherwise report it as numeric. Everything else goes
	 * through that same coercing check, so `undefined` and non-numeric strings
	 * count as NaN, while numeric strings (and the empty string, which coerces
	 * to 0) do not.
	 * @param {*} value - The value to inspect.
	 * @returns {boolean} True when the value is null or coerces to NaN.
	 */
	function isNullOrNaN(value) {
	  return value === null || isNaN(value);
	}

	/**
	 * Splits raw document text into chunks, optionally prepending a prefix and a
	 * `<document_metadata>` header to every chunk. The actual splitting is
	 * delegated to the RecursiveSplitter wrapper built in the constructor.
	 */
	class TextSplitter {
	  #splitter;

	  /**
	   * Creates a new TextSplitter instance.
	   * @param {Object} config
	   * @param {string} [config.chunkPrefix = ""] - Prefix to be added to the start of each chunk.
	   * @param {number} [config.chunkSize = 1000] - The size of each chunk.
	   * @param {number} [config.chunkOverlap = 20] - The overlap between chunks.
	   * @param {Object} [config.chunkHeaderMeta = null] - Metadata to be added to the start of each chunk - will come after the prefix.
	   */
	  constructor(config = {}) {
	    // config must be stored first: #setSplitter calls stringifyHeader(), which reads this.config.
	    this.config = config;
	    this.#splitter = this.#setSplitter(config);
	  }

	  /**
	   * Writes a tagged message to stdout.
	   * @param {string} text - Message to log.
	   * @param {...*} args - Extra values forwarded to console.log.
	   */
	  log(text, ...args) {
	    console.log(`\x1b[35m[TextSplitter]\x1b[0m ${text}`, ...args);
	  }

	  /**
	   * Does a quick check to determine the text chunk length limit.
	   * Embedder models have hard-set limits that cannot be exceeded, just like an LLM context
	   * so here we want to allow override of the default 1000, but up to the models maximum, which is
	   * sometimes user defined.
	   * @param {number|string|null} [preferred = null] - Preferred chunk size; null/NaN falls back to the embedder limit.
	   * @param {number|string} [embedderLimit = 1000] - Maximum chunk length the embedder accepts.
	   * @returns {number} The preferred size, capped at the embedder limit.
	   */
	  static determineMaxChunkSize(preferred = null, embedderLimit = 1000) {
	    const limit = Number(embedderLimit);
	    const prefValue = isNullOrNaN(preferred) ? limit : Number(preferred);
	    if (prefValue > limit) {
	      console.log(
	        `\x1b[43m[WARN]\x1b[0m Text splitter chunk length of ${prefValue} exceeds embedder model max of ${embedderLimit}. Will use ${embedderLimit}.`
	      );
	      return limit;
	    }
	    return prefValue;
	  }

	  /**
	   * Plucks the metadata fields that should be prepended to each chunk.
	   * `title` is emitted as `sourceDocument`, `published` as `published`, and a
	   * `chunkSource` starting with `link://` or `youtube://` is emitted as `source`
	   * with that prefix removed. Missing or empty values are skipped.
	   * @param {DocumentMetadata} metadata - Metadata to be prepended to each chunk.
	   * @returns {{sourceDocument?: string, published?: string, source?: string} | null}
	   * Object of metadata that will be prepended to each chunk, or null when no metadata was given.
	   */
	  static buildHeaderMeta(metadata = {}) {
	    if (!metadata || Object.keys(metadata).length === 0) return null;
	    const PLUCK_MAP = {
	      title: {
	        as: "sourceDocument",
	        pluck: (metadata) => metadata?.title || null,
	      },
	      published: {
	        as: "published",
	        pluck: (metadata) => metadata?.published || null,
	      },
	      chunkSource: {
	        as: "source",
	        pluck: (metadata) => {
	          const validPrefixes = ["link://", "youtube://"];
	          // If the chunkSource is a link or youtube link, we can add the URL
	          // as its source in the metadata so the LLM can use it for context.
	          // eg prompt: Where did you get this information? -> answer: "from https://example.com"
	          const chunkSource = metadata?.chunkSource;
	          if (typeof chunkSource !== "string" || chunkSource.length === 0)
	            return null;

	          const prefix = validPrefixes.find((candidate) =>
	            chunkSource.startsWith(candidate)
	          );
	          if (!prefix) return null;

	          // Strip only the leading prefix. Splitting on the prefix would cut the
	          // source short whenever the same text appears again later in the URL.
	          // An empty remainder yields null so it is not added to the metadata.
	          return chunkSource.slice(prefix.length) || null;
	        },
	      },
	    };

	    const pluckedData = {};
	    for (const [key, value] of Object.entries(PLUCK_MAP)) {
	      if (!(key in metadata)) continue; // Skip if the metadata key is not present.
	      const pluckedValue = value.pluck(metadata);
	      if (!pluckedValue) continue; // Skip if the plucked value is null/empty.
	      pluckedData[value.as] = pluckedValue;
	    }

	    return pluckedData;
	  }

	  /**
	   * Apply the chunk prefix to the text if it is present.
	   * @param {string} text - The text to apply the prefix to.
	   * @returns {string} The text with the embedder model prefix applied.
	   */
	  #applyPrefix(text = "") {
	    if (!this.config.chunkPrefix) return text;
	    return `${this.config.chunkPrefix}${text}`;
	  }

	  /**
	   * Creates a string of metadata to be prepended to each chunk.
	   * Each truthy key/value pair of `config.chunkHeaderMeta` becomes a `key: value` line
	   * wrapped in a `<document_metadata>` block; with no usable pairs the header is empty.
	   * Will additionally prepend a prefix to the text if it was provided (requirement for some embedders).
	   * @returns {string} The header text with the embedder model prefix applied.
	   */
	  stringifyHeader() {
	    const headerMeta = this.config.chunkHeaderMeta;
	    if (!headerMeta) return this.#applyPrefix("");

	    let content = "";
	    for (const [key, value] of Object.entries(headerMeta)) {
	      if (!key || !value) continue;
	      content += `${key}: ${value}\n`;
	    }

	    if (!content) return this.#applyPrefix(content);
	    return this.#applyPrefix(
	      `<document_metadata>\n${content}</document_metadata>\n\n`
	    );
	  }

	  /**
	   * Sets the splitter to use a defined config passes to other subclasses.
	   * @param {Object} config
	   * @param {string} [config.chunkPrefix = ""] - Prefix to be added to the start of each chunk.
	   * @param {number} [config.chunkSize = 1000] - The size of each chunk.
	   * @param {number} [config.chunkOverlap = 20] - The overlap between chunks.
	   * @returns {RecursiveSplitter} The configured splitter.
	   */
	  #setSplitter(config = {}) {
	    // if (!config?.splitByFilename) {// TODO do something when specific extension is present? }
	    // isNullOrNaN (not bare isNaN) so an explicit null falls back to the default
	    // instead of being coerced to 0 by Number(null).
	    return new RecursiveSplitter({
	      chunkSize: isNullOrNaN(config?.chunkSize)
	        ? 1_000
	        : Number(config?.chunkSize),
	      chunkOverlap: isNullOrNaN(config?.chunkOverlap)
	        ? 20
	        : Number(config?.chunkOverlap),
	      chunkHeader: this.stringifyHeader(),
	    });
	  }

	  /**
	   * Splits the document text with the configured splitter.
	   * @param {string} documentText - The raw text to split.
	   * @returns {Promise<string[]>} Whatever the underlying splitter's `_splitText` resolves to.
	   */
	  async splitText(documentText) {
	    return this.#splitter._splitText(documentText);
	  }
	}

	/**
	 * Wrapper for Langchain default RecursiveCharacterTextSplitter class.
	 * Holds the underlying splitter as `engine` and an optional `chunkHeader`
	 * string that is applied to every chunk via `createDocuments`.
	 */
	class RecursiveSplitter {
	  /**
	   * @param {Object} options
	   * @param {number} options.chunkSize - Forwarded to the Langchain splitter.
	   * @param {number} options.chunkOverlap - Forwarded to the Langchain splitter.
	   * @param {string|null} [options.chunkHeader = null] - Header passed to createDocuments for each chunk.
	   */
	  constructor({ chunkSize, chunkOverlap, chunkHeader = null }) {
	    // Loaded here rather than at module top, so the package is only required on construction.
	    const Engine = require("@langchain/textsplitters")
	      .RecursiveCharacterTextSplitter;

	    // Only the first 50 characters of the header are logged.
	    const headerPreview = chunkHeader
	      ? `${chunkHeader.slice(0, 50)}...`
	      : null;
	    this.log(`Will split with`, {
	      chunkSize,
	      chunkOverlap,
	      chunkHeader: headerPreview,
	    });

	    this.chunkHeader = chunkHeader;
	    this.engine = new Engine({ chunkSize, chunkOverlap });
	  }

	  /**
	   * Writes a tagged message to stdout.
	   * @param {string} text - Message to log.
	   * @param {...*} args - Extra values forwarded to console.log.
	   */
	  log(text, ...args) {
	    console.log(`\x1b[35m[RecursiveSplitter]\x1b[0m ${text}`, ...args);
	  }

	  /**
	   * Splits the text with the engine. Without a chunk header the engine's
	   * chunks are returned as-is; with one, the chunks are passed through
	   * `createDocuments` together with the header and the non-empty
	   * `pageContent` values are returned.
	   * @param {string} documentText - The raw text to split.
	   * @returns {Promise<string[]>} The resulting chunk strings.
	   */
	  async _splitText(documentText) {
	    const header = this.chunkHeader;
	    const pieces = await this.engine.splitText(documentText);
	    if (!header) return pieces;

	    const docs = await this.engine.createDocuments(pieces, [], {
	      chunkHeader: header,
	    });

	    const contents = [];
	    for (const { pageContent } of docs) {
	      if (pageContent) contents.push(pageContent);
	    }
	    return contents;
	  }
	}

	// CommonJS export of the TextSplitter class (the only export visible in this file).
	module.exports.TextSplitter = TextSplitter;