/** Default upper bound, in UTF-16 code units, for one chunk returned by `splitText`. Adjust as needed. */
const MAX_TEXT_CHUNK_LENGTH = 2000;

/**
 * Splits text into chunks of at most `maxLength` UTF-16 code units.
 *
 * The text is split on "\n" and consecutive lines are packed greedily into a
 * chunk (re-joined with "\n") for as long as the chunk stays within the limit.
 * A single line that is longer than the limit is cut into limit-sized pieces so
 * that no returned chunk exceeds `maxLength`; the cut never separates a
 * surrogate pair unless the limit is 1.
 *
 * Empty chunks are never returned, so an empty input yields an empty array.
 *
 * @param text The text to split. Defaults to the empty string.
 * @param maxLength Maximum chunk length. When it is not a finite number >= 1,
 *   over-long lines are left intact and each non-empty line becomes its own chunk.
 * @returns The chunks, in input order.
 */
export function splitText(
  text: string = "",
  maxLength: number = MAX_TEXT_CHUNK_LENGTH
): string[] {
  const chunks: string[] = [];
  // Hard splitting needs a positive integer step, otherwise the loop below could not advance.
  const hardLimit = Math.floor(maxLength);
  const canHardSplit = Number.isFinite(hardLimit) && hardLimit >= 1;
  let currentChunk = "";

  for (const paragraph of text.split("\n")) {
    // The +1 reserves room for the newline that joins the paragraph to the chunk.
    if (currentChunk.length + paragraph.length + 1 <= maxLength) {
      currentChunk += (currentChunk.length > 0 ? "\n" : "") + paragraph;
      continue;
    }

    // The paragraph does not fit: flush what has been collected so far.
    if (currentChunk.length > 0) {
      chunks.push(currentChunk);
    }

    // Cut an over-long paragraph into limit-sized pieces; the remainder starts the next chunk.
    let start = 0;
    if (canHardSplit) {
      while (paragraph.length - start > hardLimit) {
        let end = start + hardLimit;
        const lastUnit = paragraph.charCodeAt(end - 1);
        // Do not end a piece on a high surrogate; move the cut before it instead.
        if (hardLimit > 1 && lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
          end -= 1;
        }
        chunks.push(paragraph.slice(start, end));
        start = end;
      }
    }
    currentChunk = paragraph.slice(start);
  }

  if (currentChunk.length > 0) {
    chunks.push(currentChunk);
  }
  return chunks;
}
/**
 * Strips a Markdown code-fence wrapper from a JSON payload.
 *
 * After trimming, at most one leading marker is removed: "```json", a bare
 * "json", or "```". A trailing "```" is then removed if present, and the
 * result is trimmed again.
 *
 * @param text Raw text, possibly wrapped in a Markdown code fence.
 * @returns The text without the fence markers and surrounding whitespace.
 */
export function removeJsonMarkdown(text: string) {
  const trimmed = text.trim();

  // Order matters: the longer "```json" must be tried before the plain "```".
  const openers = ["```json", "json", "```"];
  const opener = openers.find((candidate) => trimmed.startsWith(candidate));
  const withoutOpener =
    opener === undefined ? trimmed : trimmed.slice(opener.length);

  const closer = "```";
  const withoutCloser = withoutOpener.endsWith(closer)
    ? withoutOpener.slice(0, withoutOpener.length - closer.length)
    : withoutOpener;

  return withoutCloser.trim();
}
/**
 * Checks whether a text contains anything that looks like an XML or HTML tag.
 *
 * Recognized structures:
 * - Regular tags, e.g. `<p>`, `<div>`
 * - Tags with attributes, e.g. `<a href="...">`
 * - Self-closing tags, e.g. `<br/>`, `<img />`
 * - Closing tags, e.g. `</p>`
 * - XML/HTML comments, e.g. `<!-- comment -->`
 * - XML processing instructions, e.g. `<?xml version="1.0"?>`
 * - CDATA sections, e.g. `<![CDATA[ ... ]]>`
 * - DOCTYPE declarations, e.g. `<!DOCTYPE html>`
 *
 * Note: this is a fast pattern-based detection, not a parser. Text that merely
 * resembles a tag (for example `a<b && c>d`) is reported as a tag. Strict
 * validation requires a full parser.
 *
 * @param text The text to inspect.
 * @returns `true` if the text contains any structure that looks like an
 *   XML/HTML tag, otherwise `false` (including for empty or non-string input).
 */
export function containsXmlHtmlTags(text: string): boolean {
  // Runtime guard for untyped callers; an empty string cannot contain a tag.
  if (typeof text !== "string" || text.length === 0) {
    return false;
  }

  // Alternatives, in order:
  // 1. <!--[\s\S]*?-->          comments (may span several lines)
  // 2. <!\[CDATA\[[\s\S]*?\]\]> CDATA sections (may span several lines)
  // 3. <!DOCTYPE[^>]*>          DOCTYPE declarations
  // 4. <\?[\s\S]*?\?>           processing instructions such as <?xml ... ?>
  // 5. <[!\/]?[a-zA-Z][^>]*>    opening, closing and self-closing tags:
  //      <           literal '<'
  //      [!\/]?      optional '!' (declarations) or '/' (closing tags)
  //      [a-zA-Z]    tag names start with a letter
  //      [^>]*       rest of the name, attributes, self-closing '/'
  //      >           literal '>'
  //
  // [\s\S] is used instead of '.' so that line breaks inside a construct are
  // matched. The 'i' flag makes DOCTYPE/CDATA case-insensitive. Every
  // alternative requires a literal '<', so plain text never matches.
  const xmlHtmlTagRegex =
    /<!--[\s\S]*?-->|<!\[CDATA\[[\s\S]*?\]\]>|<!DOCTYPE[^>]*>|<\?[\s\S]*?\?>|<[!\/]?[a-zA-Z][^>]*>/i;

  // test() stops at the first match, which is all that is needed here.
  return xmlHtmlTagRegex.test(text);
}
/**
 * Separates one leading `<think>...</think>` block from a streamed text.
 *
 * Feed the stream chunk by chunk to `processChunk`. If the stream starts with
 * `<think>`, the text between the tags is routed to the thinking callback and
 * everything after `</think>` to the content callback. If the stream does not
 * start with `<think>`, all text is content. Only a block at the very start of
 * the stream is recognized; later tags are passed through as content.
 *
 * Tags may be split across chunk boundaries. Text that could still turn out
 * to be part of a tag is held back until the next chunk decides.
 */
export class ThinkTagStreamProcessor {
  private static readonly START_TAG = "<think>";
  private static readonly END_TAG = "</think>";

  /** Text received but not yet routed to a callback. */
  private buffer: string = "";
  /** True once the think block is finished or known to be absent. */
  private hasSkippedThinkBlock: boolean = false;
  /** True after the start tag was consumed and before the end tag is found. */
  private insideThinkBlock: boolean = false;

  /**
   * Processes the next chunk of the stream.
   * @param chunk The received text chunk.
   * @param contentOutput Called with text that lies outside the think block.
   * @param thinkingOutput Optional. Called with text inside the think block,
   *   without the tag markup.
   */
  processChunk(
    chunk: string,
    contentOutput: (data: string) => void,
    thinkingOutput?: (data: string) => void
  ): void {
    // Once the think block is behind us (or absent), everything is content.
    if (this.hasSkippedThinkBlock) {
      contentOutput(chunk);
      return;
    }

    this.buffer += chunk;

    if (!this.insideThinkBlock) {
      const startTag = ThinkTagStreamProcessor.START_TAG;
      if (this.buffer.startsWith(startTag)) {
        this.insideThinkBlock = true;
        this.buffer = this.buffer.slice(startTag.length);
      } else if (startTag.startsWith(this.buffer)) {
        // Empty so far, or an incomplete start tag: wait for more data.
        return;
      } else {
        // The stream does not open with a think block.
        const pending = this.buffer;
        this.buffer = "";
        this.hasSkippedThinkBlock = true;
        contentOutput(pending);
        return;
      }
    }

    // Inside the think block: the buffer holds thinking text not yet emitted.
    const endTag = ThinkTagStreamProcessor.END_TAG;
    const endTagIndex = this.buffer.indexOf(endTag);
    if (endTagIndex !== -1) {
      const thinking = this.buffer.slice(0, endTagIndex);
      const contentAfterThink = this.buffer.slice(endTagIndex + endTag.length);
      this.buffer = "";
      this.insideThinkBlock = false;
      this.hasSkippedThinkBlock = true;
      if (thinkingOutput && thinking.length > 0) {
        thinkingOutput(thinking);
      }
      if (contentAfterThink.length > 0) {
        contentOutput(contentAfterThink);
      }
      return;
    }

    // No end tag yet: emit everything except a tail that may be the beginning of the end tag.
    const heldBack = ThinkTagStreamProcessor.pendingEndTagLength(this.buffer);
    const ready = this.buffer.slice(0, this.buffer.length - heldBack);
    this.buffer = this.buffer.slice(ready.length);
    if (thinkingOutput && ready.length > 0) {
      thinkingOutput(ready);
    }
  }

  /**
   * Finishes the stream and resets the processor so it can be reused.
   *
   * Text that is still held back (an unresolved partial tag) is passed to the
   * matching callback when one is supplied; without callbacks it is discarded.
   * @param contentOutput Optional. Receives held-back text outside a think block.
   * @param thinkingOutput Optional. Receives held-back text inside an unclosed think block.
   */
  end(
    contentOutput?: (data: string) => void,
    thinkingOutput?: (data: string) => void
  ): void {
    const pending = this.buffer;
    const wasThinking = this.insideThinkBlock;

    this.buffer = "";
    this.hasSkippedThinkBlock = false;
    this.insideThinkBlock = false;

    if (pending.length === 0) {
      return;
    }
    if (wasThinking) {
      if (thinkingOutput) thinkingOutput(pending);
    } else if (contentOutput) {
      contentOutput(pending);
    }
  }

  /** Returns the length of the longest proper prefix of the end tag that `text` ends with. */
  private static pendingEndTagLength(text: string): number {
    const endTag = ThinkTagStreamProcessor.END_TAG;
    for (let k = Math.min(endTag.length - 1, text.length); k > 0; k--) {
      if (text.endsWith(endTag.slice(0, k))) {
        return k;
      }
    }
    return 0;
  }
}