const MAX_TEXT_CHUNK_LENGTH = 2000; // Default chunk size; adjust as needed.

/**
 * Split text into chunks of at most `maxLength` UTF-16 code units, breaking on
 * line boundaries ("\n") wherever possible.
 *
 * Consecutive lines are packed into the same chunk (re-joined with "\n") as
 * long as the result fits. A single line that is longer than `maxLength` is
 * cut into `maxLength`-sized slices so that no chunk exceeds the limit; the
 * last slice stays open so that following lines may still be packed onto it.
 *
 * @param text The text to split. Defaults to an empty string.
 * @param maxLength Maximum length of each chunk. Values below 1 (or NaN)
 *   disable hard-splitting, so every non-empty line becomes its own chunk.
 * @returns The non-empty chunks in their original order; `[]` for empty text.
 */
export function splitText(
  text: string = "",
  maxLength: number = MAX_TEXT_CHUNK_LENGTH
): string[] {
  const paragraphs = text.split("\n");
  const chunks: string[] = [];
  let currentChunk = "";

  for (const paragraph of paragraphs) {
    if (maxLength >= 1 && paragraph.length > maxLength) {
      // The line can never fit into a single chunk: close the chunk in
      // progress and hard-split the line instead of emitting an oversized chunk.
      if (currentChunk.length > 0) {
        chunks.push(currentChunk);
      }
      currentChunk = pushOversizedParagraphSlices(
        paragraph,
        Math.floor(maxLength),
        chunks
      );
      continue;
    }

    if (currentChunk.length + paragraph.length + 1 <= maxLength) {
      // +1 accounts for the newline separator.
      currentChunk += (currentChunk.length > 0 ? "\n" : "") + paragraph;
    } else {
      if (currentChunk.length > 0) {
        chunks.push(currentChunk);
      }
      currentChunk = paragraph;
    }
  }

  if (currentChunk.length > 0) {
    chunks.push(currentChunk);
  }
  return chunks;
}

/**
 * Push `limit`-sized slices of an oversized line onto `chunks` and return the
 * trailing remainder (non-empty, at most `limit` long).
 *
 * `limit` must be an integer >= 1. A cut that would land between the two
 * halves of a surrogate pair is moved one code unit earlier, unless the slice
 * would then be empty.
 */
function pushOversizedParagraphSlices(
  paragraph: string,
  limit: number,
  chunks: string[]
): string {
  let start = 0;
  while (paragraph.length - start > limit) {
    let end = start + limit;
    if (end - start > 1) {
      const lastUnit = paragraph.charCodeAt(end - 1);
      // 0xD800-0xDBFF is the high-surrogate range; keep the pair together.
      if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        end -= 1;
      }
    }
    chunks.push(paragraph.slice(start, end));
    start = end;
  }
  return paragraph.slice(start);
}
/**
 * Strip a Markdown code-fence wrapper from a JSON payload.
 *
 * After trimming, one leading marker is removed if present: "```json", a bare
 * "json", or "```". The "json" tag is matched case-insensitively so that
 * "```JSON" is handled like "```json". A trailing "```" is removed as well,
 * and the result is trimmed again.
 *
 * @param text Raw text that may be wrapped in a fenced code block.
 * @returns The text without the fence markers and surrounding whitespace.
 */
export function removeJsonMarkdown(text: string): string {
  // Alternation order matters: the longest marker has to be tried first.
  let body = text.trim().replace(/^(?:```json|json|```)/i, "");
  if (body.endsWith("```")) {
    body = body.slice(0, -3);
  }
  return body.trim();
}
/**
 * Check whether a text contains something that looks like an XML or HTML tag.
 *
 * Recognised structures:
 * - Regular tags (such as <p>, <div>)
 * - Tags with attributes (such as <a href="...">)
 * - Self-closing tags (such as <img />, <br>)
 * - Closing tags (such as </p>)
 * - XML/HTML comments (such as <!-- ... -->), including multi-line ones
 * - XML processing instructions (such as <?xml ... ?>), including multi-line ones
 * - CDATA sections (such as <![CDATA[ ... ]]>), including multi-line ones
 * - DOCTYPE and other <!NAME ...> declarations (such as <!DOCTYPE html>)
 *
 * Note: this is a fast pattern-based detection, not a parser. It may report
 * text that merely resembles a tag; strict validation requires a full parser.
 *
 * @param text The text to inspect.
 * @returns `true` if the text contains any structure that looks like an
 *   XML/HTML tag, otherwise `false` (also for empty or non-string input).
 */
export function containsXmlHtmlTags(text: string): boolean {
  // Runtime guard: callers outside the type system may pass non-strings.
  if (typeof text !== "string" || text.length === 0) {
    return false;
  }
  // Alternatives, in order:
  //   1. <!--[\s\S]*?-->          comment
  //   2. <![CDATA[[\s\S]*?]]>     CDATA section
  //   3. <!DOCTYPE[^>]*?>         DOCTYPE declaration
  //   4. <\?[\s\S]*?\?>           processing instruction
  //   5. <[!\/]?[a-zA-Z][^>]*?>   opening, closing and self-closing tags, <!ELEMENT ...>
  // [\s\S] is used instead of "." because "." does not match line breaks, which
  // would leave comments, CDATA sections and processing instructions that span
  // several lines undetected.
  // The "i" flag makes the literal keywords (DOCTYPE, CDATA) case-insensitive.
  const xmlHtmlTagRegex =
    /<!--[\s\S]*?-->|<!\[CDATA\[[\s\S]*?]]>|<!DOCTYPE[^>]*?>|<\?[\s\S]*?\?>|<[!\/]?[a-zA-Z][^>]*?>/i;
  return xmlHtmlTagRegex.test(text);
}
/**
 * Separates a leading `<think>...</think>` block from the rest of a streamed
 * text response.
 *
 * Chunks are fed in one at a time. If the stream starts with `<think>`, text
 * is forwarded to the thinking callback as it arrives until `</think>` is
 * seen; everything after the closing tag goes to the content callback. If the
 * stream does not start with `<think>`, all text goes to the content callback.
 *
 * Thinking text is forwarded raw, so it includes the opening `<think>` tag and
 * may include a fragment of the closing tag when that tag is split across
 * chunks.
 */
export class ThinkTagStreamProcessor {
  private static readonly OPEN_TAG = "<think>";
  private static readonly CLOSE_TAG = "</think>";

  private buffer: string = "";
  private hasSkippedThinkBlock: boolean = false;
  /** Number of leading buffer characters already passed to the thinking callback. */
  private forwardedThinkingLength: number = 0;

  /**
   * Process the next chunk of the stream.
   * @param chunk The received text chunk.
   * @param contentOutput Called with text that lies outside the think block.
   * @param thinkingOutput Optional; called with text that belongs to the think block.
   */
  processChunk(
    chunk: string,
    contentOutput: (data: string) => void,
    thinkingOutput?: (data: string) => void
  ): void {
    // Once the think block is behind us, everything is plain content.
    if (this.hasSkippedThinkBlock) {
      contentOutput(chunk);
      return;
    }

    this.buffer += chunk;
    const { OPEN_TAG, CLOSE_TAG } = ThinkTagStreamProcessor;

    if (!this.buffer.startsWith(OPEN_TAG)) {
      // The opening tag may be split across chunks ("<thi" + "nk>"), or the
      // first chunk may be empty. While the buffer is still a prefix of the
      // tag, wait for more data instead of deciding too early.
      if (OPEN_TAG.startsWith(this.buffer)) {
        return;
      }
      // Definitely no think block: release everything held so far as content.
      const pending = this.buffer;
      this.hasSkippedThinkBlock = true;
      this.buffer = "";
      contentOutput(pending);
      return;
    }

    const endTagIndex = this.buffer.indexOf(CLOSE_TAG);
    if (endTagIndex === -1) {
      // Still inside the think block: forward whatever has not been sent yet
      // (normally just this chunk, plus any held tag prefix).
      this.forwardThinking(this.buffer.length, thinkingOutput);
      return;
    }

    // Closing tag found. First forward the thinking text that arrived in the
    // same chunk as the closing tag, then the content that follows the tag.
    this.forwardThinking(endTagIndex, thinkingOutput);
    const contentAfterThink = this.buffer.substring(
      endTagIndex + CLOSE_TAG.length
    );
    this.hasSkippedThinkBlock = true;
    this.buffer = "";
    this.forwardedThinkingLength = 0;
    if (contentAfterThink.length > 0) {
      contentOutput(contentAfterThink);
    }
  }

  /**
   * Reset the processor so it can be reused for a new stream.
   * @param contentOutput Optional; if given, receives text that was being held
   *   back as a possible partial `<think>` tag when the stream ended. Without
   *   it, such held text is discarded.
   */
  end(contentOutput?: (data: string) => void): void {
    const heldPrefix =
      !this.hasSkippedThinkBlock &&
      this.buffer.length > 0 &&
      !this.buffer.startsWith(ThinkTagStreamProcessor.OPEN_TAG);
    if (heldPrefix && contentOutput) {
      contentOutput(this.buffer);
    }
    this.buffer = "";
    this.hasSkippedThinkBlock = false;
    this.forwardedThinkingLength = 0;
  }

  /** Forward buffer text in [forwardedThinkingLength, upTo) to the thinking callback. */
  private forwardThinking(
    upTo: number,
    thinkingOutput?: (data: string) => void
  ): void {
    if (upTo <= this.forwardedThinkingLength) {
      return;
    }
    const unsent = this.buffer.substring(this.forwardedThinkingLength, upTo);
    this.forwardedThinkingLength = upTo;
    if (thinkingOutput) thinkingOutput(unsent);
  }
}