File size: 4,918 Bytes

676fc08

// Default upper bound (in UTF-16 code units) for a single chunk; tune as needed.
const MAX_TEXT_CHUNK_LENGTH = 2000;

/**
 * Split text into chunks of at most `maxLength` UTF-16 code units.
 *
 * The text is cut on line breaks ("\n") and consecutive lines are packed into
 * the same chunk (re-joined with "\n") for as long as they fit. A single line
 * that is longer than the limit is hard-split into limit-sized pieces, so no
 * chunk exceeds the limit. The only exception is a limit of 1 combined with a
 * surrogate pair: the pair is kept intact and that piece is 2 units long.
 *
 * Empty chunks are never emitted, so blank lines that fall on a chunk boundary
 * (including leading blank lines) are dropped.
 *
 * @param text The text to split. Defaults to an empty string.
 * @param maxLength Maximum chunk length. Fractions are floored, values below 1
 *   are treated as 1, and NaN falls back to the default limit.
 * @returns The chunks in their original order; an empty array for empty text.
 */
export function splitText(
  text: string = "",
  maxLength: number = MAX_TEXT_CHUNK_LENGTH
): string[] {
  // Sanitise the limit so the hard-split loop below always makes progress.
  const limit = Number.isNaN(maxLength)
    ? MAX_TEXT_CHUNK_LENGTH
    : Math.max(1, Math.floor(maxLength));

  const chunks: string[] = [];
  let currentChunk = "";

  for (const paragraph of text.split("\n")) {
    // A line break is only needed when the chunk already holds text.
    const separator = currentChunk.length > 0 ? "\n" : "";

    if (currentChunk.length + separator.length + paragraph.length <= limit) {
      currentChunk += separator + paragraph;
      continue;
    }

    // The paragraph does not fit: close the chunk that is being built.
    if (currentChunk.length > 0) {
      chunks.push(currentChunk);
    }

    // Hard-split a paragraph that is longer than the limit on its own.
    let rest = paragraph;
    while (rest.length > limit) {
      let cut = limit;
      const before = rest.charCodeAt(cut - 1);
      const after = rest.charCodeAt(cut);
      const splitsSurrogatePair =
        before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
      if (splitsSurrogatePair) {
        // Never cut a surrogate pair in half: back off by one unit, or take
        // the whole pair when backing off would yield an empty piece.
        cut = cut > 1 ? cut - 1 : cut + 1;
      }
      chunks.push(rest.slice(0, cut));
      rest = rest.slice(cut);
    }

    // The remainder (at most `limit` long) starts the next chunk.
    currentChunk = rest;
  }

  if (currentChunk.length > 0) {
    chunks.push(currentChunk);
  }

  return chunks;
}

/**
 * Strip a Markdown code fence (or a bare "json" language tag) wrapped around a
 * JSON payload.
 *
 * After trimming, at most one opening marker is removed from the start and a
 * closing fence is removed from the end if present. The result is trimmed
 * again before it is returned.
 *
 * @param text Raw text, possibly wrapped in a Markdown fence.
 * @returns The text without the surrounding fence markers.
 */
export function removeJsonMarkdown(text: string) {
  // Ordered by priority: the first marker that matches wins.
  const openingMarkers = ["```json", "json", "```"];
  const closingFence = "```";

  let body = text.trim();

  const opener = openingMarkers.find((marker) => body.startsWith(marker));
  if (opener !== undefined) {
    body = body.slice(opener.length);
  }

  if (body.endsWith(closingFence)) {
    body = body.slice(0, body.length - closingFence.length);
  }

  return body.trim();
}

/**
 * Check whether a text contains anything that looks like an XML or HTML tag.
 *
 * Recognised structures:
 * - Regular tags (such as <p>, <div>)
 * - Tags with attributes (such as <a href="...">)
 * - Self-closing tags (such as <img />, <br>)
 * - Closing tags (such as </p>)
 * - XML/HTML comments (such as <!-- ... -->), including multi-line ones
 * - XML processing instructions (such as <?xml ... ?>), including multi-line ones
 * - CDATA sections (such as <![CDATA[ ... ]]>), including multi-line ones
 * - DOCTYPE and other <!NAME ...> declarations (such as <!DOCTYPE html>)
 *
 * Note: this is a fast pattern-based detection, not a parser. It may report
 * text that merely resembles a tag; strict validation requires a full parser.
 *
 * @param text The text to inspect.
 * @returns true if the text contains any structure that looks like an
 *   XML/HTML tag, otherwise false.
 */
export function containsXmlHtmlTags(text: string): boolean {
  // Runtime guard for untyped callers: non-strings and empty strings have no tags.
  if (typeof text !== "string" || text.length === 0) {
    return false;
  }

  // Alternatives, in order:
  // 1. <!--[\s\S]*?-->        comment
  // 2. <!\[CDATA\[[\s\S]*?]]> CDATA section
  // 3. <!DOCTYPE[^>]*?>       DOCTYPE declaration
  // 4. <\?[\s\S]*?\?>         processing instruction (e.g. <?xml ... ?>)
  // 5. <[!\/]?[a-zA-Z][^>]*?> opening, closing, self-closing tags and
  //                           <!NAME ...> declarations; the name must start
  //                           with a letter, so "a < b" is not a match
  //
  // [\s\S] is used instead of "." because "." does not match line breaks,
  // which would make comments, CDATA sections and processing instructions
  // that span several lines go undetected.
  // The "i" flag makes the literal DOCTYPE keyword case-insensitive.
  const xmlHtmlTagRegex =
    /(<!--[\s\S]*?-->|<!\[CDATA\[[\s\S]*?]]>|<!DOCTYPE[^>]*?>|<\?[\s\S]*?\?>|<[!\/]?[a-zA-Z][^>]*?>)/i;

  return xmlHtmlTagRegex.test(text);
}

/**
 * Separates a leading `<think>...</think>` block from the rest of a text
 * stream that arrives in arbitrary chunks.
 *
 * Only a think block at the very start of the stream is recognised. Text of
 * that block is forwarded to the thinking callback, everything after the
 * closing tag (or the whole stream when it does not start with `<think>`) is
 * forwarded to the content callback. Tags may be split across chunk
 * boundaries.
 */
export class ThinkTagStreamProcessor {
  private static readonly START_TAG = "<think>";
  private static readonly END_TAG = "</think>";

  // Text received but not yet routed: either a possible partial opening tag
  // or a possible partial closing tag at the end of the thinking text.
  private buffer: string = "";
  // True once the opening tag was seen and the closing tag is still pending.
  private insideThinkBlock: boolean = false;
  // True once the stream is known to be plain content from here on.
  private hasSkippedThinkBlock: boolean = false;
  // Callbacks of the latest processChunk call, used by end() to flush the buffer.
  private lastContentOutput?: (data: string) => void;
  private lastThinkingOutput?: (data: string) => void;

  /**
   * Length of the longest suffix of `text` that is a proper prefix of the
   * closing tag, i.e. text that might turn into `</think>` with the next chunk.
   */
  private static partialEndTagLength(text: string): number {
    const endTag = ThinkTagStreamProcessor.END_TAG;
    const longest = Math.min(endTag.length - 1, text.length);
    for (let length = longest; length > 0; length--) {
      if (text.endsWith(endTag.slice(0, length))) {
        return length;
      }
    }
    return 0;
  }

  /**
   * Process the next chunk of the stream.
   *
   * @param chunk The received text chunk.
   * @param contentOutput Called with text that is not part of the think block.
   * @param thinkingOutput Optional. Called with the raw text of the think
   *   block as it arrives. The opening `<think>` tag is included (as in
   *   earlier versions of this class); the closing tag is never forwarded.
   */
  processChunk(
    chunk: string,
    contentOutput: (data: string) => void,
    thinkingOutput?: (data: string) => void
  ): void {
    // Once the think block is behind us, everything is plain content.
    if (this.hasSkippedThinkBlock) {
      contentOutput(chunk);
      return;
    }

    this.buffer += chunk;
    this.lastContentOutput = contentOutput;
    this.lastThinkingOutput = thinkingOutput;

    if (!this.insideThinkBlock) {
      const startTag = ThinkTagStreamProcessor.START_TAG;

      if (this.buffer.startsWith(startTag)) {
        this.insideThinkBlock = true;
      } else if (startTag.startsWith(this.buffer)) {
        // So far the stream is only a (possibly empty) beginning of the
        // opening tag, e.g. "<th": wait for more data before deciding.
        return;
      } else {
        // The stream does not start with a think block: release everything
        // buffered so far as content.
        const pendingContent = this.buffer;
        this.markThinkBlockSkipped();
        contentOutput(pendingContent);
        return;
      }
    }

    const endTag = ThinkTagStreamProcessor.END_TAG;
    const endTagIndex = this.buffer.indexOf(endTag);

    if (endTagIndex !== -1) {
      const thinkingTail = this.buffer.substring(0, endTagIndex);
      const contentAfterThink = this.buffer.substring(
        endTagIndex + endTag.length
      );
      this.markThinkBlockSkipped();

      // Forward the thinking text that arrived together with the closing tag.
      if (thinkingOutput && thinkingTail.length > 0) {
        thinkingOutput(thinkingTail);
      }
      // Output the content after </think>
      if (contentAfterThink.length > 0) {
        contentOutput(contentAfterThink);
      }
      return;
    }

    // No closing tag yet: forward the thinking text, but hold back a trailing
    // fragment that could be the start of "</think>" split across chunks.
    const heldBack = ThinkTagStreamProcessor.partialEndTagLength(this.buffer);
    const thinkingText = this.buffer.substring(
      0,
      this.buffer.length - heldBack
    );
    this.buffer = this.buffer.substring(thinkingText.length);

    if (thinkingOutput && thinkingText.length > 0) {
      thinkingOutput(thinkingText);
    }
  }

  /**
   * Signal the end of the stream and reset the processor for reuse.
   *
   * Text that was held back while waiting for a possible tag is flushed
   * through the callbacks of the last processChunk call so it is not lost.
   */
  end(): void {
    if (this.buffer.length > 0) {
      if (this.insideThinkBlock) {
        if (this.lastThinkingOutput) this.lastThinkingOutput(this.buffer);
      } else if (this.lastContentOutput) {
        this.lastContentOutput(this.buffer);
      }
    }

    this.buffer = "";
    this.insideThinkBlock = false;
    this.hasSkippedThinkBlock = false;
    this.lastContentOutput = undefined;
    this.lastThinkingOutput = undefined;
  }

  /** Switch to pass-through mode and drop state that is no longer needed. */
  private markThinkBlockSkipped(): void {
    this.hasSkippedThinkBlock = true;
    this.insideThinkBlock = false;
    this.buffer = "";
    this.lastContentOutput = undefined;
    this.lastThinkingOutput = undefined;
  }
}