File size: 4,918 Bytes
676fc08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
/** Default upper bound, in UTF-16 code units, for a chunk produced by {@link splitText}. Adjust as needed. */
const MAX_TEXT_CHUNK_LENGTH = 2000;

/**
 * Split text into chunks of at most `maxLength` UTF-16 code units.
 *
 * The text is split on "\n"; consecutive lines are packed into the same chunk
 * (re-joined with "\n") while they fit. A single line that is longer than
 * `maxLength` is hard-split into `maxLength`-sized pieces so that no chunk
 * exceeds the limit. Empty chunks are never emitted.
 *
 * @param text The text to split. Defaults to the empty string.
 * @param maxLength Maximum chunk length. If it is not a finite number >= 1,
 *   oversized lines cannot be hard-split and each non-empty line becomes its
 *   own chunk.
 * @returns The chunks, in input order.
 */
export function splitText(
  text: string = "",
  maxLength: number = MAX_TEXT_CHUNK_LENGTH
): string[] {
  // Hard-splitting needs a usable positive integer step; otherwise skip it.
  const canHardSplit = Number.isFinite(maxLength) && maxLength >= 1;
  const hardLimit = canHardSplit ? Math.floor(maxLength) : 0;

  const chunks: string[] = [];
  let currentChunk = "";

  const flush = (): void => {
    if (currentChunk.length > 0) {
      chunks.push(currentChunk);
      currentChunk = "";
    }
  };

  for (const paragraph of text.split("\n")) {
    // +1 accounts for the newline that joins the paragraph to the chunk.
    if (currentChunk.length + paragraph.length + 1 <= maxLength) {
      currentChunk += (currentChunk.length > 0 ? "\n" : "") + paragraph;
      continue;
    }

    flush();

    if (!canHardSplit || paragraph.length <= hardLimit) {
      currentChunk = paragraph;
      continue;
    }

    // The paragraph alone exceeds the limit: cut it into limit-sized pieces.
    let offset = 0;
    while (paragraph.length - offset > hardLimit) {
      let end = offset + hardLimit;
      const lastUnit = paragraph.charCodeAt(end - 1);
      // Do not cut between the two halves of a surrogate pair.
      if (hardLimit > 1 && lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
        end -= 1;
      }
      chunks.push(paragraph.slice(offset, end));
      offset = end;
    }
    // The remainder is non-empty and fits; later lines may still join it.
    currentChunk = paragraph.slice(offset);
  }

  flush();
  return chunks;
}

/**
 * Strip a Markdown code-fence wrapper from around a JSON payload.
 *
 * After trimming, at most one leading marker is removed ("```json", a bare
 * "json", or "```" — checked in that order), then one trailing "```" if
 * present. The result is trimmed again.
 *
 * @param text Raw text that may be wrapped in a Markdown code fence.
 * @returns The text without the fence markers and surrounding whitespace.
 */
export function removeJsonMarkdown(text: string) {
  const closingFence = "```";
  // Order matters: the longest, most specific marker must be tried first.
  const openingMarkers = ["```json", "json", "```"];

  let body = text.trim();

  const matchedMarker = openingMarkers.find((marker) =>
    body.startsWith(marker)
  );
  if (matchedMarker !== undefined) {
    body = body.slice(matchedMarker.length);
  }

  if (body.endsWith(closingFence)) {
    body = body.slice(0, -closingFence.length);
  }

  return body.trim();
}

/**
 * Check if a text contains XML or HTML tags.
 * Consider various scenarios, including:
 * - Regular tags (such as <p>, <div>)
 * - Tags with attributes (such as <a href="...">)
 * - Self-closing tags (such as <img />, <br>)
 * - Closing tags (such as </p>)
 * - XML/HTML comments (such as <!-- ... -->), including multi-line ones
 * - XML processing instructions (such as <?xml ... ?>), including multi-line ones
 * - CDATA sections (such as <![CDATA[ ... ]]>), including multi-line ones
 * - DOCTYPE declarations (such as <!DOCTYPE html>)
 *
 * Note: This method is a fast detection based on pattern matching, not a complete parser.
 * It may misjudge some non-tag but similarly structured text as tags, but it is sufficient in most detection scenarios.
 * Strict validation requires a full parser.
 *
 * @param text The text to be detected
 * @returns Returns true if the text contains any structure that looks like an XML/HTML tag, otherwise returns false.
 */
export function containsXmlHtmlTags(text: string): boolean {
  // Runtime guard: callers from untyped code may pass a non-string value.
  if (typeof text !== "string" || text.length === 0) {
    return false;
  }

  // Alternatives, in order:
  // 1. <!--[\s\S]*?-->          : HTML/XML comments (non-greedy)
  // 2. <!\[CDATA\[[\s\S]*?\]\]> : CDATA sections (non-greedy)
  // 3. <!DOCTYPE[^>]*>          : DOCTYPE declarations
  // 4. <\?[\s\S]*?\?>           : XML processing instructions (e.g. <?xml ... ?>)
  // 5. <[!\/]?[a-zA-Z][^>]*>    : normal, attributed, self-closing and closing tags, and <!ELEMENT> etc.
  //
  // [\s\S] is used instead of '.' because '.' does not match line breaks, and
  // comments, CDATA sections and processing instructions may span several lines.
  // The 'i' flag makes matching case-insensitive (e.g. <!doctype html>).
  const xmlHtmlTagRegex =
    /<!--[\s\S]*?-->|<!\[CDATA\[[\s\S]*?\]\]>|<!DOCTYPE[^>]*>|<\?[\s\S]*?\?>|<[!\/]?[a-zA-Z][^>]*>/i;

  // test() stops at the first match, which is all that is needed here.
  return xmlHtmlTagRegex.test(text);
}

/**
 * Separates a leading `<think>...</think>` block from the rest of a streamed
 * text response.
 *
 * Text that belongs to the think block is forwarded to `thinkingOutput`
 * (verbatim, including the opening `<think>` tag but excluding the closing
 * `</think>` tag); everything after the block, or the whole stream when it
 * does not start with `<think>`, is forwarded to `contentOutput`.
 *
 * Tags may arrive split across chunks: text that could still turn out to be
 * part of a tag is held back until the following chunk resolves it.
 */
export class ThinkTagStreamProcessor {
  private static readonly START_TAG = "<think>";
  private static readonly END_TAG = "</think>";

  /** Text received but not yet forwarded because it may be part of a tag. */
  private buffer: string = "";
  /** True once the stream is known to start with `<think>` and the block is still open. */
  private insideThinkBlock: boolean = false;
  /** True once the think block has ended, or the stream is known not to start with one. */
  private hasSkippedThinkBlock: boolean = false;

  /**
   * Process the received text block.
   * @param chunk The received text block.
   * @param contentOutput Called with non-thinking content to be output.
   * @param thinkingOutput Optional. Called with text that belongs to the think block.
   */
  processChunk(
    chunk: string,
    contentOutput: (data: string) => void,
    thinkingOutput?: (data: string) => void
  ): void {
    // Once the think block is behind us, all new data is output directly.
    if (this.hasSkippedThinkBlock) {
      contentOutput(chunk);
      return;
    }

    const { START_TAG, END_TAG } = ThinkTagStreamProcessor;
    this.buffer += chunk;

    if (!this.insideThinkBlock) {
      if (this.buffer.startsWith(START_TAG)) {
        this.insideThinkBlock = true;
      } else if (START_TAG.startsWith(this.buffer)) {
        // Nothing yet, or only a prefix of "<think>": wait for more data
        // before deciding whether the stream opens with a think block.
        return;
      } else {
        // Definitely not a think block: release everything held so far.
        const pending = this.buffer;
        this.buffer = "";
        this.hasSkippedThinkBlock = true;
        contentOutput(pending);
        return;
      }
    }

    const endTagIndex = this.buffer.indexOf(END_TAG);
    if (endTagIndex !== -1) {
      const thinkingTail = this.buffer.slice(0, endTagIndex);
      const contentAfterThink = this.buffer.slice(endTagIndex + END_TAG.length);

      this.buffer = "";
      this.insideThinkBlock = false;
      this.hasSkippedThinkBlock = true;

      // Thinking text that arrived in the same chunk as </think>.
      if (thinkingTail.length > 0 && thinkingOutput) {
        thinkingOutput(thinkingTail);
      }
      // Output the content after </think>.
      if (contentAfterThink.length > 0) {
        contentOutput(contentAfterThink);
      }
      return;
    }

    // No complete end tag yet. Hold back the longest buffer suffix that is a
    // proper prefix of "</think>", so a tag split across chunks is neither
    // missed nor leaked into the thinking output.
    let heldBack = 0;
    const maxPartial = Math.min(END_TAG.length - 1, this.buffer.length);
    for (let size = maxPartial; size > 0; size--) {
      if (this.buffer.endsWith(END_TAG.slice(0, size))) {
        heldBack = size;
        break;
      }
    }

    const ready = this.buffer.slice(0, this.buffer.length - heldBack);
    this.buffer = this.buffer.slice(ready.length);
    if (ready.length > 0 && thinkingOutput) {
      thinkingOutput(ready);
    }
  }

  /**
   * Finish the current stream and reset the processor so it can be reused.
   *
   * Text still held back because it looked like the start of a tag is passed
   * to the matching callback when one is provided; without callbacks it is
   * discarded.
   *
   * @param contentOutput Optional. Receives held-back text that never became a `<think>` tag.
   * @param thinkingOutput Optional. Receives held-back text from an unterminated think block.
   */
  end(
    contentOutput?: (data: string) => void,
    thinkingOutput?: (data: string) => void
  ): void {
    const pending = this.buffer;
    const wasThinking = this.insideThinkBlock;

    this.buffer = "";
    this.insideThinkBlock = false;
    this.hasSkippedThinkBlock = false;

    if (pending.length === 0) {
      return;
    }
    if (wasThinking) {
      if (thinkingOutput) thinkingOutput(pending);
    } else if (contentOutput) {
      contentOutput(pending);
    }
  }
}