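// NOTE(review): the following appears to be page-header text captured with the
// file rather than part of the module; kept as a comment for provenance.
// Spaces: Felladrin / MiniSearch · Running · File size: 7,621 Bytes · 10d1fd4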

import { describe, expect, it } from "vitest";
import { sanitizeUnicodeSurrogates } from "./rerankerService";

describe("sanitizeUnicodeSurrogates", () => {
  // Well-formed input (no lone surrogates) is expected to come back exactly
  // as it went in, so every case here shares a single assertion shape.
  describe("valid input passthrough", () => {
    // [test title, text that must be returned unchanged]
    const wellFormedCases = [
      ["should return empty string unchanged", ""],
      ["should return ASCII text unchanged", "Hello, World! 123"],
      ["should return valid Unicode text unchanged", "Héllo Wörld 日本語 🎉"],
      [
        "should preserve valid surrogate pairs (emoji)",
        "Text with emoji 😀🎊🚀",
      ],
      [
        "should preserve valid surrogate pairs in complex text",
        "Start 🎉 middle 🚀 end",
      ],
    ];

    for (const [title, text] of wellFormedCases) {
      it(title, () => {
        expect(sanitizeUnicodeSurrogates(text)).toBe(text);
      });
    }
  });

  // A high surrogate that is not immediately followed by a low surrogate is
  // expected to be swapped for U+FFFD, one replacement per offending unit.
  describe("unpaired high surrogate handling", () => {
    const firstHigh = String.fromCharCode(0xd800);
    const secondHigh = String.fromCharCode(0xd801);
    const replacement = "\ufffd";

    const scenarios = [
      {
        title: "should replace lone high surrogate at end of string",
        input: `text${firstHigh}`,
        expected: `text${replacement}`,
      },
      {
        title: "should replace high surrogate followed by non-surrogate",
        input: `${firstHigh}A`,
        expected: `${replacement}A`,
      },
      {
        title: "should replace high surrogate followed by another high surrogate",
        input: `${firstHigh}${secondHigh}`,
        expected: replacement.repeat(2),
      },
      {
        title: "should replace multiple consecutive unpaired high surrogates",
        input: firstHigh.repeat(3),
        expected: replacement.repeat(3),
      },
    ];

    scenarios.forEach(({ title, input, expected }) => {
      it(title, () => {
        expect(sanitizeUnicodeSurrogates(input)).toBe(expected);
      });
    });
  });

  // A low surrogate with no high surrogate directly before it is expected to
  // be swapped for U+FFFD, wherever it sits in the string.
  describe("unpaired low surrogate handling", () => {
    // Same code unit as String.fromCharCode(0xdc00), written as an escape.
    const loneLow = "\udc00";

    // Runs the sanitizer on `raw` and checks the result against `cleaned`.
    const expectSanitized = (raw, cleaned) => {
      expect(sanitizeUnicodeSurrogates(raw)).toBe(cleaned);
    };

    it("should replace lone low surrogate at start of string", () => {
      expectSanitized(`${loneLow}text`, "\ufffdtext");
    });

    it("should replace lone low surrogate in middle of string", () => {
      expectSanitized(`before${loneLow}after`, "before\ufffdafter");
    });

    it("should replace multiple consecutive unpaired low surrogates", () => {
      expectSanitized(loneLow + loneLow, "\ufffd\ufffd");
    });
  });

  // Surrogates that sit next to each other (or next to a valid pair) without
  // forming a high-then-low pair themselves.
  describe("mixed surrogate scenarios", () => {
    const high = String.fromCharCode(0xd800);
    const low = String.fromCharCode(0xdc00);

    it("should handle low surrogate followed by high surrogate (reversed pair)", () => {
      // Low-then-high is the wrong order, so both units count as orphans.
      const input = `${low}${high}`;
      expect(sanitizeUnicodeSurrogates(input)).toBe("\ufffd\ufffd");
    });

    it("should handle valid pair followed by unpaired high", () => {
      const validEmoji = "😀";
      const unpairedHigh = String.fromCharCode(0xd83d);
      const input = `${validEmoji}${unpairedHigh}`;
      expect(sanitizeUnicodeSurrogates(input)).toBe("😀\ufffd");
    });

    it("should handle unpaired low followed by valid pair", () => {
      const validEmoji = "🎉";
      const input = `${low}${validEmoji}`;
      expect(sanitizeUnicodeSurrogates(input)).toBe("\ufffd🎉");
    });

    // Title corrected: the input contains no valid surrogate pair, only lone
    // surrogates separated by ASCII letters.
    it("should handle unpaired surrogates interleaved with ASCII text", () => {
      const input = `A${high}B${low}C`;
      expect(sanitizeUnicodeSurrogates(input)).toBe("A\ufffdB\ufffdC");
    });
  });

  // Longer, document-like inputs and the extreme values of both surrogate
  // ranges.
  describe("edge cases from real-world scenarios", () => {
    // Builds a one-code-unit string from a UTF-16 code unit value.
    const unitOf = (value) => String.fromCharCode(value);

    it("should handle text that might come from corrupted web content", () => {
      const input = ["Search result: ", unitOf(0xd834), " more text"].join("");
      const output = sanitizeUnicodeSurrogates(input);
      expect(output).toBe("Search result: \ufffd more text");
    });

    it("should preserve valid content around invalid surrogates", () => {
      const input = ["Valid text 日本語 ", unitOf(0xd83d), " more valid 🎉 end"].join(
        "",
      );
      const output = sanitizeUnicodeSurrogates(input);
      expect(output).toBe("Valid text 日本語 \ufffd more valid 🎉 end");
    });

    it("should handle boundary surrogate values", () => {
      // Order: min high, max high, min low, max low.
      const boundaryUnits = [0xd800, 0xdbff, 0xdc00, 0xdfff].map(unitOf);

      // On its own, each boundary unit is an orphan.
      for (const unit of boundaryUnits) {
        expect(sanitizeUnicodeSurrogates(unit)).toBe("\ufffd");
      }

      // Min-with-min and max-with-max are valid pairs and must survive.
      const [minHigh, maxHigh, minLow, maxLow] = boundaryUnits;
      for (const pair of [minHigh + minLow, maxHigh + maxLow]) {
        expect(sanitizeUnicodeSurrogates(pair)).toBe(pair);
      }
    });

    it("should handle long strings with scattered invalid surrogates", () => {
      const strayHigh = unitOf(0xd800);
      const strayLow = unitOf(0xdc00);
      const input = `Start of document.${strayHigh} Some middle content.${strayLow} More content here.${strayHigh} End of document.`;
      const output = sanitizeUnicodeSurrogates(input);
      expect(output).toBe(
        "Start of document.\ufffd Some middle content.\ufffd More content here.\ufffd End of document.",
      );
    });

    it("should preserve adjacent high+low as valid pair even in mixed context", () => {
      const orphan = unitOf(0xd800);
      const pair = orphan + unitOf(0xdc00);
      const prefix = "Text ";
      const middle = " orphan, then valid pair: ";
      const suffix = " end";
      const output = sanitizeUnicodeSurrogates(
        prefix + orphan + middle + pair + suffix,
      );
      expect(output).toBe(prefix + "\ufffd" + middle + pair + suffix);
    });
  });

  // Same behaviours as the other groups, but with inputs written as \uXXXX
  // escape literals rather than built with String.fromCharCode.
  describe("literal syntax and complex sequences", () => {
    it("should handle mixed valid and invalid surrogates using literals", () => {
      // \uD83D\uDE00 is a valid pair; \uD800 and \uDC00 are separated by other
      // characters and therefore both unpaired.
      const input = "A\uD800B\uD83D\uDE00C\uDC00D";
      expect(sanitizeUnicodeSurrogates(input)).toBe(
        "A\uFFFDB\uD83D\uDE00C\uFFFDD",
      );
    });

    it("should handle surrogate pair followed by lone high surrogate", () => {
      const input = "😀\uD800";
      expect(sanitizeUnicodeSurrogates(input)).toBe("😀\uFFFD");
    });

    it("should handle lone high surrogate followed by valid surrogate pair", () => {
      // \uD801 is followed by another high surrogate, so only \uD800\uDC00
      // forms a pair.
      const input = "\uD801\uD800\uDC00";
      expect(sanitizeUnicodeSurrogates(input)).toBe("\uFFFD\uD800\uDC00");
    });

    // Previously titled "multiple lone surrogates in a row", which did not
    // match this input: it is one valid pair followed by a single lone high.
    it("should handle escaped surrogate pair followed by lone high surrogate", () => {
      const input = "\uD800\uDC00\uD801";
      expect(sanitizeUnicodeSurrogates(input)).toBe("\uD800\uDC00\uFFFD");
    });

    it("should handle multiple lone surrogates in a row", () => {
      // low, high, high: no high is directly followed by a low, so all three
      // units are unpaired and each is expected to be replaced.
      const input = "\uDC00\uD800\uD801";
      expect(sanitizeUnicodeSurrogates(input)).toBe("\uFFFD\uFFFD\uFFFD");
    });
  });
});