Spaces:
Running
Running
File size: 7,621 Bytes
10d1fd4 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 | import { describe, expect, it } from "vitest";
import { sanitizeUnicodeSurrogates } from "./rerankerService";
/**
 * Specification for `sanitizeUnicodeSurrogates`.
 *
 * Contract pinned by these tests:
 *  - well-formed input (empty string, ASCII, BMP text, correctly paired
 *    surrogates) is returned unchanged;
 *  - every unpaired UTF-16 surrogate code unit (high 0xD800-0xDBFF or low
 *    0xDC00-0xDFFF) is replaced by exactly one U+FFFD, and the surrounding
 *    text is left untouched.
 *
 * All non-ASCII fixtures are written as explicit escape sequences so the
 * suite does not depend on how this file is encoded or re-encoded: a
 * mis-decoded emoji literal silently degrades into BMP characters and the
 * "valid surrogate pair" tests would then stop exercising surrogate pairs.
 */
describe("sanitizeUnicodeSurrogates", () => {
  // Well-formed astral code points, spelled as high+low surrogate escapes.
  const grinningFace = "\uD83D\uDE00"; // U+1F600
  const partyPopper = "\uD83C\uDF89"; // U+1F389
  const rocket = "\uD83D\uDE80"; // U+1F680
  // Three CJK ideographs from the BMP (U+65E5 U+672C U+8A9E).
  const japaneseText = "\u65e5\u672c\u8a9e";

  describe("valid input passthrough", () => {
    it("should return empty string unchanged", () => {
      expect(sanitizeUnicodeSurrogates("")).toBe("");
    });

    it("should return ASCII text unchanged", () => {
      const input = "Hello, World! 123";
      expect(sanitizeUnicodeSurrogates(input)).toBe(input);
    });

    it("should return valid Unicode text unchanged", () => {
      // Latin-1 accents, CJK text and one astral character in a single string.
      const input = `H\u00e9llo W\u00f6rld ${japaneseText} ${partyPopper}`;
      expect(sanitizeUnicodeSurrogates(input)).toBe(input);
    });

    it("should preserve valid surrogate pairs (emoji)", () => {
      const input = `Text with emoji ${grinningFace}${partyPopper}${rocket}`;
      expect(sanitizeUnicodeSurrogates(input)).toBe(input);
    });

    it("should preserve valid surrogate pairs in complex text", () => {
      const input = `Start ${partyPopper} middle ${rocket} end`;
      expect(sanitizeUnicodeSurrogates(input)).toBe(input);
    });
  });

  describe("unpaired high surrogate handling", () => {
    it("should replace lone high surrogate at end of string", () => {
      const highSurrogate = String.fromCharCode(0xd800);
      const input = `text${highSurrogate}`;
      expect(sanitizeUnicodeSurrogates(input)).toBe("text\ufffd");
    });

    it("should replace high surrogate followed by non-surrogate", () => {
      const highSurrogate = String.fromCharCode(0xd800);
      const input = `${highSurrogate}A`;
      expect(sanitizeUnicodeSurrogates(input)).toBe("\ufffdA");
    });

    it("should replace high surrogate followed by another high surrogate", () => {
      const high1 = String.fromCharCode(0xd800);
      const high2 = String.fromCharCode(0xd801);
      const input = `${high1}${high2}`;
      expect(sanitizeUnicodeSurrogates(input)).toBe("\ufffd\ufffd");
    });

    it("should replace multiple consecutive unpaired high surrogates", () => {
      const high = String.fromCharCode(0xd800);
      const input = `${high}${high}${high}`;
      expect(sanitizeUnicodeSurrogates(input)).toBe("\ufffd\ufffd\ufffd");
    });
  });

  describe("unpaired low surrogate handling", () => {
    it("should replace lone low surrogate at start of string", () => {
      const lowSurrogate = String.fromCharCode(0xdc00);
      const input = `${lowSurrogate}text`;
      expect(sanitizeUnicodeSurrogates(input)).toBe("\ufffdtext");
    });

    it("should replace lone low surrogate in middle of string", () => {
      const lowSurrogate = String.fromCharCode(0xdc00);
      const input = `before${lowSurrogate}after`;
      expect(sanitizeUnicodeSurrogates(input)).toBe("before\ufffdafter");
    });

    it("should replace multiple consecutive unpaired low surrogates", () => {
      const low = String.fromCharCode(0xdc00);
      const input = `${low}${low}`;
      expect(sanitizeUnicodeSurrogates(input)).toBe("\ufffd\ufffd");
    });
  });

  describe("mixed surrogate scenarios", () => {
    it("should handle low surrogate followed by high surrogate (reversed pair)", () => {
      // Low-then-high is not a pair: both code units are orphans.
      const low = String.fromCharCode(0xdc00);
      const high = String.fromCharCode(0xd800);
      const input = `${low}${high}`;
      expect(sanitizeUnicodeSurrogates(input)).toBe("\ufffd\ufffd");
    });

    it("should handle valid pair followed by unpaired high", () => {
      const validEmoji = grinningFace;
      const unpairedHigh = String.fromCharCode(0xd83d);
      const input = `${validEmoji}${unpairedHigh}`;
      expect(sanitizeUnicodeSurrogates(input)).toBe(`${validEmoji}\ufffd`);
    });

    it("should handle unpaired low followed by valid pair", () => {
      const unpairedLow = String.fromCharCode(0xdc00);
      const validEmoji = grinningFace;
      const input = `${unpairedLow}${validEmoji}`;
      expect(sanitizeUnicodeSurrogates(input)).toBe(`\ufffd${validEmoji}`);
    });

    it("should handle interleaved valid and invalid surrogates", () => {
      const high = String.fromCharCode(0xd800);
      const low = String.fromCharCode(0xdc00);
      // The "B" between them keeps the high and low from forming a pair.
      const input = `A${high}B${low}C`;
      expect(sanitizeUnicodeSurrogates(input)).toBe("A\ufffdB\ufffdC");
    });
  });

  describe("edge cases from real-world scenarios", () => {
    it("should handle text that might come from corrupted web content", () => {
      const corruptedChar = String.fromCharCode(0xd834);
      const input = `Search result: ${corruptedChar} more text`;
      expect(sanitizeUnicodeSurrogates(input)).toBe(
        "Search result: \ufffd more text",
      );
    });

    it("should preserve valid content around invalid surrogates", () => {
      const badHigh = String.fromCharCode(0xd83d);
      // badHigh is followed by a space, so it cannot pair with the later emoji.
      const input = `Valid text ${japaneseText} ${badHigh} more valid ${partyPopper} end`;
      expect(sanitizeUnicodeSurrogates(input)).toBe(
        `Valid text ${japaneseText} \ufffd more valid ${partyPopper} end`,
      );
    });

    it("should handle boundary surrogate values", () => {
      // First and last code units of the high and low surrogate ranges.
      const minHigh = String.fromCharCode(0xd800);
      const maxHigh = String.fromCharCode(0xdbff);
      const minLow = String.fromCharCode(0xdc00);
      const maxLow = String.fromCharCode(0xdfff);

      // Alone, each boundary value is an orphan.
      expect(sanitizeUnicodeSurrogates(minHigh)).toBe("\ufffd");
      expect(sanitizeUnicodeSurrogates(maxHigh)).toBe("\ufffd");
      expect(sanitizeUnicodeSurrogates(minLow)).toBe("\ufffd");
      expect(sanitizeUnicodeSurrogates(maxLow)).toBe("\ufffd");

      // Paired high+low, the same boundary values are well-formed.
      expect(sanitizeUnicodeSurrogates(`${minHigh}${minLow}`)).toBe(
        `${minHigh}${minLow}`,
      );
      expect(sanitizeUnicodeSurrogates(`${maxHigh}${maxLow}`)).toBe(
        `${maxHigh}${maxLow}`,
      );
    });

    it("should handle long strings with scattered invalid surrogates", () => {
      const unpairedHigh = String.fromCharCode(0xd800);
      const unpairedLow = String.fromCharCode(0xdc00);
      const chunks = [
        "Start of document.",
        unpairedHigh,
        " Some middle content.",
        unpairedLow,
        " More content here.",
        unpairedHigh,
        " End of document.",
      ];
      const input = chunks.join("");
      const expected =
        "Start of document.\ufffd Some middle content.\ufffd More content here.\ufffd End of document.";
      expect(sanitizeUnicodeSurrogates(input)).toBe(expected);
    });

    it("should preserve adjacent high+low as valid pair even in mixed context", () => {
      const high = String.fromCharCode(0xd800);
      const low = String.fromCharCode(0xdc00);
      const validPair = `${high}${low}`;
      // Same high code unit twice: once orphaned, once inside a real pair.
      const input = `Text ${high} orphan, then valid pair: ${validPair} end`;
      expect(sanitizeUnicodeSurrogates(input)).toBe(
        `Text \ufffd orphan, then valid pair: ${validPair} end`,
      );
    });
  });

  describe("literal syntax and complex sequences", () => {
    it("should handle mixed valid and invalid surrogates using literals", () => {
      const input = "A\uD800B\uD83D\uDE00C\uDC00D";
      expect(sanitizeUnicodeSurrogates(input)).toBe(
        "A\uFFFDB\uD83D\uDE00C\uFFFDD",
      );
    });

    it("should handle surrogate pair followed by lone high surrogate", () => {
      const input = "\uD83D\uDE00\uD800";
      expect(sanitizeUnicodeSurrogates(input)).toBe("\uD83D\uDE00\uFFFD");
    });

    it("should handle lone high surrogate followed by valid surrogate pair", () => {
      // D801 is orphaned because the next unit is another high surrogate,
      // which itself pairs with the following DC00.
      const input = "\uD801\uD800\uDC00";
      expect(sanitizeUnicodeSurrogates(input)).toBe("\uFFFD\uD800\uDC00");
    });

    it("should handle multiple lone surrogates in a row", () => {
      // NOTE(review): despite the title, this input is one valid pair
      // (D800 DC00) followed by a single lone high surrogate (D801).
      const input = "\uD800\uDC00\uD801";
      expect(sanitizeUnicodeSurrogates(input)).toBe("\uD800\uDC00\uFFFD");
    });
  });
});
|