import { removeDefaultProperty, trimToTokenLimit } from "./llmExtract";
import { encoding_for_model } from "@dqbd/tiktoken";

// Swap the tokenizer module for a mock so every test controls exactly what
// `encoding_for_model` returns (or throws).
jest.mock("@dqbd/tiktoken", () => ({
  encoding_for_model: jest.fn(),
}));

/**
 * Tests for `removeDefaultProperty`: `default` keys are expected to be removed
 * at every nesting level (plain objects, nested objects, objects in arrays),
 * while null and primitive inputs are returned as-is.
 */
describe("removeDefaultProperty", () => {
  it("should remove the default property from a simple object", () => {
    const input = { default: "test", test: "test" };
    const expectedOutput = { test: "test" };
    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
  });

  it("should remove the default property from a nested object", () => {
    const input = {
      default: "test",
      nested: { default: "nestedTest", test: "nestedTest" },
    };
    const expectedOutput = { nested: { test: "nestedTest" } };
    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
  });

  it("should remove the default property from an array of objects", () => {
    const input = {
      array: [
        { default: "test1", test: "test1" },
        { default: "test2", test: "test2" },
      ],
    };
    const expectedOutput = { array: [{ test: "test1" }, { test: "test2" }] };
    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
  });

  it("should handle objects without a default property", () => {
    const input = { test: "test" };
    const expectedOutput = { test: "test" };
    expect(removeDefaultProperty(input)).toEqual(expectedOutput);
  });

  it("should handle null and non-object inputs", () => {
    expect(removeDefaultProperty(null)).toBeNull();
    expect(removeDefaultProperty("string")).toBe("string");
    expect(removeDefaultProperty(123)).toBe(123);
  });
});

/**
 * Tests for `trimToTokenLimit`, driven through a mocked encoder.
 *
 * Each test scripts the token counts by making `encode` return arrays of a
 * chosen length; only the array length matters to these assertions. `free` is
 * tracked so the tests can check that the encoder is released.
 */
describe("trimToTokenLimit", () => {
  const mockEncode = jest.fn();
  const mockFree = jest.fn();
  const mockEncoder = {
    encode: mockEncode,
    free: mockFree,
  };

  beforeEach(() => {
    jest.clearAllMocks();
    // clearAllMocks() only wipes recorded calls/results. Implementations and
    // queued mockReturnValueOnce values survive it, so a throwing `encode`
    // installed by one test would stay active in the following tests. Reset
    // the mocks owned by this suite to give every test a clean slate.
    mockEncode.mockReset();
    mockFree.mockReset();
    (encoding_for_model as jest.Mock).mockReset();
    (encoding_for_model as jest.Mock).mockReturnValue(mockEncoder);
  });

  it("should return original text if within token limit", () => {
    const text = "This is a test text";
    mockEncode.mockReturnValue(new Array(5)); // Simulate 5 tokens

    const result = trimToTokenLimit(text, 10, "gpt-4o");

    expect(result).toEqual({ text, numTokens: 5, warning: undefined });
    expect(mockEncode).toHaveBeenCalledWith(text);
    expect(mockFree).toHaveBeenCalled();
  });

  it("should trim text and return warning when exceeding token limit", () => {
    const text = "This is a longer text that needs to be trimmed";
    mockEncode
      .mockReturnValueOnce(new Array(20)) // First call for full text
      .mockReturnValueOnce(new Array(8)); // Second call for trimmed text

    const result = trimToTokenLimit(text, 10, "gpt-4o");

    expect(result.text.length).toBeLessThan(text.length);
    expect(result.numTokens).toBe(8);
    expect(result.warning).toContain("automatically trimmed");
    expect(mockEncode).toHaveBeenCalledTimes(2);
    expect(mockFree).toHaveBeenCalled();
  });

  it("should append previous warning if provided", () => {
    const text = "This is a test text that is too long";
    const previousWarning = "Previous warning message";
    mockEncode
      .mockReturnValueOnce(new Array(15))
      .mockReturnValueOnce(new Array(8));

    const result = trimToTokenLimit(text, 10, "gpt-4o", previousWarning);

    expect(result.warning).toContain("automatically trimmed");
    expect(result.warning).toContain(previousWarning);
  });

  it("should use fallback approach when encoder throws error", () => {
    const text = "This is some text to test fallback";
    mockEncode.mockImplementation(() => {
      throw new Error("Encoder error");
    });

    const result = trimToTokenLimit(text, 10, "gpt-4o");

    expect(result.text.length).toBeLessThanOrEqual(30); // 10 tokens * 3 chars per token
    expect(result.numTokens).toBe(10);
    expect(result.warning).toContain("Failed to derive number of LLM tokens");
  });

  it("should handle empty text", () => {
    const text = "";
    mockEncode.mockReturnValue([]);

    const result = trimToTokenLimit(text, 10, "gpt-4o");

    expect(result).toEqual({ text: "", numTokens: 0, warning: undefined });
    expect(mockFree).toHaveBeenCalled();
  });

  it("should handle large token limits (128k)", () => {
    const text = "A".repeat(384000); // Assuming ~3 chars per token, this would be ~128k tokens
    mockEncode
      .mockReturnValueOnce(new Array(130000)) // First check shows it's too long
      .mockReturnValueOnce(new Array(127000)); // Second check shows it's within limit after trim

    const result = trimToTokenLimit(text, 128000, "gpt-4o");

    expect(result.text.length).toBeLessThan(text.length);
    expect(result.numTokens).toBe(127000);
    expect(result.warning).toContain("automatically trimmed");
    expect(mockEncode).toHaveBeenCalledTimes(2);
    expect(mockFree).toHaveBeenCalled();
  });

  it("should handle large token limits (512k) with 32k context window", () => {
    const text = "A".repeat(1536000); // Assuming ~3 chars per token, this would be ~512k tokens
    mockEncode
      .mockReturnValueOnce(new Array(520000)) // First check shows it's too long
      .mockReturnValueOnce(new Array(32000)); // Second check shows it's within context limit after trim

    const result = trimToTokenLimit(text, 32000, "gpt-4o");

    expect(result.text.length).toBeLessThan(text.length);
    expect(result.numTokens).toBe(32000);
    expect(result.warning).toContain("automatically trimmed");
    expect(mockEncode).toHaveBeenCalledTimes(2);
    expect(mockFree).toHaveBeenCalled();
  });

  it("should preserve text when under token limit", () => {
    const text = "Short text";
    mockEncode.mockReturnValue(new Array(5)); // 5 tokens

    const result = trimToTokenLimit(text, 10, "gpt-4o");

    expect(result.text).toBe(text);
    expect(result.numTokens).toBe(5);
    expect(result.warning).toBeUndefined();
    expect(mockFree).toHaveBeenCalled();
  });

  it("should append new warning to previous warning", () => {
    const text = "A".repeat(300);
    const previousWarning = "Previous warning message";
    mockEncode
      .mockReturnValueOnce(new Array(100))
      .mockReturnValueOnce(new Array(50));

    const result = trimToTokenLimit(text, 50, "gpt-4o", previousWarning);

    expect(result.warning).toContain("automatically trimmed");
    expect(result.warning).toContain(previousWarning);
    expect(mockFree).toHaveBeenCalled();
  });

  it("should handle encoder initialization failure gracefully", () => {
    const text = "Sample text";
    (encoding_for_model as jest.Mock).mockImplementationOnce(() => {
      throw new Error("Encoder initialization failed");
    });

    const result = trimToTokenLimit(text, 10, "gpt-4o");

    expect(result.text.length).toBeLessThanOrEqual(30); // 10 tokens * 3 chars
    expect(result.warning).toContain("Failed to derive number of LLM tokens");
    // No encoder was ever created, so there is nothing to free.
    expect(mockFree).not.toHaveBeenCalled();
  });

  it("should handle encoding errors during trimming", () => {
    const text = "Sample text";
    mockEncode.mockImplementation(() => {
      throw new Error("Encoding failed");
    });

    const result = trimToTokenLimit(text, 10, "gpt-4o");

    expect(result.text.length).toBeLessThanOrEqual(30);
    expect(result.warning).toContain("Failed to derive number of LLM tokens");
    // The encoder was created before encode() threw, so it must be released.
    expect(mockFree).toHaveBeenCalled();
  });

  it("should handle very small token limits", () => {
    const text = "This is a test sentence that should be trimmed significantly";
    mockEncode
      .mockReturnValueOnce(new Array(20))
      .mockReturnValueOnce(new Array(3));

    const result = trimToTokenLimit(text, 3, "gpt-4o");

    expect(result.text.length).toBeLessThan(text.length);
    expect(result.numTokens).toBe(3);
    expect(result.warning).toContain("automatically trimmed");
    expect(mockFree).toHaveBeenCalled();
  });

  it("should handle unicode characters", () => {
    const text = "Hello 👋 World 🌍";
    mockEncode
      .mockReturnValueOnce(new Array(8))
      .mockReturnValueOnce(new Array(4));

    const result = trimToTokenLimit(text, 4, "gpt-4o");

    expect(result.text.length).toBeLessThan(text.length);
    expect(result.numTokens).toBe(4);
    expect(result.warning).toContain("automatically trimmed");
    expect(mockFree).toHaveBeenCalled();
  });

  it("should handle multiple trimming iterations", () => {
    const text = "A".repeat(1000);
    // One count for the full text, then three successive trim attempts; only
    // the last one fits within the 50-token limit.
    mockEncode
      .mockReturnValueOnce(new Array(300))
      .mockReturnValueOnce(new Array(200))
      .mockReturnValueOnce(new Array(100))
      .mockReturnValueOnce(new Array(50));

    const result = trimToTokenLimit(text, 50, "gpt-4o");

    expect(result.text.length).toBeLessThan(text.length);
    expect(result.numTokens).toBe(50);
    expect(result.warning).toContain("automatically trimmed");
    expect(mockEncode).toHaveBeenCalledTimes(4);
    expect(mockFree).toHaveBeenCalled();
  });

  it("should handle exact token limit match", () => {
    const text = "Exact token limit text";
    mockEncode.mockReturnValue(new Array(10));

    const result = trimToTokenLimit(text, 10, "gpt-4o");

    expect(result.text).toBe(text);
    expect(result.numTokens).toBe(10);
    expect(result.warning).toBeUndefined();
    expect(mockFree).toHaveBeenCalled();
  });
});