Spaces:
Paused
Paused
| import { removeDefaultProperty } from "./llmExtract"; | |
| import { trimToTokenLimit } from "./llmExtract"; | |
| import { encoding_for_model } from "@dqbd/tiktoken"; | |
// Swap the `@dqbd/tiktoken` module for a factory-built stub whose only
// member, `encoding_for_model`, is a bare jest mock function.
jest.mock("@dqbd/tiktoken", () => {
  return { encoding_for_model: jest.fn() };
});
/**
 * Specs for `removeDefaultProperty`.
 *
 * Each case passes a value in and compares the result against a literal
 * expectation: flat objects, nested objects, arrays of objects, objects that
 * have no `default` key, and null / primitive inputs.
 */
describe("removeDefaultProperty", () => {
  it("should remove the default property from a simple object", () => {
    const given = { default: "test", test: "test" };
    const wanted = { test: "test" };

    const actual = removeDefaultProperty(given);

    expect(actual).toEqual(wanted);
  });

  it("should remove the default property from a nested object", () => {
    const inner = { default: "nestedTest", test: "nestedTest" };
    const given = { default: "test", nested: inner };
    const wanted = { nested: { test: "nestedTest" } };

    const actual = removeDefaultProperty(given);

    expect(actual).toEqual(wanted);
  });

  it("should remove the default property from an array of objects", () => {
    const first = { default: "test1", test: "test1" };
    const second = { default: "test2", test: "test2" };
    const given = { array: [first, second] };
    const wanted = { array: [{ test: "test1" }, { test: "test2" }] };

    const actual = removeDefaultProperty(given);

    expect(actual).toEqual(wanted);
  });

  it("should handle objects without a default property", () => {
    const given = { test: "test" };
    const wanted = { test: "test" };

    const actual = removeDefaultProperty(given);

    expect(actual).toEqual(wanted);
  });

  it("should handle null and non-object inputs", () => {
    // null is checked with toBeNull; primitives are compared with toBe.
    const fromNull = removeDefaultProperty(null);
    expect(fromNull).toBeNull();

    const fromString = removeDefaultProperty("string");
    expect(fromString).toBe("string");

    const fromNumber = removeDefaultProperty(123);
    expect(fromNumber).toBe(123);
  });
});
/**
 * Specs for `trimToTokenLimit(text, maxTokens, modelId, previousWarning?)`.
 *
 * The tokenizer is fully mocked: `encoding_for_model` returns `mockEncoder`,
 * and each test scripts what `encode` returns (arrays whose `length` stands in
 * for a token count) or makes it throw. The assertions check the returned
 * `{ text, numTokens, warning }` object plus how often `encode` / `free`
 * were invoked.
 */
describe("trimToTokenLimit", () => {
  const mockEncode = jest.fn();
  const mockFree = jest.fn();
  const mockEncoder = {
    encode: mockEncode,
    free: mockFree,
  };

  beforeEach(() => {
    jest.clearAllMocks();
    // clearAllMocks only wipes recorded calls; implementations and queued
    // `mockReturnValueOnce` values survive it. Reset the two local mocks
    // explicitly so a throwing `encode` from one test cannot leak into the next.
    mockEncode.mockReset();
    mockFree.mockReset();
    (encoding_for_model as jest.Mock).mockReturnValue(mockEncoder);
  });

  it("should return original text if within token limit", () => {
    const text = "This is a test text";
    mockEncode.mockReturnValue(new Array(5)); // Simulate 5 tokens

    const result = trimToTokenLimit(text, 10, "gpt-4o");

    expect(result).toEqual({
      text,
      numTokens: 5,
      warning: undefined,
    });
    expect(mockEncode).toHaveBeenCalledWith(text);
    expect(mockFree).toHaveBeenCalled();
  });

  it("should trim text and return warning when exceeding token limit", () => {
    const text = "This is a longer text that needs to be trimmed";
    mockEncode
      .mockReturnValueOnce(new Array(20)) // First call for full text
      .mockReturnValueOnce(new Array(8)); // Second call for trimmed text

    const result = trimToTokenLimit(text, 10, "gpt-4o");

    expect(result.text.length).toBeLessThan(text.length);
    expect(result.numTokens).toBe(8);
    expect(result.warning).toContain("automatically trimmed");
    expect(mockEncode).toHaveBeenCalledTimes(2);
    expect(mockFree).toHaveBeenCalled();
  });

  it("should append previous warning if provided", () => {
    const text = "This is a test text that is too long";
    const previousWarning = "Previous warning message";
    mockEncode
      .mockReturnValueOnce(new Array(15))
      .mockReturnValueOnce(new Array(8));

    const result = trimToTokenLimit(text, 10, "gpt-4o", previousWarning);

    expect(result.warning).toContain("automatically trimmed");
    expect(result.warning).toContain(previousWarning);
  });

  it("should use fallback approach when encoder throws error", () => {
    const text = "This is some text to test fallback";
    mockEncode.mockImplementation(() => {
      throw new Error("Encoder error");
    });

    const result = trimToTokenLimit(text, 10, "gpt-4o");

    expect(result.text.length).toBeLessThanOrEqual(30); // 10 tokens * 3 chars per token
    expect(result.numTokens).toBe(10);
    expect(result.warning).toContain("Failed to derive number of LLM tokens");
  });

  it("should handle empty text", () => {
    const text = "";
    mockEncode.mockReturnValue([]);

    const result = trimToTokenLimit(text, 10, "gpt-4o");

    expect(result).toEqual({
      text: "",
      numTokens: 0,
      warning: undefined,
    });
    expect(mockFree).toHaveBeenCalled();
  });

  it("should handle large token limits (128k)", () => {
    const text = "A".repeat(384000); // Assuming ~3 chars per token, this would be ~128k tokens
    mockEncode
      .mockReturnValueOnce(new Array(130000)) // First check shows it's too long
      .mockReturnValueOnce(new Array(127000)); // Second check shows it's within limit after trim

    const result = trimToTokenLimit(text, 128000, "gpt-4o");

    expect(result.text.length).toBeLessThan(text.length);
    expect(result.numTokens).toBe(127000);
    expect(result.warning).toContain("automatically trimmed");
    expect(mockEncode).toHaveBeenCalledTimes(2);
    expect(mockFree).toHaveBeenCalled();
  });

  it("should handle large token limits (512k) with 32k context window", () => {
    const text = "A".repeat(1536000); // Assuming ~3 chars per token, this would be ~512k tokens
    mockEncode
      .mockReturnValueOnce(new Array(520000)) // First check shows it's too long
      .mockReturnValueOnce(new Array(32000)); // Second check shows it's within context limit after trim

    const result = trimToTokenLimit(text, 32000, "gpt-4o");

    expect(result.text.length).toBeLessThan(text.length);
    expect(result.numTokens).toBe(32000);
    expect(result.warning).toContain("automatically trimmed");
    expect(mockEncode).toHaveBeenCalledTimes(2);
    expect(mockFree).toHaveBeenCalled();
  });

  it("should preserve text when under token limit", () => {
    const text = "Short text";
    mockEncode.mockReturnValue(new Array(5)); // 5 tokens

    const result = trimToTokenLimit(text, 10, "gpt-4o");

    expect(result.text).toBe(text);
    expect(result.numTokens).toBe(5);
    expect(result.warning).toBeUndefined();
    expect(mockFree).toHaveBeenCalled();
  });

  it("should append new warning to previous warning", () => {
    const text = "A".repeat(300);
    const previousWarning = "Previous warning message";
    mockEncode
      .mockReturnValueOnce(new Array(100))
      .mockReturnValueOnce(new Array(50));

    const result = trimToTokenLimit(text, 50, "gpt-4o", previousWarning);

    expect(result.warning).toContain("automatically trimmed");
    expect(result.warning).toContain(previousWarning);
    expect(mockFree).toHaveBeenCalled();
  });

  it("should handle encoder initialization failure gracefully", () => {
    const text = "Sample text";
    // Fail before an encoder object exists, so there is nothing to free.
    (encoding_for_model as jest.Mock).mockImplementationOnce(() => {
      throw new Error("Encoder initialization failed");
    });

    const result = trimToTokenLimit(text, 10, "gpt-4o");

    expect(result.text.length).toBeLessThanOrEqual(30); // 10 tokens * 3 chars
    expect(result.warning).toContain("Failed to derive number of LLM tokens");
    expect(mockFree).not.toHaveBeenCalled();
  });

  it("should handle encoding errors during trimming", () => {
    const text = "Sample text";
    // The encoder is created successfully but `encode` throws; `free` must
    // still be called.
    mockEncode.mockImplementation(() => {
      throw new Error("Encoding failed");
    });

    const result = trimToTokenLimit(text, 10, "gpt-4o");

    expect(result.text.length).toBeLessThanOrEqual(30);
    expect(result.warning).toContain("Failed to derive number of LLM tokens");
    expect(mockFree).toHaveBeenCalled();
  });

  it("should handle very small token limits", () => {
    const text = "This is a test sentence that should be trimmed significantly";
    mockEncode
      .mockReturnValueOnce(new Array(20))
      .mockReturnValueOnce(new Array(3));

    const result = trimToTokenLimit(text, 3, "gpt-4o");

    expect(result.text.length).toBeLessThan(text.length);
    expect(result.numTokens).toBe(3);
    expect(result.warning).toContain("automatically trimmed");
    expect(mockFree).toHaveBeenCalled();
  });

  it("should handle unicode characters", () => {
    // Mixes ASCII with emoji (code points outside the Basic Multilingual Plane).
    const text = "Hello 👋 World 🌍";
    mockEncode
      .mockReturnValueOnce(new Array(8))
      .mockReturnValueOnce(new Array(4));

    const result = trimToTokenLimit(text, 4, "gpt-4o");

    expect(result.text.length).toBeLessThan(text.length);
    expect(result.numTokens).toBe(4);
    expect(result.warning).toContain("automatically trimmed");
    expect(mockFree).toHaveBeenCalled();
  });

  it("should handle multiple trimming iterations", () => {
    const text = "A".repeat(1000);
    // Three over-limit readings in a row, then one that fits the limit of 50.
    mockEncode
      .mockReturnValueOnce(new Array(300))
      .mockReturnValueOnce(new Array(200))
      .mockReturnValueOnce(new Array(100))
      .mockReturnValueOnce(new Array(50));

    const result = trimToTokenLimit(text, 50, "gpt-4o");

    expect(result.text.length).toBeLessThan(text.length);
    expect(result.numTokens).toBe(50);
    expect(result.warning).toContain("automatically trimmed");
    expect(mockEncode).toHaveBeenCalledTimes(4);
    expect(mockFree).toHaveBeenCalled();
  });

  it("should handle exact token limit match", () => {
    const text = "Exact token limit text";
    mockEncode.mockReturnValue(new Array(10));

    const result = trimToTokenLimit(text, 10, "gpt-4o");

    expect(result.text).toBe(text);
    expect(result.numTokens).toBe(10);
    expect(result.warning).toBeUndefined();
    expect(mockFree).toHaveBeenCalled();
  });
});