/**
 * Jest suite for the HTML transformer helpers: `extractLinks`,
 * `extractMetadata` and `transformHtml`.
 *
 * NOTE(review): the markup inside the HTML fixtures (and inside a few
 * expectation strings) was missing from the previous revision of this file,
 * which left multi-line double-quoted strings and assertions such as
 * `not.toContain("")` that can never pass. The fixtures below were rebuilt
 * from the values the assertions require (hrefs, meta names, excluded text).
 * Confirm them against the transformer's real behaviour; places where the
 * original value could not be derived from an assertion are flagged inline.
 */
import {
  extractLinks,
  extractMetadata,
  transformHtml,
} from "../html-transformer";

describe("HTML Transformer", () => {
  describe("extractLinks", () => {
    it("should return empty array for null or undefined input", async () => {
      expect(await extractLinks(null)).toEqual([]);
      expect(await extractLinks(undefined)).toEqual([]);
    });

    it("should extract links from HTML content", async () => {
      const html = `
        <html>
          <body>
            <a href="https://example.com">Example</a>
            <a href="https://test.com">Test</a>
          </body>
        </html>
      `;

      const links = await extractLinks(html);

      expect(links).toContain("https://example.com");
      expect(links).toContain("https://test.com");
    });

    it("should handle relative links", async () => {
      // The hrefs are expected back verbatim and in document order.
      const html = `
        <html>
          <body>
            <a href="/path/to/page">Relative</a>
            <a href="../another/page">Parent Path</a>
            <a href="./local/page">Local Path</a>
            <a href="relative/path">Implicit Relative</a>
            <a href="?param=value">Query Param</a>
            <a href="#section">Hash Link</a>
          </body>
        </html>
      `;

      const links = await extractLinks(html);

      expect(links).toEqual([
        "/path/to/page",
        "../another/page",
        "./local/page",
        "relative/path",
        "?param=value",
        "#section",
      ]);
    });

    it("should handle complex nested HTML structure", async () => {
      // Links sit at different nesting depths: list items inside a nav,
      // inline inside a paragraph, and inside a nested div.
      const html = `
        <html>
          <body>
            <div>
              <nav>
                <ul>
                  <li><a href="https://nav1.com">Nav 1</a></li>
                  <li><a href="https://nav2.com">Nav 2</a></li>
                </ul>
              </nav>
              <main>
                <article>
                  <p>Some text with an <a href="https://inline.com">inline link</a></p>
                  <div>
                    <div>
                      <a href="https://nested.com">Nested link</a>
                    </div>
                  </div>
                </article>
              </main>
            </div>
          </body>
        </html>
      `;

      const links = await extractLinks(html);

      expect(links).toContain("https://nav1.com");
      expect(links).toContain("https://nav2.com");
      expect(links).toContain("https://inline.com");
      expect(links).toContain("https://nested.com");
    });

    it("should handle malformed HTML gracefully", async () => {
      // The wrapping div is intentionally left unclosed.
      const html = `
        <div>
          <a href="https://valid.com">Valid</a>
          <a href="invalid">Invalid</a>
          <a>No href</a>
          <a href="">Empty href</a>
          <a href="javascript:void(0)">JavaScript href</a>
          <a href="mailto:test@example.com">Email link</a>
      `;

      const links = await extractLinks(html);

      expect(links).toContain("https://valid.com");
      // Other links should be filtered out or handled appropriately
    });
  });

  describe("extractMetadata", () => {
    it("should return empty array for null or undefined input", async () => {
      expect(await extractMetadata(null)).toEqual([]);
      expect(await extractMetadata(undefined)).toEqual([]);
    });

    it("should extract comprehensive metadata from HTML content", async () => {
      const html = `
        <html>
          <head>
            <title>Test Page Title</title>
            <meta name="description" content="Detailed page description">
            <meta name="keywords" content="test,page,keywords">
            <meta name="author" content="Test Author">
            <meta property="og:title" content="OpenGraph Title">
            <meta property="og:description" content="OpenGraph Description">
            <meta property="og:image" content="https://example.com/image.jpg">
            <meta name="twitter:card" content="summary">
            <meta name="twitter:title" content="Twitter Title">
          </head>
          <body></body>
        </html>
      `;

      const metadata = await extractMetadata(html);

      // OpenGraph values are expected under both their raw property name
      // ("og:title") and a camelCase alias ("ogTitle").
      expect(metadata).toMatchObject({
        "twitter:title": "Twitter Title",
        ogImage: "https://example.com/image.jpg",
        "og:image": "https://example.com/image.jpg",
        ogDescription: "OpenGraph Description",
        "twitter:card": "summary",
        title: "Test Page Title",
        ogTitle: "OpenGraph Title",
        author: "Test Author",
        keywords: "test,page,keywords",
        "og:title": "OpenGraph Title",
        "og:description": "OpenGraph Description",
        description: "Detailed page description",
      });
    });

    it("should handle metadata with special characters and encoding", async () => {
      const html = `
        <html>
          <head>
            <title>Test &amp; Page with ©️ symbols</title>
            <meta name="description" content="Description with &quot;quotes&quot; and 'apostrophes'">
          </head>
          <body></body>
        </html>
      `;

      const metadata = await extractMetadata(html);

      expect(metadata.title).toContain("&");
      expect(metadata.description).toContain("quotes");
    });

    it("should handle missing or malformed metadata gracefully", async () => {
      // Meta tags with no content, or empty content, must not make extraction throw.
      const html = `
        <html>
          <head>
            <meta name="description">
            <meta property="og:title">
            <meta name="keywords" content="">
          </head>
          <body></body>
        </html>
      `;

      const metadata = await extractMetadata(html);

      expect(metadata).toBeDefined();
    });
  });

  describe("transformHtml", () => {
    it("should transform HTML content according to options", async () => {
      const options = {
        html: "<div><p>Test</p><span>Remove me</span></div>",
        url: "https://example.com",
        include_tags: ["p"],
        exclude_tags: ["span"],
        only_main_content: true,
      };

      const result = await transformHtml(options);

      expect(result).toContain("<p>");
      expect(result).not.toContain("<span>");
    });

    it("should handle complex content filtering", async () => {
      const options = {
        html: `
          <html>
            <body>
              <nav>Navigation</nav>
              <main>
                <article>
                  <h1>Title</h1>
                  <p>Important content</p>
                  <div class="ads">Advertisement</div>
                  <div class="social-share">Share buttons</div>
                </article>
              </main>
              <footer>Footer content</footer>
            </body>
          </html>
        `,
        url: "https://example.com",
        include_tags: ["article", "h1", "p"],
        exclude_tags: ["nav", "aside", "footer", ".ads", ".social-share"],
        only_main_content: true,
      };

      const result = await transformHtml(options);

      expect(result).toContain("<h1>Title</h1>");
      expect(result).toContain("<p>Important content</p>");
      expect(result).not.toContain("Navigation");
      expect(result).not.toContain("Advertisement");
      expect(result).not.toContain("Share buttons");
      expect(result).not.toContain("Footer content");
    });

    it("should handle nested content preservation and absolute links", async () => {
      const options = {
        html: `
          <article>
            <h1>Section</h1>
            <p>Text with <strong>bold</strong> and <em>emphasis</em></p>
            <p><img src="/image.png"></p>
          </article>
        `,
        url: "https://example.com",
        include_tags: ["article", "p", "ul", "li"],
        exclude_tags: [],
        only_main_content: true,
      };

      const result = await transformHtml(options);

      expect(result).toContain("bold");
      expect(result).toContain("emphasis");
      // NOTE(review): the original expectation string was empty (its markup
      // was lost). Presumably it checked a relative URL resolved against
      // `url`; confirm the element and attribute used.
      expect(result).toContain('src="https://example.com/image.png"');
    });

    it("should handle empty HTML content", async () => {
      const options = {
        html: "",
        url: "https://example.com",
        include_tags: [],
        exclude_tags: [],
        only_main_content: false,
      };

      const result = await transformHtml(options);

      // NOTE(review): kept as found; confirm the transformer does not emit
      // wrapper markup for empty input.
      expect(result).toBe("");
    });

    it("should handle malformed HTML", async () => {
      const options = {
        html: "<div>Unclosed div",
        url: "https://example.com",
        include_tags: [],
        exclude_tags: [],
        only_main_content: false,
      };

      const result = await transformHtml(options);

      // NOTE(review): the exact expected string lost its markup, so this
      // checks only that the div was closed. Tighten back to `toBe(...)`
      // once the full serialised output is confirmed.
      expect(result).toContain("<div>Unclosed div</div>");
    });

    it("should handle HTML with comments and scripts", async () => {
      const options = {
        html: `
          <div>
            <!-- Comment -->
            <script>alert('test');</script>
            <style>.test { color: red; }</style>
            <noscript>Enable JavaScript</noscript>
            <p>Real content</p>
          </div>
        `,
        url: "https://example.com",
        include_tags: ["p"],
        exclude_tags: ["script", "style", "noscript"],
        only_main_content: true,
      };

      const result = await transformHtml(options);

      expect(result).toContain("<p>Real content</p>");
      expect(result).not.toContain("alert");
      expect(result).not.toContain("color: red");
      expect(result).not.toContain("Enable JavaScript");
    });

    it("should handle special characters and encoding", async () => {
      const options = {
        html: `
          <div>
            <p>&copy; 2024</p>
            <p>&lt;tag&gt;</p>
            <p>Special chars: á é í ó ú ñ</p>
            <p>Emojis: 🎉 👍 🚀</p>
          </div>
        `,
        url: "https://example.com",
        include_tags: ["p"],
        exclude_tags: [],
        only_main_content: true,
      };

      const result = await transformHtml(options);

      expect(result).toContain("©");
      expect(result).toContain("á é í ó ú ñ");
      expect(result).toContain("🎉 👍 🚀");
    });

    it("should make all URLs absolute", async () => {
      // One link per URL form: absolute https, absolute http, root-relative
      // path, protocol-relative, query-only and fragment-only.
      // NOTE(review): the element carrying #q1/#q2 was lost; an <img srcset>
      // is assumed here. Confirm against the transformer.
      const options = {
        html: `
          <div>
            <a href="https://example.com/fullurl">hi</a>
            <a href="http://example.net/fullurl">hi</a>
            <a href="/pathurl">hi</a>
            <a href="//example.net/proturl">hi</a>
            <a href="?queryurl">hi</a>
            <a href="#hashurl">hi</a>
            <img srcset="#q1 1x, #q2 2x">
          </div>
        `,
        url: "https://example.com",
        include_tags: [],
        exclude_tags: [],
        only_main_content: true,
      };

      const result = await transformHtml(options);

      expect(result).toContain("https://example.com/fullurl");
      expect(result).toContain("http://example.net/fullurl");
      expect(result).toContain("https://example.com/pathurl");
      expect(result).toContain("https://example.net/proturl");
      expect(result).toContain("https://example.com/?queryurl");
      expect(result).toContain("https://example.com/#hashurl");
      expect(result).toContain("https://example.com/#q1");
      expect(result).toContain("https://example.com/#q2");
    });
  });
});