import { parseMarkdown } from "../../../lib/html-to-markdown";
import { Meta } from "..";
import { Document } from "../../../controllers/v1/types";
import { htmlTransform } from "../lib/removeUnwantedElements";
import { extractLinks } from "../lib/extractLinks";
import { extractMetadata } from "../lib/extractMetadata";
import { performLLMExtract } from "./llmExtract";
import { uploadScreenshot } from "./uploadScreenshot";
import { removeBase64Images } from "./removeBase64Images";
import { saveToCache } from "./cache";
import { performAgent } from "./agent";
import { deriveDiff } from "./diff";
/**
 * A single step of the post-scrape pipeline.
 *
 * A transformer receives the scrape context and the document produced so far,
 * and returns the document that should be handed to the next step -- either
 * synchronously or through a promise.
 */
export type Transformer = (
  meta: Meta,
  document: Document,
) => Document | Promise<Document>;
/**
 * Merges metadata extracted from the raw HTML into `document.metadata`.
 *
 * Values already present on `document.metadata` win over freshly extracted
 * ones, because the existing metadata is spread last.
 *
 * @param meta - Scrape context, forwarded to `extractMetadata`.
 * @param document - Document being transformed; mutated in place.
 * @returns The same `document` instance with the merged metadata.
 * @throws Error (as a rejected promise) when `document.rawHtml` is undefined.
 */
export async function deriveMetadataFromRawHTML(
  meta: Meta,
  document: Document,
): Promise<Document> {
  if (document.rawHtml === undefined) {
    throw new Error(
      "rawHtml is undefined -- this transformer is being called out of order",
    );
  }

  const extracted = await extractMetadata(meta, document.rawHtml);

  // Read document.metadata only after the await so that the current value is
  // the one that overrides the extracted fields.
  document.metadata = {
    ...extracted,
    ...document.metadata,
  };
  return document;
}
/**
 * Produces `document.html` by running the raw HTML through `htmlTransform`.
 *
 * The URL passed to `htmlTransform` is the first defined value of
 * `document.metadata.url`, `document.metadata.sourceURL` and `meta.url`.
 *
 * @param meta - Scrape context; supplies the fallback URL and `options`.
 * @param document - Document being transformed; mutated in place.
 * @returns The same `document` instance with `html` set.
 * @throws Error (as a rejected promise) when `document.rawHtml` is undefined.
 */
export async function deriveHTMLFromRawHTML(
  meta: Meta,
  document: Document,
): Promise<Document> {
  if (document.rawHtml === undefined) {
    throw new Error(
      "rawHtml is undefined -- this transformer is being called out of order",
    );
  }

  // Prefer URLs recorded on the document over the URL the scrape started from.
  const pageUrl =
    document.metadata.url ?? document.metadata.sourceURL ?? meta.url;

  document.html = await htmlTransform(document.rawHtml, pageUrl, meta.options);
  return document;
}
/**
 * Produces `document.markdown` by converting `document.html` with
 * `parseMarkdown`.
 *
 * @param _meta - Scrape context; unused, present to satisfy `Transformer`.
 * @param document - Document being transformed; mutated in place.
 * @returns The same `document` instance with `markdown` set.
 * @throws Error (as a rejected promise) when `document.html` is undefined.
 */
export async function deriveMarkdownFromHTML(
  _meta: Meta,
  document: Document,
): Promise<Document> {
  if (document.html === undefined) {
    throw new Error(
      "html is undefined -- this transformer is being called out of order",
    );
  }

  document.markdown = await parseMarkdown(document.html);
  return document;
}
/**
 * Populates `document.links` from `document.html` when the request asked for
 * the "links" format; otherwise leaves the document untouched.
 *
 * Links are resolved against the first defined value of
 * `document.metadata.url`, `document.metadata.sourceURL` and `meta.url` --
 * the same precedence `deriveHTMLFromRawHTML` uses -- so that both
 * transformers agree on which URL the page was served from.
 *
 * @param meta - Scrape context; supplies `options.formats` and the fallback URL.
 * @param document - Document being transformed; mutated in place.
 * @returns The same `document` instance, with `links` set when requested.
 * @throws Error (as a rejected promise) when links were requested but
 *   `document.html` is undefined.
 */
export async function deriveLinksFromHTML(meta: Meta, document: Document): Promise<Document> {
  // Only derive if the formats has links
  if (!meta.options.formats.includes("links")) {
    return document;
  }

  if (document.html === undefined) {
    throw new Error(
      "html is undefined -- this transformer is being called out of order",
    );
  }

  // Optional chaining keeps the previous behaviour (fall back to meta.url)
  // for callers that hand in a document without metadata.
  const baseUrl =
    document.metadata?.url ?? document.metadata?.sourceURL ?? meta.url;

  document.links = await extractLinks(document.html, baseUrl);
  return document;
}
/**
 * Makes the fields present on `document` agree with the formats requested in
 * `meta.options.formats`.
 *
 * For every format-backed field:
 * - if the format was not requested but the field is present, the field is
 *   deleted (with a warning for fields whose presence indicates wasted work);
 * - if the format was requested but the field is missing, a warning is logged
 *   and the document is left as is.
 *
 * In addition, `changeTracking.diff` / `changeTracking.json` are removed when
 * the matching mode ("git-diff" / "json") is not listed in
 * `meta.options.changeTrackingOptions.modes`, and `document.actions` is
 * removed when no actions were requested.
 *
 * @param meta - Scrape context; supplies `options` and the `logger`.
 * @param document - Document to prune; mutated in place.
 * @returns The same `document` instance.
 */
export function coerceFieldsToFormats(
  meta: Meta,
  document: Document,
): Document {
  const formats = new Set(meta.options.formats);

  // markdown / rawHtml / html are dropped silently when not requested, unlike
  // the fields further down whose unexpected presence is logged as a bug.
  if (!formats.has("markdown") && document.markdown !== undefined) {
    delete document.markdown;
  } else if (formats.has("markdown") && document.markdown === undefined) {
    meta.logger.warn(
      "Request had format: markdown, but there was no markdown field in the result.",
    );
  }

  if (!formats.has("rawHtml") && document.rawHtml !== undefined) {
    delete document.rawHtml;
  } else if (formats.has("rawHtml") && document.rawHtml === undefined) {
    meta.logger.warn(
      "Request had format: rawHtml, but there was no rawHtml field in the result.",
    );
  }

  if (!formats.has("html") && document.html !== undefined) {
    delete document.html;
  } else if (formats.has("html") && document.html === undefined) {
    meta.logger.warn(
      "Request had format: html, but there was no html field in the result.",
    );
  }

  // Either screenshot variant justifies keeping the screenshot field.
  const wantsScreenshot =
    formats.has("screenshot") || formats.has("screenshot@fullPage");
  if (!wantsScreenshot && document.screenshot !== undefined) {
    meta.logger.warn(
      "Removed screenshot from Document because it wasn't in formats -- this is very wasteful and indicates a bug.",
    );
    delete document.screenshot;
  } else if (wantsScreenshot && document.screenshot === undefined) {
    meta.logger.warn(
      "Request had format: screenshot / screenshot@fullPage, but there was no screenshot field in the result.",
    );
  }

  if (!formats.has("links") && document.links !== undefined) {
    meta.logger.warn(
      "Removed links from Document because it wasn't in formats -- this is wasteful and indicates a bug.",
    );
    delete document.links;
  } else if (formats.has("links") && document.links === undefined) {
    meta.logger.warn(
      "Request had format: links, but there was no links field in the result.",
    );
  }

  // The extraction result may live in either `extract` or `json`; both are
  // governed by the "extract" format, so both must go when it is absent.
  const hasExtractResult =
    document.extract !== undefined || document.json !== undefined;
  if (!formats.has("extract") && hasExtractResult) {
    meta.logger.warn(
      "Removed extract from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug.",
    );
    delete document.extract;
    delete document.json;
  } else if (formats.has("extract") && !hasExtractResult) {
    meta.logger.warn(
      "Request had format extract, but there was no extract field in the result.",
    );
  }

  if (!formats.has("changeTracking") && document.changeTracking !== undefined) {
    meta.logger.warn(
      "Removed changeTracking from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug.",
    );
    delete document.changeTracking;
  } else if (formats.has("changeTracking") && document.changeTracking === undefined) {
    meta.logger.warn(
      "Request had format changeTracking, but there was no changeTracking field in the result.",
    );
  }

  // Within changeTracking, each sub-result is only kept when its mode was
  // explicitly requested.
  if (
    document.changeTracking &&
    !meta.options.changeTrackingOptions?.modes?.includes("git-diff") &&
    document.changeTracking.diff !== undefined
  ) {
    meta.logger.warn(
      "Removed diff from changeTracking because git-diff mode wasn't specified in changeTrackingOptions.modes.",
    );
    delete document.changeTracking.diff;
  }

  if (
    document.changeTracking &&
    !meta.options.changeTrackingOptions?.modes?.includes("json") &&
    document.changeTracking.json !== undefined
  ) {
    meta.logger.warn(
      "Removed json from changeTracking because json mode wasn't specified in changeTrackingOptions.modes.",
    );
    delete document.changeTracking.json;
  }

  if (meta.options.actions === undefined || meta.options.actions.length === 0) {
    delete document.actions;
  }

  return document;
}
// TODO: allow some of these to run in parallel
/**
 * Ordered list of transformers that `executeTransformers` applies to a
 * document, one after another, each receiving the previous one's result.
 *
 * The order is significant for the transformers defined in this file:
 * - `deriveHTMLFromRawHTML` sets `document.html`, which
 *   `deriveMarkdownFromHTML` and `deriveLinksFromHTML` require (they throw
 *   when it is undefined), so it comes before them.
 * - `coerceFieldsToFormats` deletes fields that were not requested
 *   (including `rawHtml` and `html`), so it comes after every step here that
 *   reads those fields.
 *
 * NOTE(review): the ordering constraints of the imported transformers
 * (`saveToCache`, `uploadScreenshot`, `performLLMExtract`, `performAgent`,
 * `deriveDiff`, `removeBase64Images`) are not visible from this file --
 * confirm before reordering.
 */
export const transformerStack: Transformer[] = [
saveToCache,
deriveHTMLFromRawHTML,
deriveMarkdownFromHTML,
deriveLinksFromHTML,
deriveMetadataFromRawHTML,
uploadScreenshot,
performLLMExtract,
performAgent,
deriveDiff,
coerceFieldsToFormats,
removeBase64Images,
];
/**
 * Runs every transformer in `transformerStack` over `document`, in order.
 *
 * Each transformer is awaited before the next one starts and receives the
 * document returned by its predecessor. Every transformer gets a shallow copy
 * of `meta` whose logger is a child logger tagged with
 * `executeTransformers/<transformer name>`. After the last step, the
 * wall-clock duration of each transformer (milliseconds) is logged at debug
 * level on the original logger.
 *
 * An error thrown by a transformer is not caught here; it rejects the
 * returned promise and the remaining transformers do not run.
 *
 * @param meta - Scrape context shared by all transformers.
 * @param document - Initial document.
 * @returns The document returned by the last transformer.
 */
export async function executeTransformers(
  meta: Meta,
  document: Document,
): Promise<Document> {
  // [transformer name, elapsed ms] pairs, in execution order.
  const executions: [string, number][] = [];

  for (const transformer of transformerStack) {
    const scopedMeta = {
      ...meta,
      logger: meta.logger.child({
        method: "executeTransformers/" + transformer.name,
      }),
    };

    const start = Date.now();
    document = await transformer(scopedMeta, document);
    executions.push([transformer.name, Date.now() - start]);
  }

  meta.logger.debug("Executed transformers.", { executions });
  return document;
}