import { parseMarkdown } from "../../../lib/html-to-markdown";
import { Meta } from "..";
import { Document } from "../../../controllers/v1/types";
import { htmlTransform } from "../lib/removeUnwantedElements";
import { extractLinks } from "../lib/extractLinks";
import { extractMetadata } from "../lib/extractMetadata";
import { performLLMExtract } from "./llmExtract";
import { uploadScreenshot } from "./uploadScreenshot";
import { removeBase64Images } from "./removeBase64Images";
import { saveToCache } from "./cache";
import { performAgent } from "./agent";
import { deriveDiff } from "./diff";
/**
 * A single step of the document post-processing pipeline.
 *
 * A transformer receives the scrape `meta` and the current `document` and
 * yields the document to hand to the next step, either directly or through a
 * promise.
 */
export type Transformer = (meta: Meta, document: Document) => Promise<Document> | Document;
/**
 * Fills `document.metadata` with metadata extracted from `document.rawHtml`.
 *
 * Keys already present on `document.metadata` take precedence over freshly
 * extracted keys of the same name.
 *
 * @param meta - Scrape context, forwarded to `extractMetadata`.
 * @param document - Document to update in place; must already carry `rawHtml`.
 * @returns The same `document` instance with merged metadata.
 * @throws Error when `document.rawHtml` is undefined.
 */
export async function deriveMetadataFromRawHTML(
  meta: Meta,
  document: Document,
): Promise<Document> {
  const { rawHtml } = document;
  if (rawHtml === undefined) {
    throw new Error(
      "rawHtml is undefined -- this transformer is being called out of order",
    );
  }

  const extracted = await extractMetadata(meta, rawHtml);
  // Existing metadata is spread last so that it overrides extracted values.
  document.metadata = { ...extracted, ...document.metadata };
  return document;
}
/**
 * Produces `document.html` by running `htmlTransform` over `document.rawHtml`.
 *
 * The URL handed to `htmlTransform` is the first defined value of
 * `document.metadata.url`, `document.metadata.sourceURL` and `meta.url`.
 *
 * @param meta - Scrape context; supplies the fallback URL and `options`.
 * @param document - Document to update in place; must already carry `rawHtml`.
 * @returns The same `document` instance with `html` set.
 * @throws Error when `document.rawHtml` is undefined.
 */
export async function deriveHTMLFromRawHTML(
  meta: Meta,
  document: Document,
): Promise<Document> {
  const { rawHtml, metadata } = document;
  if (rawHtml === undefined) {
    throw new Error(
      "rawHtml is undefined -- this transformer is being called out of order",
    );
  }

  const pageUrl = metadata.url ?? metadata.sourceURL ?? meta.url;
  document.html = await htmlTransform(rawHtml, pageUrl, meta.options);
  return document;
}
/**
 * Produces `document.markdown` by passing `document.html` to `parseMarkdown`.
 *
 * @param _meta - Unused; present to satisfy the transformer signature.
 * @param document - Document to update in place; must already carry `html`.
 * @returns The same `document` instance with `markdown` set.
 * @throws Error when `document.html` is undefined.
 */
export async function deriveMarkdownFromHTML(
  _meta: Meta,
  document: Document,
): Promise<Document> {
  const html = document.html;
  if (html !== undefined) {
    document.markdown = await parseMarkdown(html);
    return document;
  }

  throw new Error(
    "html is undefined -- this transformer is being called out of order",
  );
}
/**
 * Populates `document.links` from `document.html` when the request asked for
 * the "links" format; otherwise leaves the document untouched.
 *
 * Links are resolved against the same URL fallback chain that
 * `deriveHTMLFromRawHTML` uses (`document.metadata.url`, then
 * `document.metadata.sourceURL`, then `meta.url`), so relative links follow the
 * page's recorded URL rather than always the originally requested one.
 *
 * @param meta - Scrape context; supplies `options.formats` and the fallback URL.
 * @param document - Document to update in place; must carry `html` when links
 *   are requested.
 * @returns The same `document` instance.
 * @throws Error when links are requested but `document.html` is undefined.
 */
export async function deriveLinksFromHTML(meta: Meta, document: Document): Promise<Document> {
  // Only derive if the formats has links
  if (!meta.options.formats.includes("links")) {
    return document;
  }

  if (document.html === undefined) {
    throw new Error(
      "html is undefined -- this transformer is being called out of order",
    );
  }

  // Optional chaining keeps the previous behaviour (falling back to meta.url)
  // should a document ever arrive without metadata.
  const baseUrl =
    document.metadata?.url ?? document.metadata?.sourceURL ?? meta.url;
  document.links = await extractLinks(document.html, baseUrl);
  return document;
}
/**
 * Reconciles the fields present on `document` with the formats requested in
 * `meta.options.formats`.
 *
 * Fields that were produced but not requested are deleted from the document;
 * fields that were requested but are missing only trigger a warning. The
 * document is mutated in place.
 *
 * Additional rules:
 * - `extract` and `json` are both governed by the "extract" format.
 * - `changeTracking.diff` / `changeTracking.json` are kept only when
 *   `changeTrackingOptions.modes` contains "git-diff" / "json" respectively.
 * - `actions` is removed when the request specified no actions.
 *
 * @param meta - Scrape context; supplies `options` and the `logger`.
 * @param document - Document to prune in place.
 * @returns The same `document` instance.
 */
export function coerceFieldsToFormats(
  meta: Meta,
  document: Document,
): Document {
  const formats = new Set(meta.options.formats);

  // These three fields are dropped without a warning when they were not
  // requested; only a requested-but-missing field is reported.
  for (const field of ["markdown", "rawHtml", "html"] as const) {
    if (!formats.has(field) && document[field] !== undefined) {
      delete document[field];
    } else if (formats.has(field) && document[field] === undefined) {
      meta.logger.warn(
        `Request had format: ${field}, but there was no ${field} field in the result.`,
      );
    }
  }

  // Either screenshot format keeps the single `screenshot` field.
  const wantsScreenshot =
    formats.has("screenshot") || formats.has("screenshot@fullPage");
  if (!wantsScreenshot && document.screenshot !== undefined) {
    meta.logger.warn(
      "Removed screenshot from Document because it wasn't in formats -- this is very wasteful and indicates a bug.",
    );
    delete document.screenshot;
  } else if (wantsScreenshot && document.screenshot === undefined) {
    meta.logger.warn(
      "Request had format: screenshot / screenshot@fullPage, but there was no screenshot field in the result.",
    );
  }

  if (!formats.has("links") && document.links !== undefined) {
    meta.logger.warn(
      "Removed links from Document because it wasn't in formats -- this is wasteful and indicates a bug.",
    );
    delete document.links;
  } else if (formats.has("links") && document.links === undefined) {
    meta.logger.warn(
      "Request had format: links, but there was no links field in the result.",
    );
  }

  const hasExtractResult =
    document.extract !== undefined || document.json !== undefined;
  if (!formats.has("extract") && hasExtractResult) {
    meta.logger.warn(
      "Removed extract from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug.",
    );
    delete document.extract;
    // This branch also fires for a stray `json` result, so it has to be
    // removed as well; otherwise it would be returned despite the warning.
    delete document.json;
  } else if (formats.has("extract") && !hasExtractResult) {
    meta.logger.warn(
      "Request had format extract, but there was no extract field in the result.",
    );
  }

  if (!formats.has("changeTracking") && document.changeTracking !== undefined) {
    meta.logger.warn(
      "Removed changeTracking from Document because it wasn't in formats -- this is extremely wasteful and indicates a bug.",
    );
    delete document.changeTracking;
  } else if (
    formats.has("changeTracking") &&
    document.changeTracking === undefined
  ) {
    meta.logger.warn(
      "Request had format changeTracking, but there was no changeTracking field in the result.",
    );
  }

  // Sub-fields of changeTracking are gated by the requested modes.
  const modes = meta.options.changeTrackingOptions?.modes;
  if (
    document.changeTracking &&
    !modes?.includes("git-diff") &&
    document.changeTracking.diff !== undefined
  ) {
    meta.logger.warn(
      "Removed diff from changeTracking because git-diff mode wasn't specified in changeTrackingOptions.modes.",
    );
    delete document.changeTracking.diff;
  }

  if (
    document.changeTracking &&
    !modes?.includes("json") &&
    document.changeTracking.json !== undefined
  ) {
    meta.logger.warn(
      "Removed json from changeTracking because json mode wasn't specified in changeTrackingOptions.modes.",
    );
    delete document.changeTracking.json;
  }

  if (meta.options.actions === undefined || meta.options.actions.length === 0) {
    delete document.actions;
  }

  return document;
}
// Ordered list of transformers that executeTransformers applies one after
// another; each entry receives the document returned by the previous one, so
// the order of this array is significant.
// TODO: allow some of these to run in parallel
export const transformerStack: Transformer[] = [
saveToCache,
// Populates document.html from document.rawHtml.
deriveHTMLFromRawHTML,
// Requires document.html, so it must come after deriveHTMLFromRawHTML.
deriveMarkdownFromHTML,
// Requires document.html when the "links" format is requested.
deriveLinksFromHTML,
deriveMetadataFromRawHTML,
uploadScreenshot,
performLLMExtract,
performAgent,
deriveDiff,
// Deletes fields that were produced but not requested in formats; placed
// after the transformers above that populate the document.
coerceFieldsToFormats,
removeBase64Images,
];
/**
 * Runs every transformer in `transformerStack` sequentially over `document`.
 *
 * Each transformer receives a shallow copy of `meta` whose `logger` is a child
 * logger tagged with `executeTransformers/<transformer name>`. The wall-clock
 * duration of each step (in milliseconds) is collected and emitted in a single
 * debug log entry once all steps have finished.
 *
 * @param meta - Scrape context shared by all transformers.
 * @param document - Initial document fed into the first transformer.
 * @returns The document returned by the last transformer.
 */
export async function executeTransformers(
  meta: Meta,
  document: Document,
): Promise<Document> {
  const timings: [string, number][] = [];
  let current = document;

  for (const step of transformerStack) {
    const scopedMeta = {
      ...meta,
      logger: meta.logger.child({
        method: `executeTransformers/${step.name}`,
      }),
    };

    const startedAt = Date.now();
    current = await step(scopedMeta, current);
    const elapsedMs = Date.now() - startedAt;

    timings.push([step.name, elapsedMs]);
  }

  meta.logger.debug("Executed transformers.", { executions: timings });
  return current;
}
|