Echo-AI-official's picture
Upload 280 files
0e759d2 verified
import { supabase_service } from "../../../services/supabase";
import { Document } from "../../../controllers/v1/types";
import { Meta } from "../index";
import { getJob } from "../../../controllers/v1/crawl-status";
import gitDiff from 'git-diff';
import parseDiff from 'parse-diff';
import { generateCompletions } from "./llmExtract";
/**
 * Runs a schema-guided LLM extraction over a piece of markdown.
 *
 * The schema is read from `meta.options.changeTrackingOptions.schema`; the
 * call is made with a fixed system prompt and temperature 0.
 *
 * @param content - Markdown text passed to `generateCompletions`.
 * @param meta - Scrape context supplying the logger and change-tracking options.
 * @returns The `extract` and `cost` reported by `generateCompletions`, or
 *          `null` if the call throws (the error is logged, not rethrown).
 */
async function extractDataWithSchema(content: string, meta: Meta): Promise<{ extract: any, cost: number } | null> {
  try {
    const childLogger = meta.logger.child({
      method: "extractDataWithSchema/generateCompletions",
    });
    const targetSchema = meta.options.changeTrackingOptions?.schema;

    const completion = await generateCompletions({
      logger: childLogger,
      options: {
        mode: "llm",
        schema: targetSchema,
        systemPrompt: "Extract the requested information from the content based on the provided schema.",
        temperature: 0
      },
      markdown: content
    });

    return { extract: completion.extract, cost: completion.cost };
  } catch (error) {
    // Best-effort: callers treat `null` as "extraction unavailable".
    meta.logger.error("Error extracting data with schema", { error });
    return null;
  }
}
/**
 * Serializes a value to JSON with object keys emitted in sorted order at
 * every nesting level, so that two structurally equal objects always produce
 * the same string regardless of key insertion order. Array order is kept.
 */
function canonicalJson(value: unknown): string | undefined {
  return JSON.stringify(value, (_key: string, node: unknown) => {
    if (node === null || typeof node !== "object" || Array.isArray(node)) {
      return node;
    }
    const source = node as Record<string, unknown>;
    // Null-prototype target so a literal "__proto__" key stays an own property.
    const ordered: Record<string, unknown> = Object.create(null);
    for (const name of Object.keys(source).sort()) {
      ordered[name] = source[name];
    }
    return ordered;
  });
}

/**
 * Computes a top-level, per-key diff between two extracted data objects.
 *
 * Every key present in either input is compared by canonical JSON form
 * (key order inside nested objects is ignored; array order is significant).
 * Keys whose values differ are reported with both sides; a key missing on
 * one side is reported with `undefined` for that side.
 *
 * @param previousData - Data extracted from the earlier scrape (nullish is treated as empty).
 * @param currentData - Data extracted from the current scrape (nullish is treated as empty).
 * @returns A map of changed keys to `{ previous, current }`; empty when nothing differs.
 */
function compareExtractedData(previousData: any, currentData: any): any {
  const result: Record<string, { previous: any, current: any }> = {};
  const allKeys = new Set([
    ...Object.keys(previousData || {}),
    ...Object.keys(currentData || {})
  ]);

  for (const key of allKeys) {
    const oldValue = previousData?.[key];
    const newValue = currentData?.[key];

    // Plain JSON.stringify is sensitive to key insertion order, which would
    // flag `{a, b}` vs `{b, a}` as a change; compare canonical forms instead.
    if (canonicalJson(oldValue) !== canonicalJson(newValue)) {
      result[key] = {
        previous: oldValue,
        current: newValue
      };
    }
  }

  return result;
}
/**
 * Attaches change-tracking information to a scraped document.
 *
 * Does nothing unless the request's `formats` include `"changeTracking"`.
 * Otherwise it calls the `diff_get_last_scrape_3` RPC for the team/URL, loads
 * the referenced job via `getJob`, and then:
 *
 * - If a previous job with a return value exists, sets
 *   `document.changeTracking` with `previousScrapeAt`, a `changeStatus` of
 *   `"removed"` (current status code 404), `"changed"` or `"same"`, and a
 *   `visibility` flag. When the status is `"changed"`, the optional
 *   `"git-diff"` and `"json"` modes add `diff` / `json` payloads.
 * - If there is no previous job and the RPC reported no error, marks the
 *   document as `"new"` (or `"removed"` on 404).
 * - If the RPC reported an error, logs it and prepends a warning to
 *   `document.warning`.
 *
 * @param meta - Scrape context (options, internal options, logger, url).
 * @param document - The freshly scraped document; mutated in place.
 * @returns The same `document` instance.
 */
export async function deriveDiff(meta: Meta, document: Document): Promise<Document> {
  // Change tracking is opt-in; leave the document untouched otherwise.
  if (!meta.options.formats.includes("changeTracking")) {
    return document;
  }

  const res = await supabase_service
    .rpc("diff_get_last_scrape_3", {
      i_team_id: meta.internalOptions.teamId,
      i_url: document.metadata.sourceURL ?? meta.url,
    });

  // Only the first returned row is used.
  const data: {
    o_job_id: string,
    o_date_added: string,
  } | undefined | null = (res.data ?? [])[0] as any;

  const job: {
    returnvalue: Document,
  } | null = data?.o_job_id ? await getJob(data.o_job_id) : null;

  if (data && job && job?.returnvalue) {
    const previousMarkdown = job.returnvalue.markdown!;
    const currentMarkdown = document.markdown!;

    // Normalise before comparing: drop all whitespace, drop `[iframe](...)`
    // links, then sort the remaining characters. Two texts therefore count as
    // equal when they contain the same multiset of characters.
    const transformer = (x: string) => [...x.replace(/\s+/g, "").replace(/\[iframe\]\(.+?\)/g, "")].sort().join("");
    const isChanged = transformer(previousMarkdown) !== transformer(currentMarkdown);
    const changeStatus = document.metadata.statusCode === 404 ? "removed" : isChanged ? "changed" : "same";

    document.changeTracking = {
      previousScrapeAt: data.o_date_added,
      changeStatus,
      visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible",
    }

    const trackingOptions = meta.options.changeTrackingOptions;

    if (trackingOptions?.modes?.includes("git-diff") && changeStatus === "changed") {
      const diffText = gitDiff(previousMarkdown, currentMarkdown, {
        color: false,
        wordDiff: false
      });
      meta.logger.debug("Diff text", { diffText });

      // An empty/undefined diff text means there is nothing to attach.
      if (diffText) {
        const diffStructured = parseDiff(diffText);
        meta.logger.debug("Diff structured", { diffStructured });

        document.changeTracking.diff = {
          text: diffText,
          json: {
            files: diffStructured.map(file => ({
              from: file.from || null,
              to: file.to || null,
              chunks: file.chunks.map(chunk => ({
                content: chunk.content,
                changes: chunk.changes.map(change => {
                  const baseChange = {
                    type: change.type,
                    content: change.content
                  };

                  // Copy the line-number fields that belong to each change
                  // type, plus a boolean flag named after the type.
                  if (change.type === 'normal' && 'ln1' in change && 'ln2' in change) {
                    return {
                      ...baseChange,
                      normal: true,
                      ln1: change.ln1,
                      ln2: change.ln2
                    };
                  } else if (change.type === 'add' && 'ln' in change) {
                    return {
                      ...baseChange,
                      add: true,
                      ln: change.ln
                    };
                  } else if (change.type === 'del' && 'ln' in change) {
                    return {
                      ...baseChange,
                      del: true,
                      ln: change.ln
                    };
                  }

                  return baseChange;
                })
              }))
            }))
          }
        };
      }
    }

    if (trackingOptions && trackingOptions.modes?.includes("json") && changeStatus === "changed") {
      try {
        // With a schema, extract both versions separately and diff the
        // results key by key; extraction failures yield `null`.
        const previousData = trackingOptions.schema ?
          await extractDataWithSchema(previousMarkdown, meta) : null;

        const currentData = trackingOptions.schema ?
          await extractDataWithSchema(currentMarkdown, meta) : null;

        if (previousData && currentData) {
          document.changeTracking.json = compareExtractedData(previousData.extract, currentData.extract);

          if (document.metadata.costTracking) {
            const tracking = document.metadata.costTracking;
            tracking.otherCallCount += 2;
            tracking.otherCost = tracking.otherCost + previousData.cost + currentData.cost;
            // Keep the running total in step with `otherCost`; the
            // initialising branch below already includes these costs in
            // `totalCost`, so the accumulating branch must as well.
            tracking.totalCost = tracking.totalCost + previousData.cost + currentData.cost;
          } else {
            document.metadata.costTracking = {
              smartScrapeCallCount: 0,
              smartScrapeCost: 0,
              otherCallCount: 2,
              otherCost: previousData.cost + currentData.cost,
              totalCost: previousData.cost + currentData.cost
            }
          }
        } else {
          // No schema, or at least one extraction failed: ask the model to
          // summarise the differences from both versions in a single call.
          // NOTE(review): the cost of this call (and of a single successful
          // extraction above) is not added to costTracking — confirm whether
          // that is intentional.
          const { extract } = await generateCompletions({
            logger: meta.logger.child({
              method: "deriveDiff/generateCompletions",
            }),
            options: {
              mode: "llm",
              systemPrompt: "Analyze the differences between the previous and current content and provide a structured summary of the changes.",
              schema: trackingOptions.schema,
              prompt: trackingOptions.prompt,
              temperature: 0
            },
            markdown: `Previous Content:\n${previousMarkdown}\n\nCurrent Content:\n${currentMarkdown}`,
            previousWarning: document.warning
          });

          document.changeTracking.json = extract;
        }
      } catch (error) {
        // Structured diff is best-effort: keep the rest of changeTracking and
        // surface the failure as a warning placed before any existing one.
        meta.logger.error("Error generating structured diff with LLM", { error });
        document.warning = "Structured diff generation failed." + (document.warning ? ` ${document.warning}` : "");
      }
    }
  } else if (!res.error) {
    // The lookup succeeded but there is nothing to compare against.
    document.changeTracking = {
      previousScrapeAt: null,
      changeStatus: document.metadata.statusCode === 404 ? "removed" : "new",
      visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible",
    }
  } else {
    meta.logger.error("Error fetching previous scrape", { error: res.error });
    document.warning = "Comparing failed, please try again later." + (document.warning ? ` ${document.warning}` : "");
  }

  return document;
}