import { supabase_service } from "../../../services/supabase"; import { Document } from "../../../controllers/v1/types"; import { Meta } from "../index"; import { getJob } from "../../../controllers/v1/crawl-status"; import gitDiff from 'git-diff'; import parseDiff from 'parse-diff'; import { generateCompletions } from "./llmExtract"; async function extractDataWithSchema(content: string, meta: Meta): Promise<{ extract: any, cost: number } | null> { try { const { extract, cost } = await generateCompletions({ logger: meta.logger.child({ method: "extractDataWithSchema/generateCompletions", }), options: { mode: "llm", schema: meta.options.changeTrackingOptions?.schema, systemPrompt: "Extract the requested information from the content based on the provided schema.", temperature: 0 }, markdown: content }); return { extract, cost }; } catch (error) { meta.logger.error("Error extracting data with schema", { error }); return null; } } function compareExtractedData(previousData: any, currentData: any): any { const result: Record = {}; const allKeys = new Set([ ...Object.keys(previousData || {}), ...Object.keys(currentData || {}) ]); for (const key of allKeys) { const oldValue = previousData?.[key]; const newValue = currentData?.[key]; if (JSON.stringify(oldValue) !== JSON.stringify(newValue)) { result[key] = { previous: oldValue, current: newValue }; } } return result; } export async function deriveDiff(meta: Meta, document: Document): Promise { if (meta.options.formats.includes("changeTracking")) { const res = await supabase_service .rpc("diff_get_last_scrape_3", { i_team_id: meta.internalOptions.teamId, i_url: document.metadata.sourceURL ?? meta.url, }); const data: { o_job_id: string, o_date_added: string, } | undefined | null = (res.data ?? [])[0] as any; const job: { returnvalue: Document, } | null = data?.o_job_id ? await getJob(data.o_job_id) : null; if (data && job && job?.returnvalue) { const previousMarkdown = job.returnvalue.markdown!; const currentMarkdown = document.markdown!; const transformer = (x: string) => [...x.replace(/\s+/g, "").replace(/\[iframe\]\(.+?\)/g, "")].sort().join(""); const isChanged = transformer(previousMarkdown) !== transformer(currentMarkdown); const changeStatus = document.metadata.statusCode === 404 ? "removed" : isChanged ? "changed" : "same"; document.changeTracking = { previousScrapeAt: data.o_date_added, changeStatus, visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible", } if (meta.options.changeTrackingOptions?.modes?.includes("git-diff") && changeStatus === "changed") { const diffText = gitDiff(previousMarkdown, currentMarkdown, { color: false, wordDiff: false }); meta.logger.debug("Diff text", { diffText }); if (diffText) { const diffStructured = parseDiff(diffText); meta.logger.debug("Diff structured", { diffStructured }); document.changeTracking.diff = { text: diffText, json: { files: diffStructured.map(file => ({ from: file.from || null, to: file.to || null, chunks: file.chunks.map(chunk => ({ content: chunk.content, changes: chunk.changes.map(change => { const baseChange = { type: change.type, content: change.content }; if (change.type === 'normal' && 'ln1' in change && 'ln2' in change) { return { ...baseChange, normal: true, ln1: change.ln1, ln2: change.ln2 }; } else if (change.type === 'add' && 'ln' in change) { return { ...baseChange, add: true, ln: change.ln }; } else if (change.type === 'del' && 'ln' in change) { return { ...baseChange, del: true, ln: change.ln }; } return baseChange; }) })) })) } }; } } if (meta.options.changeTrackingOptions?.modes?.includes("json") && meta.options.changeTrackingOptions && changeStatus === "changed") { try { const previousData = meta.options.changeTrackingOptions.schema ? await extractDataWithSchema(previousMarkdown, meta) : null; const currentData = meta.options.changeTrackingOptions.schema ? await extractDataWithSchema(currentMarkdown, meta) : null; if (previousData && currentData) { document.changeTracking.json = compareExtractedData(previousData.extract, currentData.extract); if (document.metadata.costTracking) { document.metadata.costTracking.otherCallCount += 2; document.metadata.costTracking.otherCost = document.metadata.costTracking.otherCost + previousData.cost + currentData.cost; } else { document.metadata.costTracking = { smartScrapeCallCount: 0, smartScrapeCost: 0, otherCallCount: 2, otherCost: previousData.cost + currentData.cost, totalCost: previousData.cost + currentData.cost } } } else { const { extract } = await generateCompletions({ logger: meta.logger.child({ method: "deriveDiff/generateCompletions", }), options: { mode: "llm", systemPrompt: "Analyze the differences between the previous and current content and provide a structured summary of the changes.", schema: meta.options.changeTrackingOptions.schema, prompt: meta.options.changeTrackingOptions.prompt, temperature: 0 }, markdown: `Previous Content:\n${previousMarkdown}\n\nCurrent Content:\n${currentMarkdown}`, previousWarning: document.warning }); document.changeTracking.json = extract; } } catch (error) { meta.logger.error("Error generating structured diff with LLM", { error }); document.warning = "Structured diff generation failed." + (document.warning ? ` ${document.warning}` : ""); } } } else if (!res.error) { document.changeTracking = { previousScrapeAt: null, changeStatus: document.metadata.statusCode === 404 ? "removed" : "new", visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible", } } else { meta.logger.error("Error fetching previous scrape", { error: res.error }); document.warning = "Comparing failed, please try again later." + (document.warning ? ` ${document.warning}` : ""); } } return document; }