Spaces:
Paused
Paused
| import { supabase_service } from "../../../services/supabase"; | |
| import { Document } from "../../../controllers/v1/types"; | |
| import { Meta } from "../index"; | |
| import { getJob } from "../../../controllers/v1/crawl-status"; | |
| import gitDiff from 'git-diff'; | |
| import parseDiff from 'parse-diff'; | |
| import { generateCompletions } from "./llmExtract"; | |
/**
 * Runs a schema-guided LLM extraction over a piece of markdown.
 *
 * The schema is read from `meta.options.changeTrackingOptions?.schema`; the
 * request uses mode "llm" with temperature 0.
 *
 * @param content - Markdown to extract from.
 * @param meta - Scrape context supplying the logger and the change-tracking options.
 * @returns The extracted value together with the cost reported by
 *   `generateCompletions`, or `null` when anything in the extraction throws
 *   (the error is logged, never rethrown).
 */
async function extractDataWithSchema(content: string, meta: Meta): Promise<{ extract: any, cost: number } | null> {
  try {
    // Everything, including building the child logger, stays inside the try
    // so that any failure degrades to `null` instead of propagating.
    const completion = await generateCompletions({
      logger: meta.logger.child({
        method: "extractDataWithSchema/generateCompletions",
      }),
      options: {
        mode: "llm",
        schema: meta.options.changeTrackingOptions?.schema,
        systemPrompt: "Extract the requested information from the content based on the provided schema.",
        temperature: 0
      },
      markdown: content
    });

    return { extract: completion.extract, cost: completion.cost };
  } catch (error) {
    meta.logger.error("Error extracting data with schema", { error });
    return null;
  }
}
/**
 * Returns a copy of `value` in which every plain object (at any depth) has its
 * keys in sorted order, so that serialising it no longer depends on the order
 * in which properties happened to be inserted.
 *
 * Arrays keep their element order. Non-plain objects (anything whose prototype
 * is neither `Object.prototype` nor `null`) are returned untouched so that
 * `JSON.stringify` treats them exactly as it would have before.
 */
function canonicalizeForComparison(value: unknown): unknown {
  if (Array.isArray(value)) {
    return value.map(canonicalizeForComparison);
  }
  if (value !== null && typeof value === "object") {
    const proto = Object.getPrototypeOf(value);
    if (proto === Object.prototype || proto === null) {
      const source = value as Record<string, unknown>;
      // Object.fromEntries defines own data properties, so a literal
      // "__proto__" key coming from parsed JSON is preserved as data.
      return Object.fromEntries(
        Object.keys(source)
          .sort()
          .map((key): [string, unknown] => [key, canonicalizeForComparison(source[key])])
      );
    }
  }
  return value;
}

/**
 * Compares two extracted data objects key by key (top level only).
 *
 * For every top-level key present in either input, the two values are
 * serialised with `JSON.stringify` after normalising nested object key order;
 * keys whose serialisations differ are reported. Because of the
 * normalisation, `{ a: 1, b: 2 }` and `{ b: 2, a: 1 }` are considered equal,
 * while arrays with the same elements in a different order are still
 * considered different.
 *
 * @param previousData - Data extracted from the previous scrape; `null`/`undefined` is treated as having no keys.
 * @param currentData - Data extracted from the current scrape; `null`/`undefined` is treated as having no keys.
 * @returns An object mapping each changed key to `{ previous, current }`
 *   (the original, un-normalised values). Empty when nothing differs.
 */
function compareExtractedData(previousData: any, currentData: any): any {
  const result: Record<string, { previous: any, current: any }> = {};

  const allKeys = new Set([
    ...Object.keys(previousData || {}),
    ...Object.keys(currentData || {})
  ]);

  for (const key of allKeys) {
    const oldValue = previousData?.[key];
    const newValue = currentData?.[key];

    const oldSerialized = JSON.stringify(canonicalizeForComparison(oldValue));
    const newSerialized = JSON.stringify(canonicalizeForComparison(newValue));

    if (oldSerialized !== newSerialized) {
      result[key] = {
        previous: oldValue,
        current: newValue
      };
    }
  }

  return result;
}
/**
 * Adds the cost of change-tracking LLM calls to the document's cost tracking.
 *
 * When the document already carries a `costTracking` object its "other"
 * counters and its `totalCost` are increased; otherwise a fresh object is
 * created with zeroed smart-scrape counters. Both paths include the cost in
 * `totalCost` so the two stay consistent.
 */
function recordChangeTrackingLlmCost(document: Document, callCount: number, cost: number): void {
  const tracking = document.metadata.costTracking;
  if (tracking) {
    tracking.otherCallCount += callCount;
    tracking.otherCost += cost;
    tracking.totalCost += cost;
    return;
  }

  document.metadata.costTracking = {
    smartScrapeCallCount: 0,
    smartScrapeCost: 0,
    otherCallCount: callCount,
    otherCost: cost,
    totalCost: cost
  };
}

/**
 * Populates `document.changeTracking` by comparing the current scrape with the
 * most recent previous scrape of the same URL for the same team.
 *
 * Does nothing unless `meta.options.formats` includes "changeTracking".
 * Otherwise:
 *  - looks up the previous scrape via the `diff_get_last_scrape_3` RPC and
 *    loads its job with `getJob`;
 *  - if a previous result exists, sets `changeStatus` to "removed" (current
 *    status code is 404), "changed", or "same", and, when the content changed,
 *    optionally attaches a git-style diff ("git-diff" mode) and/or a structured
 *    JSON comparison ("json" mode);
 *  - if no previous result exists and the lookup did not error, marks the
 *    document as "new" (or "removed" on 404);
 *  - if the lookup errored, logs it and prepends a warning to `document.warning`.
 *
 * @param meta - Scrape context (options, internal options, logger, url).
 * @param document - The freshly scraped document; mutated in place.
 * @returns The same `document` instance.
 */
export async function deriveDiff(meta: Meta, document: Document): Promise<Document> {
  if (!meta.options.formats.includes("changeTracking")) {
    return document;
  }

  const res = await supabase_service
    .rpc("diff_get_last_scrape_3", {
      i_team_id: meta.internalOptions.teamId,
      i_url: document.metadata.sourceURL ?? meta.url,
    });

  // Only the first returned row is used.
  const data: {
    o_job_id: string,
    o_date_added: string,
  } | undefined | null = (res.data ?? [])[0] as any;

  const job: {
    returnvalue: Document,
  } | null = data?.o_job_id ? await getJob(data.o_job_id) : null;

  if (!data || !job || !job.returnvalue) {
    if (res.error) {
      meta.logger.error("Error fetching previous scrape", { error: res.error });
      document.warning = "Comparing failed, please try again later." + (document.warning ? ` ${document.warning}` : "");
    } else {
      // No usable previous scrape: first time this URL is seen (or it is already gone).
      document.changeTracking = {
        previousScrapeAt: null,
        changeStatus: document.metadata.statusCode === 404 ? "removed" : "new",
        visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible",
      };
    }
    return document;
  }

  // NOTE(review): both markdown values are asserted non-null; if either scrape
  // has no markdown the normalisation below throws. Confirm that the
  // changeTracking format always implies markdown is present.
  const previousMarkdown = job.returnvalue.markdown!;
  const currentMarkdown = document.markdown!;

  // Change detection deliberately ignores whitespace, `[iframe](...)` links and
  // character order: whitespace is stripped first, then iframe links, and the
  // remaining characters are sorted before comparison.
  const transformer = (x: string) => [...x.replace(/\s+/g, "").replace(/\[iframe\]\(.+?\)/g, "")].sort().join("");
  const isChanged = transformer(previousMarkdown) !== transformer(currentMarkdown);
  const changeStatus = document.metadata.statusCode === 404 ? "removed" : isChanged ? "changed" : "same";

  document.changeTracking = {
    previousScrapeAt: data.o_date_added,
    changeStatus,
    visibility: meta.internalOptions.urlInvisibleInCurrentCrawl ? "hidden" : "visible",
  };

  // Detailed diffs are only produced when the content actually changed.
  if (changeStatus !== "changed") {
    return document;
  }

  const trackingOptions = meta.options.changeTrackingOptions;

  if (trackingOptions?.modes?.includes("git-diff")) {
    const diffText = gitDiff(previousMarkdown, currentMarkdown, {
      color: false,
      wordDiff: false
    });

    meta.logger.debug("Diff text", { diffText });

    if (diffText) {
      const diffStructured = parseDiff(diffText);
      meta.logger.debug("Diff structured", { diffStructured });

      document.changeTracking.diff = {
        text: diffText,
        json: {
          files: diffStructured.map(file => ({
            from: file.from || null,
            to: file.to || null,
            chunks: file.chunks.map(chunk => ({
              content: chunk.content,
              changes: chunk.changes.map(change => {
                const baseChange = {
                  type: change.type,
                  content: change.content
                };

                // Copy only the line-number fields that exist for each change kind.
                if (change.type === 'normal' && 'ln1' in change && 'ln2' in change) {
                  return {
                    ...baseChange,
                    normal: true,
                    ln1: change.ln1,
                    ln2: change.ln2
                  };
                } else if (change.type === 'add' && 'ln' in change) {
                  return {
                    ...baseChange,
                    add: true,
                    ln: change.ln
                  };
                } else if (change.type === 'del' && 'ln' in change) {
                  return {
                    ...baseChange,
                    del: true,
                    ln: change.ln
                  };
                }

                return baseChange;
              })
            }))
          }))
        }
      };
    }
  }

  if (trackingOptions && trackingOptions.modes?.includes("json")) {
    try {
      let previousData: { extract: any, cost: number } | null = null;
      let currentData: { extract: any, cost: number } | null = null;

      if (trackingOptions.schema) {
        // The two extractions are independent and extractDataWithSchema never
        // rejects (it returns null on failure), so they can run concurrently.
        [previousData, currentData] = await Promise.all([
          extractDataWithSchema(previousMarkdown, meta),
          extractDataWithSchema(currentMarkdown, meta),
        ]);
      }

      if (previousData && currentData) {
        document.changeTracking.json = compareExtractedData(previousData.extract, currentData.extract);
        recordChangeTrackingLlmCost(document, 2, previousData.cost + currentData.cost);
      } else {
        // Reached when no schema was supplied or when either extraction failed:
        // ask the LLM to summarise the differences directly.
        // NOTE(review): the cost of this call (and of a single successful
        // extraction above) is not added to costTracking — confirm whether it should be.
        const { extract } = await generateCompletions({
          logger: meta.logger.child({
            method: "deriveDiff/generateCompletions",
          }),
          options: {
            mode: "llm",
            systemPrompt: "Analyze the differences between the previous and current content and provide a structured summary of the changes.",
            schema: trackingOptions.schema,
            prompt: trackingOptions.prompt,
            temperature: 0
          },
          markdown: `Previous Content:\n${previousMarkdown}\n\nCurrent Content:\n${currentMarkdown}`,
          previousWarning: document.warning
        });

        document.changeTracking.json = extract;
      }
    } catch (error) {
      // Structured diffing is best-effort: keep the basic change status and surface a warning.
      meta.logger.error("Error generating structured diff with LLM", { error });
      document.warning = "Structured diff generation failed." + (document.warning ? ` ${document.warning}` : "");
    }
  }

  return document;
}