const fs = require("fs");
const path = require("path");
const { v5: uuidv5 } = require("uuid");
const { Document } = require("../../models/documents");
const { DocumentSyncQueue } = require("../../models/documentSyncQueue");
// Root folder of parsed source documents (JSON files grouped in subfolders).
// Development resolves into the repo storage dir; production uses STORAGE_DIR.
const documentsPath =
  process.env.NODE_ENV === "development"
    ? path.resolve(__dirname, `../../storage/documents`)
    : path.resolve(process.env.STORAGE_DIR, `documents`);
// Folder for direct uploads — presumably files uploaded outside the normal
// document pipeline (not referenced in this chunk; confirm usage elsewhere).
const directUploadsPath =
  process.env.NODE_ENV === "development"
    ? path.resolve(__dirname, `../../storage/direct-uploads`)
    : path.resolve(process.env.STORAGE_DIR, `direct-uploads`);
// Folder caching embedding results as `${uuidv5(filename)}.json` files.
const vectorCachePath =
  process.env.NODE_ENV === "development"
    ? path.resolve(__dirname, `../../storage/vector-cache`)
    : path.resolve(process.env.STORAGE_DIR, `vector-cache`);
| |
|
| | |
| | |
/**
 * Loads and parses a stored document JSON file from the documents folder.
 * @param {string|null} filePath - folder-relative path of the document file.
 * @returns {Promise<object|null>} parsed document, or null when the file is
 * missing or resolves outside the documents folder.
 * @throws {Error} when no path is provided.
 */
async function fileData(filePath = null) {
  if (!filePath) throw new Error("No docPath provided in request");
  const resolvedPath = path.resolve(documentsPath, normalizePath(filePath));

  // Refuse paths that escape the documents root or do not exist on disk.
  const readable =
    fs.existsSync(resolvedPath) && isWithin(documentsPath, resolvedPath);
  if (!readable) return null;

  return JSON.parse(fs.readFileSync(resolvedPath, "utf8"));
}
| |
|
/**
 * Builds the full file-picker tree of stored documents.
 * Scans each subfolder of the documents root, converts every `.json`
 * document into picker data, drops items missing required metadata, and
 * annotates each with its pinned workspaces and watched status.
 * @returns {Promise<{name: string, type: string, items: object[]}>} root
 * folder node; "custom-documents" is always listed first when present.
 */
async function viewLocalFiles() {
  if (!fs.existsSync(documentsPath)) fs.mkdirSync(documentsPath);
  const liveSyncAvailable = await DocumentSyncQueue.enabled();
  const directory = {
    name: "documents",
    type: "folder",
    items: [],
  };

  for (const file of fs.readdirSync(documentsPath)) {
    if (path.extname(file) === ".md") continue;
    const folderPath = path.resolve(documentsPath, file);
    if (!fs.lstatSync(folderPath).isDirectory()) continue;

    const subdocs = {
      name: file,
      type: "folder",
      items: [],
    };

    // Map of "folder/file" cache keys -> raw filename, reused by the
    // pinned/watched lookups below.
    const filenames = {};
    const filePromises = [];
    for (const subfile of fs.readdirSync(folderPath)) {
      if (path.extname(subfile) !== ".json") continue;
      const cachefilename = `${file}/${subfile}`;
      filePromises.push(
        fileToPickerData({
          pathToFile: path.join(folderPath, subfile),
          liveSyncAvailable,
          cachefilename,
        })
      );
      filenames[cachefilename] = subfile;
    }

    // Drop unparsable files (null) and files missing required metadata.
    const results = (await Promise.all(filePromises))
      .filter((i) => !!i)
      .filter((i) => hasRequiredMetadata(i));
    subdocs.items.push(...results);

    const pinnedWorkspacesByDocument =
      await getPinnedWorkspacesByDocument(filenames);
    const watchedDocumentsFilenames =
      await getWatchedDocumentFilenames(filenames);
    for (const item of subdocs.items) {
      item.pinnedWorkspaces = pinnedWorkspacesByDocument[item.name] || [];
      // Safe-call form for consistency with getDocumentsByFolder; document
      // JSON could otherwise shadow `hasOwnProperty`.
      item.watched = Object.prototype.hasOwnProperty.call(
        watchedDocumentsFilenames,
        item.name
      );
    }

    directory.items.push(subdocs);
  }

  // Make sure "custom-documents" is always the first folder in picker.
  directory.items = [
    directory.items.find((folder) => folder.name === "custom-documents"),
    ...directory.items.filter((folder) => folder.name !== "custom-documents"),
  ].filter((i) => !!i);

  return directory;
}
| |
|
| | |
| | |
| | |
| | |
| | |
/**
 * Lists all documents inside a single named folder of the documents root,
 * annotated with cache status, pinned workspaces, and watched status.
 * @param {string} folderName - folder to enumerate.
 * @returns {Promise<{folder: string, documents: object[], code: number, error: string|null}>}
 * HTTP-style result: 400 for a missing name, 404 for an invalid folder.
 */
async function getDocumentsByFolder(folderName = "") {
  if (!folderName) {
    return {
      folder: folderName,
      documents: [],
      code: 400,
      error: "Folder name must be provided.",
    };
  }

  const folderPath = path.resolve(documentsPath, normalizePath(folderName));
  // Must stay inside the documents root, exist, and actually be a directory.
  const validFolder =
    isWithin(documentsPath, folderPath) &&
    fs.existsSync(folderPath) &&
    fs.lstatSync(folderPath).isDirectory();
  if (!validFolder) {
    return {
      folder: folderName,
      documents: [],
      code: 404,
      error: `Folder "${folderName}" does not exist.`,
    };
  }

  const documents = [];
  const filenames = {};
  for (const entry of fs.readdirSync(folderPath)) {
    if (path.extname(entry) !== ".json") continue;
    const rawData = fs.readFileSync(path.join(folderPath, entry), "utf8");
    const cachefilename = `${folderName}/${entry}`;
    // Strip pageContent — only metadata is returned to callers.
    const { pageContent: _omitted, ...docMetadata } = JSON.parse(rawData);
    documents.push({
      name: entry,
      type: "file",
      ...docMetadata,
      cached: await cachedVectorInformation(cachefilename, true),
    });
    filenames[cachefilename] = entry;
  }

  const pinnedByDoc = await getPinnedWorkspacesByDocument(filenames);
  const watchedByDoc = await getWatchedDocumentFilenames(filenames);
  for (const doc of documents) {
    doc.pinnedWorkspaces = pinnedByDoc[doc.name] || [];
    doc.watched = Object.prototype.hasOwnProperty.call(watchedByDoc, doc.name);
  }

  return { folder: folderName, documents, code: 200, error: null };
}
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
/**
 * Looks up cached vectorized chunks for a document in the vector-cache.
 * @param {string|null} filename - cache key ("folder/file" style) for the document.
 * @param {boolean} checkOnly - when true, only report existence.
 * @returns {Promise<boolean|{exists: boolean, chunks: object[]}>} boolean when
 * checkOnly, otherwise the cached chunks (empty when no cache exists).
 */
async function cachedVectorInformation(filename = null, checkOnly = false) {
  if (!filename) return checkOnly ? false : { exists: false, chunks: [] };

  // Cache files are named by the uuidv5 digest of the document's cache key.
  const digest = uuidv5(filename, uuidv5.URL);
  const file = path.resolve(vectorCachePath, `${digest}.json`);
  const exists = fs.existsSync(file);

  if (checkOnly) return exists;
  if (!exists) return { exists, chunks: [] };

  // Fixed broken interpolation: log the actual filename, not "$(unknown)".
  console.log(
    `Cached vectorized results of ${filename} found! Using cached data to save on embed costs.`
  );
  const rawData = fs.readFileSync(file, "utf8");
  return { exists: true, chunks: JSON.parse(rawData) };
}
| |
|
| | |
| | |
/**
 * Persists vectorized chunks for a document to the vector-cache so future
 * embeds of the same file are free.
 * @param {object[]} vectorData - chunks to cache.
 * @param {string|null} filename - cache key for the document; no-op when absent.
 * @returns {Promise<void>}
 */
async function storeVectorResult(vectorData = [], filename = null) {
  if (!filename) return;
  // Fixed broken interpolation: log the actual filename, not "$(unknown)".
  console.log(
    `Caching vectorized results of ${filename} to prevent duplicated embedding.`
  );
  if (!fs.existsSync(vectorCachePath)) fs.mkdirSync(vectorCachePath);

  const digest = uuidv5(filename, uuidv5.URL);
  const writeTo = path.resolve(vectorCachePath, `${digest}.json`);
  fs.writeFileSync(writeTo, JSON.stringify(vectorData), "utf8");
  return;
}
| |
|
| | |
/**
 * Deletes a source document file from the documents folder.
 * Silently no-ops when the file is missing, escapes the documents root,
 * or is not a regular file.
 * @param {string|null} filename - folder-relative path of the document.
 * @returns {Promise<void>}
 */
async function purgeSourceDocument(filename = null) {
  if (!filename) return;
  const filePath = path.resolve(documentsPath, normalizePath(filename));

  if (
    !fs.existsSync(filePath) ||
    !isWithin(documentsPath, filePath) ||
    !fs.lstatSync(filePath).isFile()
  )
    return;

  // Fixed broken interpolation: log the actual filename, not "$(unknown)".
  console.log(`Purging source document of ${filename}.`);
  fs.rmSync(filePath);
  return;
}
| |
|
| | |
/**
 * Deletes the cached vector results for a single document, if present.
 * @param {string|null} filename - cache key for the document; no-op when absent.
 * @returns {Promise<void>}
 */
async function purgeVectorCache(filename = null) {
  if (!filename) return;
  const digest = uuidv5(filename, uuidv5.URL);
  const filePath = path.resolve(vectorCachePath, `${digest}.json`);

  if (!fs.existsSync(filePath) || !fs.lstatSync(filePath).isFile()) return;
  // Fixed broken interpolation: log the actual filename, not "$(unknown)".
  console.log(`Purging vector-cache of ${filename}.`);
  fs.rmSync(filePath);
  return;
}
| |
|
| | |
| | |
/**
 * Searches every folder under the documents root for a document with the
 * given filename and returns its picker metadata (without pageContent).
 * @param {string|null} documentName - filename to locate.
 * @returns {Promise<object|null>} document info or null when not found.
 */
async function findDocumentInDocuments(documentName = null) {
  if (!documentName) return null;

  for (const folder of fs.readdirSync(documentsPath)) {
    const folderLocation = path.join(documentsPath, folder);
    if (!fs.lstatSync(folderLocation).isDirectory()) continue;

    const targetFilename = normalizePath(documentName);
    const candidate = path.join(documentsPath, folder, targetFilename);

    // Skip folders where the file is absent or resolves outside the root.
    const found =
      fs.existsSync(candidate) && isWithin(documentsPath, candidate);
    if (!found) continue;

    const rawData = fs.readFileSync(candidate, "utf8");
    const cachefilename = `${folder}/${targetFilename}`;
    // pageContent is intentionally dropped from the returned metadata.
    const { pageContent, ...metadata } = JSON.parse(rawData);
    return {
      name: targetFilename,
      type: "file",
      ...metadata,
      cached: await cachedVectorInformation(cachefilename, true),
    };
  }

  return null;
}
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
/**
 * Returns true only when `inner` is a strict descendant of `outer`.
 * A path is never considered within itself.
 * @param {string} outer - containing directory path.
 * @param {string} inner - path to validate.
 * @returns {boolean}
 */
function isWithin(outer, inner) {
  if (outer === inner) return false;
  const relativePart = path.relative(outer, inner);
  if (relativePart === "..") return false;
  return !relativePart.startsWith("../");
}
| |
|
/**
 * Normalizes a user-supplied relative path and strips any leading "../"
 * traversal segments so it cannot climb out of its base directory.
 * @param {string} filepath - raw path from a request.
 * @returns {string} sanitized relative path.
 * @throws {Error} when the result is exactly "..", ".", or "/".
 */
function normalizePath(filepath = "") {
  const sanitized = path
    .normalize(filepath.trim())
    .replace(/^(\.\.(\/|\\|$))+/, "")
    .trim();
  const forbidden = ["..", ".", "/"];
  if (forbidden.includes(sanitized)) throw new Error("Invalid path.");
  return sanitized;
}
| |
|
| | |
| | |
| | |
/**
 * Reports whether any cached vector result files exist on disk.
 * @returns {boolean} false when the cache folder is missing, unreadable,
 * or contains no .json files.
 */
function hasVectorCachedFiles() {
  try {
    const entries = fs.readdirSync(vectorCachePath);
    return entries.some((name) => name.endsWith(".json"));
  } catch {}
  return false;
}
| |
|
| | |
| | |
| | |
| | |
/**
 * Maps each document's display filename to the ids of workspaces where it
 * is pinned.
 * @param {Object<string,string>} filenames - map of docpath -> display filename.
 * (Default fixed from `[]` to `{}` — the value is consumed as a map, never an array.)
 * @returns {Promise<Object<string,Array>>} filename -> unique workspaceIds.
 */
async function getPinnedWorkspacesByDocument(filenames = {}) {
  const pinnedDocs = await Document.where(
    {
      docpath: {
        in: Object.keys(filenames),
      },
      pinned: true,
    },
    null,
    null,
    null,
    {
      workspaceId: true,
      docpath: true,
    }
  );
  return pinnedDocs.reduce((result, { workspaceId, docpath }) => {
    const filename = filenames[docpath];
    if (!result[filename]) result[filename] = [];
    // Dedupe: a workspace appears at most once per document.
    if (!result[filename].includes(workspaceId))
      result[filename].push(workspaceId);
    return result;
  }, {});
}
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
/**
 * Maps each watched document's display filename to a workspaceId watching it.
 * If several workspaces watch the same document the last row wins; callers
 * in this file only test key membership, not the value.
 * @param {Object<string,string>} filenames - map of docpath -> display filename.
 * (Default fixed from `[]` to `{}` — the value is consumed as a map, never an array.)
 * @returns {Promise<Object<string,*>>} filename -> workspaceId.
 */
async function getWatchedDocumentFilenames(filenames = {}) {
  const watchedDocs = await Document.where(
    {
      docpath: { in: Object.keys(filenames) },
      watched: true,
    },
    null,
    null,
    null,
    { workspaceId: true, docpath: true }
  );
  return watchedDocs.reduce((result, { workspaceId, docpath }) => {
    result[filenames[docpath]] = workspaceId;
    return result;
  }, {});
}
| |
|
| | |
| | |
| | |
| | |
/**
 * Deletes the entire vector-cache folder and recreates it empty.
 * @returns {void}
 */
function purgeEntireVectorCache() {
  fs.rmSync(vectorCachePath, { recursive: true, force: true });
  // recursive:true — rmSync(force:true) succeeds even when the parent
  // storage dir is missing, in which case a plain mkdirSync would throw.
  fs.mkdirSync(vectorCachePath, { recursive: true });
  return;
}
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
// Max file size (150 MB) read fully into memory when building picker data;
// larger files take the stream path in fileToPickerData.
const FILE_READ_SIZE_THRESHOLD = 150 * (1024 * 1024);
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
/**
 * Converts a stored document JSON file into the item shape the frontend
 * file picker expects. Files under FILE_READ_SIZE_THRESHOLD bytes are read
 * synchronously; larger ones are read via stream.
 * @param {object} params
 * @param {string} params.pathToFile - absolute path to the document JSON file.
 * @param {boolean} params.liveSyncAvailable - whether document live-sync is enabled.
 * @param {string|null} params.cachefilename - "folder/file" key for vector-cache lookup.
 * @returns {Promise<object|null>} picker item, or null when the file cannot be parsed.
 */
async function fileToPickerData({
  pathToFile,
  liveSyncAvailable = false,
  cachefilename = null,
}) {
  let metadata = {};
  const filename = path.basename(pathToFile);
  const fileStats = fs.statSync(pathToFile);
  const cachedStatus = await cachedVectorInformation(cachefilename, true);

  if (fileStats.size < FILE_READ_SIZE_THRESHOLD) {
    const rawData = fs.readFileSync(pathToFile, "utf8");
    try {
      metadata = JSON.parse(rawData);
      // pageContent is the full document body — never sent to the picker.
      delete metadata.pageContent;
    } catch (err) {
      console.error("Error parsing file", err);
      return null;
    }

    return {
      name: filename,
      type: "file",
      ...metadata,
      cached: cachedStatus,
      canWatch: liveSyncAvailable
        ? DocumentSyncQueue.canWatch(metadata)
        : false,
    };
  }

  console.log(
    `Stream-parsing ${path.basename(pathToFile)} because it exceeds the ${FILE_READ_SIZE_THRESHOLD} byte limit.`
  );
  const stream = fs.createReadStream(pathToFile, { encoding: "utf8" });
  try {
    let fileContent = "";
    metadata = await new Promise((resolve, reject) => {
      stream
        .on("data", (chunk) => {
          fileContent += chunk;
        })
        .on("end", () => {
          // Bug fix: JSON.parse can throw here, and an uncaught throw inside
          // a stream event handler crashes the process. Trap it and reject.
          try {
            const parsed = JSON.parse(fileContent);
            delete parsed.pageContent;
            resolve(parsed);
          } catch (err) {
            // Bug fix: reject with the real error (was reject(null)).
            reject(err);
          }
        })
        .on("error", (err) => reject(err));
    }).catch((err) => {
      // Any read/parse failure lands here; metadata becomes undefined and
      // the guard below returns null.
      console.error("Error parsing file", err);
    });
  } catch (err) {
    console.error("Error parsing file", err);
    metadata = null;
  } finally {
    stream.destroy();
  }

  if (!metadata || !Object.keys(metadata)?.length) {
    console.log(`Stream-parsing failed for ${path.basename(pathToFile)}`);
    return null;
  }

  return {
    name: filename,
    type: "file",
    ...metadata,
    cached: cachedStatus,
    canWatch: liveSyncAvailable ? DocumentSyncQueue.canWatch(metadata) : false,
  };
}
| |
|
// Fields every picker file object must carry to be shown to the frontend.
const REQUIRED_FILE_OBJECT_FIELDS = [
  "name",
  "type",
  "url",
  "title",
  "docAuthor",
  "description",
  "docSource",
  "chunkSource",
  "published",
  "wordCount",
  "token_count_estimate",
];

/**
 * Checks that a parsed document object carries every required picker field.
 * Uses the safe hasOwnProperty call form (consistent with the rest of the
 * file): parsed JSON may shadow `hasOwnProperty`, and null-prototype objects
 * have no such method at all.
 * @param {object} metadata - parsed document object (pageContent excluded).
 * @returns {boolean} true when all required fields are own properties.
 */
function hasRequiredMetadata(metadata = {}) {
  return REQUIRED_FILE_OBJECT_FIELDS.every((field) =>
    Object.prototype.hasOwnProperty.call(metadata, field)
  );
}
| |
|
// Public API: document storage lookups, source-document and vector-cache
// purging, path sanitization helpers, and the storage path constants.
module.exports = {
  findDocumentInDocuments,
  cachedVectorInformation,
  viewLocalFiles,
  purgeSourceDocument,
  purgeVectorCache,
  storeVectorResult,
  fileData,
  normalizePath,
  isWithin,
  documentsPath,
  directUploadsPath,
  hasVectorCachedFiles,
  purgeEntireVectorCache,
  getDocumentsByFolder,
};
| |
|