Spaces:
Paused
Paused
| import express from 'express'; | |
| import logger from '../utils/logger'; | |
| import multer from 'multer'; | |
| import path from 'path'; | |
| import crypto from 'crypto'; | |
| import fs from 'fs'; | |
| import { Embeddings } from '@langchain/core/embeddings'; | |
| import { getAvailableEmbeddingModelProviders } from '../lib/providers'; | |
| import { PDFLoader } from '@langchain/community/document_loaders/fs/pdf'; | |
| import { DocxLoader } from '@langchain/community/document_loaders/fs/docx'; | |
| import { RecursiveCharacterTextSplitter } from '@langchain/textsplitters'; | |
| import { Document } from 'langchain/document'; | |
// Router that the upload endpoint in this module is attached to.
const router = express.Router();

// Chunking parameters handed to the text splitter used for every upload.
const CHUNK_SIZE = 500;
const CHUNK_OVERLAP = 100;

// Shared splitter instance; it is reused for every uploaded document.
const splitter = new RecursiveCharacterTextSplitter({
  chunkSize: CHUNK_SIZE,
  chunkOverlap: CHUNK_OVERLAP,
});
/** File extensions (lower-case, without the dot) that may be uploaded. */
const SUPPORTED_EXTENSIONS = ['pdf', 'docx', 'txt'];

/**
 * Multer disk storage for uploads.
 *
 * Files are written to `<cwd>/uploads` under a random 16-byte hex name that
 * keeps the (lower-cased) extension of the original file name. Files whose
 * extension is not in SUPPORTED_EXTENSIONS are rejected with an error.
 */
const storage = multer.diskStorage({
  destination: (req, file, cb) => {
    const uploadDir = path.join(process.cwd(), './uploads');
    // Multer does not create the directory when `destination` is a function,
    // so make sure it exists before handing it back.
    fs.mkdir(uploadDir, { recursive: true }, (err) => {
      cb(err, uploadDir);
    });
  },
  filename: (req, file, cb) => {
    // Compare case-insensitively so that e.g. "REPORT.PDF" is accepted.
    const fileExtension = (
      file.originalname.split('.').pop() ?? ''
    ).toLowerCase();
    if (!SUPPORTED_EXTENSIONS.includes(fileExtension)) {
      return cb(new Error('File type is not supported'), '');
    }
    cb(null, `${crypto.randomBytes(16).toString('hex')}.${fileExtension}`);
  },
});

/** Multer instance that persists uploads using the disk storage above. */
const upload = multer({ storage });
/** Loader kinds this route knows how to read. */
type UploadKind = 'pdf' | 'docx' | 'txt';

/** Maps the MIME type declared for an upload to the loader kind. */
const KIND_BY_MIME_TYPE = new Map<string, UploadKind>([
  ['application/pdf', 'pdf'],
  [
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'docx',
  ],
  ['text/plain', 'txt'],
]);

/** Maps the extension of the stored file name to the loader kind. */
const KIND_BY_EXTENSION = new Map<string, UploadKind>([
  ['pdf', 'pdf'],
  ['docx', 'docx'],
  ['txt', 'txt'],
]);

/** Matches the trailing `.ext` of a stored upload path or file name. */
const FILE_EXTENSION_PATTERN = /\.\w+$/;

/**
 * Picks the loader kind for an uploaded file. The declared MIME type wins;
 * when it is not one of the known types, the extension of the stored file
 * name is used instead so such a file is not silently treated as empty.
 */
const resolveUploadKind = (
  file: Express.Multer.File,
): UploadKind | undefined =>
  KIND_BY_MIME_TYPE.get(file.mimetype) ??
  KIND_BY_EXTENSION.get(path.extname(file.filename).slice(1).toLowerCase());

/**
 * Reads an uploaded file into documents. Returns an empty list when the
 * file kind cannot be determined.
 */
const loadDocuments = async (
  file: Express.Multer.File,
): Promise<Document[]> => {
  switch (resolveUploadKind(file)) {
    case 'pdf':
      return new PDFLoader(file.path).load();
    case 'docx':
      return new DocxLoader(file.path).load();
    case 'txt': {
      // Async read so a large text file does not block the event loop.
      const text = await fs.promises.readFile(file.path, 'utf-8');
      return [
        new Document({
          pageContent: text,
          metadata: {
            title: file.originalname,
          },
        }),
      ];
    }
    default:
      return [];
  }
};

/**
 * Splits one uploaded file into chunks, embeds them, and writes two JSON
 * files next to the upload: `<id>-extracted.json` (title + chunk texts) and
 * `<id>-embeddings.json` (title + embedding vectors).
 */
const processUpload = async (
  file: Express.Multer.File,
  embeddingsModel: Embeddings,
): Promise<void> => {
  const docs = await loadDocuments(file);
  const splitted = await splitter.splitDocuments(docs);
  const contents = splitted.map((doc) => doc.pageContent);

  const pathToSave = file.path.replace(
    FILE_EXTENSION_PATTERN,
    '-extracted.json',
  );
  await fs.promises.writeFile(
    pathToSave,
    JSON.stringify({
      title: file.originalname,
      contents,
    }),
  );

  const embeddings = await embeddingsModel.embedDocuments(contents);
  const pathToSaveEmbeddings = file.path.replace(
    FILE_EXTENSION_PATTERN,
    '-embeddings.json',
  );
  await fs.promises.writeFile(
    pathToSaveEmbeddings,
    JSON.stringify({
      title: file.originalname,
      embeddings: embeddings,
    }),
  );
};

/**
 * POST / — upload one or more files and pre-compute their embeddings.
 *
 * Expects multipart form data with `files` plus the text fields
 * `embedding_model` and `embedding_model_provider`.
 *
 * Responses:
 * - 200 with `{ files: [{ fileName, fileExtension, fileId }] }` on success.
 * - 400 when the model/provider fields are missing, do not resolve to an
 *   available embedding model, or no files were uploaded.
 * - 500 when processing throws.
 */
router.post(
  '/',
  upload.fields([
    { name: 'files' },
    { name: 'embedding_model', maxCount: 1 },
    { name: 'embedding_model_provider', maxCount: 1 },
  ]),
  async (req, res) => {
    try {
      // `req.body` may be absent when the request was not parsed at all.
      const { embedding_model, embedding_model_provider } = req.body ?? {};
      if (!embedding_model || !embedding_model_provider) {
        res
          .status(400)
          .json({ message: 'Missing embedding model or provider' });
        return;
      }

      const embeddingModels = await getAvailableEmbeddingModelProviders();
      const embeddingsModel = embeddingModels[embedding_model_provider]?.[
        embedding_model
      ]?.model as Embeddings | undefined;
      if (!embeddingsModel) {
        res.status(400).json({ message: 'Invalid embedding model selected' });
        return;
      }

      // `req.files` is undefined for non-multipart requests; treat that the
      // same as "no files" instead of throwing and answering 500.
      const uploaded = req.files;
      const files =
        uploaded && !Array.isArray(uploaded) ? uploaded['files'] : undefined;
      if (!files || files.length === 0) {
        res.status(400).json({ message: 'No files uploaded' });
        return;
      }

      await Promise.all(
        files.map((file) => processUpload(file, embeddingsModel)),
      );

      res.status(200).json({
        files: files.map((file) => {
          return {
            fileName: file.originalname,
            fileExtension: file.filename.split('.').pop(),
            fileId: file.filename.replace(FILE_EXTENSION_PATTERN, ''),
          };
        }),
      });
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      logger.error(`Error in uploading file results: ${reason}`);
      res.status(500).json({ message: 'An error has occurred.' });
    }
  },
);

export default router;