Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders.parsers.pdf import PyPDFParser | |
| from langchain_community.document_loaders.generic import GenericLoader | |
| from langchain_core.document_loaders.blob_loaders import BlobLoader | |
| from io import BytesIO | |
| from starlette.datastructures import UploadFile | |
| from typing import List, Iterable, ByteString | |
| # Ensure this is the correct path for your custom loader | |
| from custon_generic_loader import CustomGenericLoader | |
| from langchain_core.documents import Document | |
| from langchain_community.document_loaders.blob_loaders.schema import Blob | |
| from parser.msword_parser import MsWordParser | |
| from parser.pptx_parser import PptxParser | |
| from parser.xlsx_parser import XlsxParser | |
| from parser.txt_parser import TxtParser | |
| from parser.audio_parser import AudioParser | |
| from parser.video_parser import VideoParser | |
from typing import AsyncIterator  # used by the yield_blobs annotation below


class InMemoryBlobLoader(BlobLoader):
    """Blob loader that exposes one uploaded file as a single ``Blob``.

    The upload's bytes are obtained with one ``read()`` call and wrapped with
    ``Blob.from_data``.

    NOTE(review): ``yield_blobs`` is an *async* generator here; presumably the
    consumer (``CustomGenericLoader``) iterates it with ``async for`` --
    confirm against that loader.
    """

    def __init__(self, upload_file: UploadFile) -> None:
        """Store the upload to read from.

        Args:
            upload_file: The uploaded file whose contents become the blob.
        """
        self.upload_file = upload_file

    async def yield_blobs(self) -> AsyncIterator[Blob]:
        """Yield exactly one blob containing the upload's contents.

        Yields:
            A ``Blob`` whose data is the uploaded bytes, whose ``mime_type`` is
            the upload's content type, and whose metadata carries ``name``,
            ``size`` and ``source`` (``name`` and ``source`` are both the
            upload's filename).
        """
        # Rewind before reading: if something already consumed the upload,
        # read() would otherwise return empty bytes and produce an empty blob.
        await self.upload_file.seek(0)
        data = await self.upload_file.read()

        filename = self.upload_file.filename
        yield Blob.from_data(
            data,
            mime_type=self.upload_file.content_type,
            metadata={
                'name': filename,
                'size': self.upload_file.size,
                'source': filename,
            },
        )
async def load_document(upload_file: UploadFile) -> List[Document]:
    """Parse one uploaded file with the parser that matches its content type.

    The parser is chosen from the upload's media type (PDF, Word, PowerPoint,
    plain text, audio, video or Excel). The upload is then wrapped in an
    ``InMemoryBlobLoader`` and handed to ``CustomGenericLoader``.

    Args:
        upload_file: The uploaded file to parse.

    Returns:
        A one-element list holding whatever ``CustomGenericLoader.load_all()``
        returned for this file.
        NOTE(review): whether that value is a single ``Document`` or a list of
        them depends on ``CustomGenericLoader`` -- confirm that the annotation
        matches.

    Raises:
        ValueError: If the content type is missing or unsupported, or if the
            loader returned nothing for the file.
    """
    # content_type can be absent and may carry parameters such as
    # "text/plain; charset=utf-8"; compare on the bare, lower-cased media type
    # so such uploads are neither rejected nor crash on ``None.startswith``.
    raw_type = upload_file.content_type or ''
    mime_type = raw_type.split(';', 1)[0].strip().lower()

    if mime_type == 'application/pdf':
        blob_parser = PyPDFParser()
        print(f'Loading PDF: {upload_file.filename}')
    elif mime_type in (
        'application/msword',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'application/vnd.openxmlformats-officedocument.thememanager+xml',
    ):
        blob_parser = MsWordParser()
        print(f'Loading Word Document: {upload_file.filename}')
    elif mime_type in (
        'application/vnd.ms-powerpoint',
        'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    ):
        blob_parser = PptxParser()
        print(f'Loading PowerPoint: {upload_file.filename}')
    elif mime_type == 'text/plain':
        blob_parser = TxtParser()
        print(f'Loading Text File: {upload_file.filename}')
    elif mime_type.startswith('audio/'):
        blob_parser = AudioParser()
        print(f'Loading Audio File: {upload_file.filename}')
    elif mime_type.startswith('video/'):
        blob_parser = VideoParser()
        print(f'Loading Video File: {upload_file.filename}')
    # Suggested code may be subject to a license. Learn more: ~LicenseLog:3330720155.
    elif mime_type in (
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
        'application/vnd.ms-excel',
    ):
        blob_parser = XlsxParser()
        print(f'Loading Excel File: {upload_file.filename}')
    else:
        raise ValueError(f"Unsupported file type: {upload_file.content_type}")

    blob_loader = InMemoryBlobLoader(upload_file)
    loader = CustomGenericLoader(blob_loader, blob_parser)
    loaded = await loader.load_all()

    # Check the loader's result itself: a list that always receives one
    # append can never be empty, so testing the wrapper list would be a no-op.
    if not loaded:
        raise ValueError(
            f"No documents were loaded for file: {upload_file.filename}")

    # Keep the original return shape: the result wrapped in a one-element list.
    return [loaded]
async def load_all_documents(upload_files: List[UploadFile]) -> List[List[Document]]:
    """Load every uploaded file in order and merge the results into one list.

    Files are processed sequentially. A file whose load raises ``ValueError``
    is reported on stdout and skipped; any other exception propagates.

    Args:
        upload_files: The uploaded files to load.

    Returns:
        The items returned by ``load_document`` for each successful file,
        concatenated in input order.

    Raises:
        ValueError: If no file produced any result.
    """
    collected = []
    for current_file in upload_files:
        try:
            loaded = await load_document(current_file)
        except ValueError as err:
            # Best effort: report the failing file and move on to the next.
            print(f"Error loading {current_file.filename}: {err}")
        else:
            collected.extend(loaded)

    if collected:
        return collected
    raise ValueError("No documents were loaded from the provided files.")
| # Example usage: | |
| # Note: You would typically run this inside an async function or an async event loop. | |
| # Example: | |
| # upload_files = [UploadFile1, UploadFile2, ...] | |
| # documents = await load_all_documents(upload_files) | |