Spaces:
Sleeping
Sleeping
| """ | |
| Utilities for processing uploaded files. | |
| """ | |
| import os | |
| import tempfile | |
| import shutil | |
| from typing import List, Optional | |
| from pathlib import Path | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.document_loaders import ( | |
| PyPDFLoader, | |
| TextLoader, | |
| CSVLoader, | |
| UnstructuredExcelLoader, | |
| Docx2txtLoader | |
| ) | |
| from langchain_core.documents import Document | |
| from chainlit.types import AskFileResponse | |
| import config | |
def get_document_loader(file_path: str):
    """
    Pick the document loader that matches a file's extension.

    The extension is compared case-insensitively. Any extension without a
    dedicated loader falls back to the plain-text loader.

    Args:
        file_path: Path to the file

    Returns:
        Document loader instance constructed for ``file_path``
    """
    # Extension groups paired with the loader class that handles them.
    loaders_by_extension = (
        (('.pdf',), PyPDFLoader),
        (('.txt', '.md', '.py'), TextLoader),
        (('.csv',), CSVLoader),
        (('.xlsx', '.xls'), UnstructuredExcelLoader),
        # NOTE(review): legacy '.doc' is routed to the docx loader -- confirm
        # that loader can actually read that format.
        (('.docx', '.doc'), Docx2txtLoader),
    )

    extension = Path(file_path).suffix.lower()
    for extensions, loader_cls in loaders_by_extension:
        if extension in extensions:
            return loader_cls(file_path)

    # Default to text loader
    return TextLoader(file_path)
def create_text_splitter():
    """
    Build the text splitter used to chunk loaded documents.

    Chunk size, chunk overlap and the separator list are read from the
    ``config`` module. ``len`` is passed as the length function and
    separators are not interpreted as regular expressions.

    Returns:
        Initialized text splitter
    """
    splitter_options = {
        "chunk_size": config.CHUNK_SIZE,
        "chunk_overlap": config.CHUNK_OVERLAP,
        "length_function": len,
        "is_separator_regex": False,
        "separators": config.SEPARATORS,
    }
    return RecursiveCharacterTextSplitter(**splitter_options)
def process_file(file: AskFileResponse) -> Optional[List[Document]]:
    """
    Process an uploaded file and split it into text chunks.

    The upload is copied to a temporary file that keeps the original
    extension, loaded with the loader chosen by ``get_document_loader``,
    and split with the splitter from ``create_text_splitter``. Removal of
    the temporary copy is attempted before returning, on success and on
    failure alike.

    Args:
        file: The uploaded file response from Chainlit; its ``name`` and
            ``path`` attributes are read.

    Returns:
        List of document chunks or None if processing fails
    """
    print(f"Processing file: {file.name}")

    # Path.suffix is "" for a name without an extension, whereas splitting on
    # "." would turn the whole file name into the suffix.
    suffix = Path(file.name).suffix

    # Only the reserved path is needed. The handle is closed immediately
    # because Windows refuses to unlink a file that is still open, which
    # would make the cleanup below fail and leak the temporary file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as temp_file:
        temp_path = temp_file.name

    try:
        # Copy the uploaded file content to the temporary file
        shutil.copyfile(file.path, temp_path)
        print(f"Created temporary file at: {temp_path}")

        # Load with the extension-appropriate loader, then split into chunks
        documents = get_document_loader(temp_path).load()
        return create_text_splitter().split_documents(documents)
    except Exception as e:
        # Best-effort contract: report the failure and signal it with None.
        print(f"Error processing file: {e}")
        return None
    finally:
        # Clean up the temporary file; a cleanup failure must not mask the
        # result or the original error, so it is only reported.
        try:
            os.unlink(temp_path)
        except OSError as e:
            print(f"Error cleaning up temporary file: {e}")