# Hugging Face Space: mbudisic/AIE-RAG (status: Sleeping) -- file size: 1,742 bytes

import os
import tempfile
from typing import List
from fastapi import UploadFile
from aimakerspace.text_utils import CharacterTextSplitter, TextFileLoader, PDFLoader


class FileProcessor:
    """Convert an uploaded file into a list of text chunks.

    The upload is spooled to a temporary file on disk, read back through a
    loader chosen from the original filename (``PDFLoader`` for ``.pdf``,
    ``TextFileLoader`` otherwise), and the loaded documents are split with a
    ``CharacterTextSplitter``.
    """

    def __init__(self) -> None:
        # One splitter instance is reused for every processed file.
        self.text_splitter = CharacterTextSplitter()

    async def process_file(self, file: UploadFile) -> List[str]:
        """Process an uploaded file and return text chunks.

        Args:
            file: The upload to process. ``file.filename`` may be missing;
                it is then treated as a name without an extension.

        Returns:
            The chunks produced by ``self.text_splitter.split_texts`` for the
            documents loaded from the upload.

        Raises:
            Whatever the selected loader or the splitter raises. The
            temporary file is removed in every case (best effort).
        """
        print(f"Processing file: {file.filename}")

        filename = file.filename or ""
        # Client-supplied names may carry directories with either separator;
        # only the final component may contribute to the suffix, otherwise a
        # separator could end up inside the temporary file name.
        base_name = os.path.basename(filename.replace("\\", "/"))
        # splitext yields "" for names without an extension instead of
        # turning the whole name into a bogus suffix.
        suffix = os.path.splitext(base_name)[1]

        # Read first so a failed read cannot leave a temporary file behind.
        content = await file.read()

        # delete=False because the loader reopens the file by path; removal
        # is handled explicitly in the ``finally`` below.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        temp_path = temp_file.name
        try:
            # Close the handle before anyone else opens the path: on Windows
            # a still-open NamedTemporaryFile can be neither reopened nor
            # unlinked. Closing also flushes the written content.
            with temp_file:
                temp_file.write(content)
            print(f"Created temporary file at: {temp_path}")

            # Create appropriate loader based on file type
            loader = self._get_loader(temp_path, filename)

            # Load and process the documents
            documents = loader.load_documents()
            return self.text_splitter.split_texts(documents)
        finally:
            # Best-effort cleanup: a failure here must not mask the result
            # or the original exception.
            try:
                os.unlink(temp_path)
            except OSError as e:
                print(f"Error cleaning up temporary file: {e}")

    def _get_loader(self, file_path: str, original_filename: str):
        """Get the appropriate loader based on file type.

        Args:
            file_path: Path of the on-disk copy the loader should read.
            original_filename: Name supplied with the upload; only its
                extension (case-insensitive) is inspected. A missing name is
                treated as a non-PDF file.

        Returns:
            A ``PDFLoader`` for names ending in ``.pdf``, otherwise a
            ``TextFileLoader``.
        """
        if (original_filename or "").lower().endswith(".pdf"):
            return PDFLoader(file_path)
        return TextFileLoader(file_path)