|
|
|
|
|
"""
Utility Functions Module

Collection of utility functions used throughout the application.
This module contains helper functions for file operations, formatting,
and document processing utilities.
"""
|
|
|
|
|
from typing import List, Optional |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
# Lowercase file extension (including the leading dot) -> MIME type.
_MIME_TYPES_BY_EXTENSION = {
    '.pdf': 'application/pdf',
    '.doc': 'application/msword',
    # .docx is Office Open XML, which has its own registered type; it is not
    # the legacy binary Word format that 'application/msword' denotes.
    '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    '.txt': 'text/plain',
    '.md': 'text/markdown',
}

# Generic binary type returned for any extension not listed above.
_DEFAULT_MIME_TYPE = 'application/octet-stream'


def get_mime_type(file_path: Path) -> str:
    """Return the MIME type implied by the extension of *file_path*.

    Only the final suffix is inspected and the comparison is
    case-insensitive; the file itself is never opened.

    Args:
        file_path: Path whose suffix selects the MIME type.

    Returns:
        The MIME type for a recognised extension (.pdf, .doc, .docx, .txt,
        .md), otherwise 'application/octet-stream'.
    """
    return _MIME_TYPES_BY_EXTENSION.get(file_path.suffix.lower(), _DEFAULT_MIME_TYPE)
|
|
|
|
|
|
|
|
def format_document_title(doc_name: str) -> str:
    """Turn a document file name into a human-readable title.

    Everything after the last '.' is dropped, underscores and hyphens become
    spaces, and the result is title-cased.

    Args:
        doc_name: Document name, with or without an extension.

    Returns:
        The formatted title. A name with nothing before its last dot (for
        example ".env") is formatted whole instead of producing an empty
        title.
    """
    stem, _dot, _extension = doc_name.rpartition('.')
    # rpartition leaves the stem empty both when there is no dot at all and
    # when the only dot is the leading character; in either case the full
    # name is the only usable text for the title.
    base = stem if stem else doc_name
    return base.replace('_', ' ').replace('-', ' ').title()
|
|
|
|
|
|
|
|
def count_documents_in_directory(directory: Path, supported_extensions: Optional[List[str]] = None) -> int:
    """Count the supported documents under *directory*, recursing into subdirectories.

    Args:
        directory: Root directory to search.
        supported_extensions: Extensions to count, each including the leading
            dot (e.g. '.pdf'). Matching is case-insensitive. Defaults to
            .pdf, .docx, .doc, .txt and .md when None.

    Returns:
        Number of regular files whose suffix is one of the supported
        extensions. Directories are never counted, even if their name ends
        in a supported extension.
    """
    if supported_extensions is None:
        supported_extensions = ['.pdf', '.docx', '.doc', '.txt', '.md']

    # File suffixes are lowercased before comparison, so the requested
    # extensions must be lowercased too or a caller passing '.PDF' would
    # never match anything. A frozenset also gives O(1) membership tests.
    wanted = frozenset(extension.lower() for extension in supported_extensions)

    return sum(
        1
        for entry in directory.rglob('*')
        if entry.is_file() and entry.suffix.lower() in wanted
    )
|
|
|
|
|
|
|
|
def create_document_processor(store_name: Optional[str] = None) -> 'DocumentProcessor':
    """
    Build a DocumentProcessor and hand it back to the caller.

    Wraps the recurring "construct a DocumentProcessor" step in a single
    helper.

    Args:
        store_name: Optional name for the FAISS store (uses config default if None)

    Returns:
        Initialized DocumentProcessor instance
    """
    # Imported at call time rather than at module load.
    # NOTE(review): presumably this avoids a circular import with
    # app.core.document_processor - confirm before hoisting it to the top.
    from app.core.document_processor import DocumentProcessor

    return DocumentProcessor(store_name=store_name)
|
|
|