Spaces:
Runtime error
Runtime error
| from pathlib import Path | |
| from typing import List, Union | |
| import logging | |
| from dataclasses import dataclass | |
| from langchain_core.documents import Document as LCDocument | |
| from langchain_core.document_loaders import BaseLoader | |
| from docling.document_converter import DocumentConverter, PdfFormatOption | |
| from docling.datamodel.base_models import InputFormat, ConversionStatus | |
| from docling.datamodel.pipeline_options import ( | |
| PdfPipelineOptions, | |
| EasyOcrOptions | |
| ) | |
| logging.basicConfig(level=logging.INFO) | |
| _log = logging.getLogger(__name__) | |
@dataclass
class ProcessingResult:
    """Store results of document processing.

    The class must be a dataclass: ``__post_init__`` is only invoked by the
    dataclass-generated ``__init__``, and it is what gives every instance its
    own ``failed_files`` list instead of the shared ``None`` default.
    """

    # Number of files whose conversion reported full success.
    success_count: int = 0
    # Number of files that could not be found, converted, or processed.
    failure_count: int = 0
    # Number of files whose conversion reported partial success.
    partial_success_count: int = 0
    # Paths of the files counted in ``failure_count``; ``None`` is only a
    # placeholder default that ``__post_init__`` swaps for a fresh list.
    failed_files: Union[List[str], None] = None

    def __post_init__(self):
        """Replace the ``None`` placeholder with a per-instance empty list."""
        if self.failed_files is None:
            self.failed_files = []
class MultiFormatDocumentLoader(BaseLoader):
    """Loader for multiple document formats that converts to LangChain documents.

    Each input file is run through a docling ``DocumentConverter`` and its
    Markdown export becomes the ``page_content`` of one LangChain document.
    """

    def __init__(
        self,
        file_paths: Union[str, Path, List[Union[str, Path]]],
        enable_ocr: bool = True,
        enable_tables: bool = True
    ):
        """Remember the inputs and build the converter.

        Args:
            file_paths: A single path (``str`` or ``Path``) or an iterable of
                paths to convert.
            enable_ocr: Turn on OCR in the PDF pipeline.
            enable_tables: Turn on table-structure recognition in the PDF
                pipeline.
        """
        # A lone str/Path must be wrapped; neither is a sequence of paths
        # (a Path is not iterable at all, a str would iterate per character).
        if isinstance(file_paths, (str, Path)):
            self._file_paths = [file_paths]
        else:
            # Materialise so the loader can be iterated more than once even
            # when a one-shot iterable was supplied.
            self._file_paths = list(file_paths)
        self._enable_ocr = enable_ocr
        self._enable_tables = enable_tables
        self._converter = self._setup_converter()

    def _setup_converter(self):
        """Set up the document converter with appropriate options.

        Returns:
            A ``DocumentConverter`` restricted to the supported input formats,
            with the PDF pipeline configured from the OCR/table flags.
        """
        # Configure pipeline options
        pipeline_options = PdfPipelineOptions(
            do_ocr=bool(self._enable_ocr),
            do_table_structure=bool(self._enable_tables),
            ocr_options=EasyOcrOptions(force_full_page_ocr=True),
        )
        if self._enable_tables:
            pipeline_options.table_structure_options.do_cell_matching = True

        # Create converter with supported formats
        return DocumentConverter(
            allowed_formats=[
                InputFormat.PDF,
                InputFormat.IMAGE,
                InputFormat.DOCX,
                InputFormat.HTML,
                InputFormat.PPTX,
                InputFormat.ASCIIDOC,
                InputFormat.MD,
            ],
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                )
            },
        )

    def lazy_load(self):
        """Convert documents and yield LangChain documents.

        Yields:
            One ``LCDocument`` per file whose conversion status is SUCCESS or
            PARTIAL_SUCCESS. Missing files, failed conversions and files that
            raise during processing are logged and skipped.

        A summary of the run is logged once every input has been processed.
        """
        results = ProcessingResult()
        # ProcessingResult only fills in failed_files from __post_init__, which
        # runs solely when the class is a dataclass; normalise here so a
        # failure can always be recorded.
        if results.failed_files is None:
            results.failed_files = []

        for file_path in self._file_paths:
            document = self._convert_one(file_path, results)
            if document is not None:
                yield document

        self._log_summary(results)

    def _convert_one(self, file_path, results):
        """Convert one file, record the outcome in *results*, return the document or None."""
        try:
            path = Path(file_path)
            if not path.exists():
                _log.warning(f"File not found: {file_path}")
                results.failure_count += 1
                results.failed_files.append(str(file_path))
                return None

            conversion_result = self._converter.convert(path)
            status = conversion_result.status
            is_partial = status == ConversionStatus.PARTIAL_SUCCESS
            if status != ConversionStatus.SUCCESS and not is_partial:
                results.failure_count += 1
                results.failed_files.append(str(file_path))
                _log.error(f"Failed to convert {file_path}")
                return None

            metadata = {
                'source': str(path),
                'file_type': path.suffix,
            }
            if is_partial:
                _log.warning(f"Partial conversion for {file_path}")
                metadata['conversion_status'] = 'partial'

            text = conversion_result.document.export_to_markdown()
            document = LCDocument(page_content=text, metadata=metadata)
        except Exception as e:
            # Best-effort loader: one bad file must not abort the whole run.
            # logging.exception keeps the traceback for diagnosis.
            _log.exception(f"Error processing {file_path}: {e}")
            results.failure_count += 1
            results.failed_files.append(str(file_path))
            return None

        # Count the success only after the export worked, so a file is never
        # tallied as both a success and a failure.
        if is_partial:
            results.partial_success_count += 1
        else:
            results.success_count += 1
        return document

    def _log_summary(self, results):
        """Log the per-outcome totals and the list of failed files."""
        # Log final results
        total = (
            results.success_count
            + results.partial_success_count
            + results.failure_count
        )
        _log.info(
            f"Processed {total} documents:\n"
            f"- Successfully converted: {results.success_count}\n"
            f"- Partially converted: {results.partial_success_count}\n"
            f"- Failed: {results.failure_count}"
        )
        if results.failed_files:
            _log.info("Failed files:")
            for file in results.failed_files:
                _log.info(f"- {file}")
if __name__ == '__main__':
    # Load documents from a list of file paths
    loader = MultiFormatDocumentLoader(
        file_paths=[
            # './data/2404.19756v1.pdf',
            # './data/OD429347375590223100.pdf',
            '/teamspace/studios/this_studio/TabularRAG/data/FeesPaymentReceipt_7thsem.pdf',
            # './data/UNIT 2 GENDER BASED VIOLENCE.pptx'
        ],
        enable_ocr=False,
        enable_tables=True
    )

    # Print each converted document and keep its Markdown for the output file.
    contents = []
    for doc in loader.lazy_load():
        print(doc.page_content)
        print(doc.metadata)
        contents.append(doc.page_content)

    # save document in .md file
    # Written once, after the loop, so several inputs do not overwrite each
    # other and nothing is written (or referenced) when no file converted.
    # The encoding is explicit because Markdown exports may contain
    # characters outside the platform's default encoding.
    if contents:
        with open(
            '/teamspace/studios/this_studio/TabularRAG/data/output.md',
            'w',
            encoding='utf-8',
        ) as f:
            f.write("\n\n".join(contents))