Spaces:
Sleeping
Sleeping
| import os | |
| from pathlib import Path | |
| from docling.datamodel.document import ConversionResult | |
| from huggingface_hub import snapshot_download | |
| from docling.datamodel.base_models import InputFormat | |
| from docling.datamodel.pipeline_options import EasyOcrOptions, OcrMacOptions, PdfPipeline, PdfPipelineOptions, PipelineOptions, RapidOcrOptions, TesseractOcrOptions | |
| from docling.document_converter import DocumentConverter, ImageFormatOption, PdfFormatOption | |
| from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend, PyPdfiumPageBackend | |
| from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline | |
| from docling.pipeline.simple_pipeline import SimplePipeline | |
# TODO: Refactor the OCR model loading to use either only the server models or only the mobile models.
def load_rapid_ocr_model(det_model: str, rec_model: str, cls_model: str) -> DocumentConverter:
    """
    Build a DocumentConverter whose image pipeline performs OCR with RapidOCR.

    The ``SWHL/RapidOCR`` repository is obtained through ``snapshot_download`` and
    each model argument is joined onto the returned snapshot directory.

    Args:
        det_model (str): Detection model path, joined onto the snapshot directory.
        rec_model (str): Recognition model path, joined onto the snapshot directory.
        cls_model (str): Classification model path, joined onto the snapshot directory.

    Returns:
        DocumentConverter: Converter with the RapidOCR pipeline options registered
            for ``InputFormat.IMAGE``.
    """
    print("Downloading RapidOCR models")
    snapshot_dir = snapshot_download(repo_id="SWHL/RapidOCR")

    def in_snapshot(model_file: str) -> str:
        # Resolve a model file against the downloaded snapshot directory.
        return os.path.join(snapshot_dir, model_file)

    rapid_options = RapidOcrOptions(
        det_model_path=in_snapshot(det_model),
        rec_model_path=in_snapshot(rec_model),
        cls_model_path=in_snapshot(cls_model),
    )
    image_option = ImageFormatOption(
        pipeline_options=PdfPipelineOptions(ocr_options=rapid_options),
    )
    return DocumentConverter(format_options={InputFormat.IMAGE: image_option})
def load_ocr_mac_model() -> DocumentConverter:
    """
    Build a DocumentConverter that performs OCR through ``OcrMacOptions``.

    PDF and image inputs are both allowed, and both are configured with
    ``StandardPdfPipeline`` on the ``PyPdfiumDocumentBackend`` using the same
    pipeline options.

    Returns:
        DocumentConverter: Converter configured with the ``'vision'`` framework
            OCR options for PDF and image inputs.
    """
    mac_pipeline_options = PdfPipelineOptions(
        ocr_options=OcrMacOptions(framework='vision'),
    )
    accepted_formats = [InputFormat.PDF, InputFormat.IMAGE]

    # One separate format option per accepted format, all sharing the pipeline options.
    per_format_options = {
        input_format: PdfFormatOption(
            pipeline_cls=StandardPdfPipeline,
            backend=PyPdfiumDocumentBackend,
            pipeline_options=mac_pipeline_options,
        )
        for input_format in accepted_formats
    }
    return DocumentConverter(
        allowed_formats=accepted_formats,
        format_options=per_format_options,
    )
def load_tesseract_model(tessdata_path: str) -> DocumentConverter:
    """
    Build a DocumentConverter that performs OCR through ``TesseractOcrOptions``.

    As a side effect, the ``TESSDATA_PREFIX`` environment variable of the current
    process is set to ``tessdata_path`` and stays set after this function returns.

    Args:
        tessdata_path (str): Path to the Tesseract data directory. A path-like
            object such as ``pathlib.Path`` is accepted as well and is converted
            to its string form.

    Returns:
        DocumentConverter: Converter configured with Tesseract OCR options for
            PDF and image inputs.
    """
    # os.environ only accepts str values; os.fspath returns a str unchanged and
    # converts path-like objects, so a pathlib.Path no longer raises TypeError.
    os.environ["TESSDATA_PREFIX"] = os.fspath(tessdata_path)

    ocr_options = TesseractOcrOptions()
    pipeline_options = PdfPipelineOptions(
        ocr_options=ocr_options
    )

    doc_converter = DocumentConverter(
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE
        ],
        format_options={
            # PDF and image inputs share the same pipeline, backend and OCR options.
            InputFormat.PDF: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend, pipeline_options=pipeline_options
            ),
            InputFormat.IMAGE: PdfFormatOption(
                pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend, pipeline_options=pipeline_options
            )
        }
    )
    return doc_converter
def load_easy_ocr_model() -> DocumentConverter:
    """
    Build a DocumentConverter that performs OCR through ``EasyOcrOptions``.

    PDF and image inputs are both allowed, and both are configured with
    ``StandardPdfPipeline`` on the ``PyPdfiumDocumentBackend``.

    Returns:
        DocumentConverter: Converter configured with default EasyOCR options
            for PDF and image inputs.
    """
    easyocr_pipeline_options = PdfPipelineOptions(ocr_options=EasyOcrOptions())

    def make_format_option() -> PdfFormatOption:
        # Each input format receives its own option object with identical settings.
        return PdfFormatOption(
            pipeline_cls=StandardPdfPipeline,
            backend=PyPdfiumDocumentBackend,
            pipeline_options=easyocr_pipeline_options,
        )

    return DocumentConverter(
        allowed_formats=[InputFormat.PDF, InputFormat.IMAGE],
        format_options={
            InputFormat.PDF: make_format_option(),
            InputFormat.IMAGE: make_format_option(),
        },
    )
def image_to_text(document_converter: DocumentConverter, file_path: Path) -> ConversionResult:
    """
    Run the given document converter on a single file.

    Args:
        document_converter (DocumentConverter): Converter whose ``convert`` method is called.
        file_path (Path): Path of the file handed to the converter.

    Returns:
        ConversionResult: Whatever ``document_converter.convert`` returns for the file.
    """
    return document_converter.convert(file_path)