Spaces:
Sleeping
Sleeping
Add project files
Browse files- __init__.py +0 -0
- app.py +137 -4
- config.py +50 -0
- convert_doc_docling.py +160 -0
- export_data.py +72 -0
- instructor_llm.py +52 -0
- requirements.txt +147 -0
__init__.py
ADDED
|
File without changes
|
app.py
CHANGED
|
@@ -1,7 +1,140 @@
|
|
| 1 |
import gradio as gr
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
-
|
| 4 |
-
return "Hello " + name + "!!"
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import importlib
|
| 5 |
+
from docling.document_converter import DocumentConverter
|
| 6 |
|
| 7 |
+
import llm_document_parser.config as config
|
|
|
|
| 8 |
|
| 9 |
+
from llm_document_parser.instructor_llm import extract_json_data_using_ollama_llm, pull_ollama_model
|
| 10 |
+
from llm_document_parser.convert_doc_docling import (
|
| 11 |
+
load_rapid_ocr_model,
|
| 12 |
+
load_easy_ocr_model,
|
| 13 |
+
load_ocr_mac_model,
|
| 14 |
+
load_tesseract_model,
|
| 15 |
+
image_to_text
|
| 16 |
+
)
|
| 17 |
+
from llm_document_parser.export_data import export_as_csv, export_as_json, combine_json_data_into_df, convert_json_to_df
|
| 18 |
+
|
| 19 |
+
print("RUNNING gradio_app.py FROM:", __file__)
|
| 20 |
+
|
| 21 |
+
# Load OCR model based on config
|
| 22 |
+
def load_ocr_model_from_config(model_type: str) -> DocumentConverter:
    """
    Build the DocumentConverter for the OCR backend named in the config.

    Args:
        model_type (str): The OCR backend to load
            ("rapid", "easy", "ocrmac", or "tesseract").
    Returns:
        DocumentConverter: The loaded OCR model.
    Raises:
        ValueError: If model_type is not a recognized backend name.
    """
    # Dispatch table instead of an if-chain; the lambdas defer the
    # (potentially slow) model loading until an entry is actually chosen.
    loaders = {
        # TODO: REFACTOR LOAD OCR MODEL TO JUST EITHER USE SERVER MODELS OR MOBILE MODELS
        "rapid": lambda: load_rapid_ocr_model(
            "PP-OCRv4/ch_PP-OCRv4_det_server_infer.onnx",
            "PP-OCRv3/ch_PP-OCRv3_rec_infer.onnx",
            "PP-OCRv3/ch_ppocr_mobile_v2.0_cls_train.onnx"
        ),
        "easy": load_easy_ocr_model,
        "ocrmac": load_ocr_mac_model,
        "tesseract": lambda: load_tesseract_model(config.TESSERACT_TESSDATA_LOCATION),
    }

    loader = loaders.get(model_type)
    if loader is None:
        raise ValueError(f"Unknown OCR model type in config: {model_type}")
    return loader()
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def save_results(export_type: str, output_file_name: str, df: pd.DataFrame, output_folder: str) -> str:
    """
    Save the results in the specified format.

    Args:
        export_type (str): The type of export ("csv" or "json"); any other
            value is a no-op.
        output_file_name (str): The name of the output file.
        df (pd.DataFrame): The extracted data to save.
        output_folder (str): The folder to save the output file.
    Returns:
        str: The saved data serialized in the requested format, or "" when
        export_type has no exporter (e.g. "excel" is listed as an option in
        config.py but is not implemented yet).
    """
    if export_type == "csv":
        return export_as_csv(df=df, output_folder=output_folder, output_file_name=output_file_name)
    if export_type == "json":
        return export_as_json(df=df, output_folder=output_folder, output_file_name=output_file_name)

    # Unknown export types are deliberately ignored so the pipeline still
    # returns something displayable in the UI instead of crashing.
    return ""
|
| 64 |
+
|
| 65 |
+
def process_file(input_path: Path, document_converter: DocumentConverter) -> str:
    """
    OCR a single document and extract structured JSON from its text.

    Args:
        input_path (Path): Path of the image/PDF to process.
        document_converter (DocumentConverter): Configured docling converter.
    Returns:
        str: JSON produced by the LLM, conforming to config.RESPONSE_MODEL.
    """
    # Docling exports the OCR result as markdown, which is what the LLM sees.
    markdown_text = image_to_text(document_converter, input_path).document.export_to_markdown()
    return extract_json_data_using_ollama_llm(
        prompt=config.LLM_PROMPT,
        text_data=markdown_text,
        ollama_model=config.OLLAMA_MODEL,
        response_model=config.RESPONSE_MODEL,
    )
|
| 76 |
+
|
| 77 |
+
# Full processing pipeline
|
| 78 |
+
def run_full_pipeline(file_inputs):
    """
    Run OCR + LLM extraction over the uploaded file(s) and export the result.

    Args:
        file_inputs: A single file path or a list of file paths, as handed
            over by the gradio File component.
    Returns:
        str: The exported data (CSV/JSON text) as produced by save_results.
    """
    document_converter = load_ocr_model_from_config(config.OCR_MODEL)
    pull_ollama_model(config.OLLAMA_MODEL)

    # isinstance (not `type(x) == list`) so list subclasses are handled too.
    if isinstance(file_inputs, list):
        json_data_objects = [process_file(file, document_converter) for file in file_inputs]
        df = combine_json_data_into_df(json_data_objects)
    else:
        df = convert_json_to_df(process_file(Path(file_inputs), document_converter))

    return save_results(export_type=config.EXPORT_TYPE, output_file_name=config.OUTPUT_FILE_NAME, df=df, output_folder=config.OUTPUT_FOLDER)
|
| 94 |
+
'''
|
| 95 |
+
base_dir = Path(os.path.dirname(__file__))
|
| 96 |
+
config_file_path = base_dir / "src" / "llm_document_parser" / "config.py"
|
| 97 |
+
config_file_path = config_file_path.resolve()
|
| 98 |
+
code_contents = config_file_path.read_text()
|
| 99 |
+
|
| 100 |
+
def load_config():
|
| 101 |
+
return config_file_path.read_text()
|
| 102 |
+
|
| 103 |
+
def save_config(updated_config):
|
| 104 |
+
config_file_path.write_text(updated_config)
|
| 105 |
+
importlib.reload(config)
|
| 106 |
+
return "Config updated successfully!"
|
| 107 |
+
'''
|
| 108 |
+
|
| 109 |
+
# Gradio UI: shows the active configuration and lets the user upload
# documents and run the OCR -> LLM extraction pipeline.
with gr.Blocks() as demo:
    gr.Markdown(f"""
# LLM Document Parser
Checkout the GitHub repo for this Blueprint: https://github.com/oronadavid/llm-document-parser

This app extracts structured data from a document using OCR and a local LLM.\n
Selected OCR model: `{config.OCR_MODEL}`\n
Selected LLM model: `{config.OLLAMA_MODEL}`\n
Export format: `{config.EXPORT_TYPE}`\n
Response Model: `{config.RESPONSE_MODEL.__name__}`
""")

    # Multiple uploads are allowed; run_full_pipeline receives a list then.
    file_input = gr.File(file_types=["image", ".pdf"], file_count="multiple", label="Upload Document(s) (Image/PDF)")

    run_button = gr.Button("Parse Documents")
    output_text = gr.JSON(label="Extracted Data")
    run_button.click(fn=run_full_pipeline, inputs=file_input, outputs=output_text)

    # Disabled config-editor UI, kept as a bare string literal (dead code);
    # it depends on the load_config/save_config helpers that are likewise
    # commented out earlier in this file.
    '''
    gr.Markdown("""# Config
    To update the config, make changes, then click "Update Config" below
    """)
    config_editor = gr.Code(code_contents, language="python", label="Config")
    save_config_button = gr.Button("Update Config")
    status = gr.Textbox(label="Status")

    demo.load(fn=load_config, outputs=config_editor)
    save_config_button.click(fn=save_config, inputs=config_editor, outputs=status)
    '''

if __name__ == "__main__":
    # share=True additionally exposes a public gradio link besides localhost.
    demo.launch(share=True)
|
config.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# config.py
from pydantic import BaseModel
from datetime import date
from typing import List

# Options: "rapid", "easy", "ocrmac", "tesseract"
OCR_MODEL = "easy"

# Must be set when using the tesseract OCR model
# Linux: "/usr/share/tesseract-ocr/4.00/tessdata"
# Windows: "C:\\Program Files\\Tesseract-OCR\\tessdata"
# Mac: "/usr/local/share/tessdata" or "/opt/homebrew/share/tessdata"
TESSERACT_TESSDATA_LOCATION = "/usr/share/tesseract-ocr/4.00/tessdata"

OLLAMA_MODEL = "llama3:instruct"

# System prompt for the extraction LLM. The example used to spell the type
# "withdrawl", contradicting the allowed value 'withdrawal' stated one
# sentence earlier — fixed so the few-shot example cannot teach a typo.
LLM_PROMPT = """
Extract all transactions from the following statement. Each transaction must be returned as a JSON object with the fields: transaction_date (YYYY-MM-DD), description, amount, and transaction_type ('deposit' or 'withdrawal'). All of these must be returned as a list of JSON objects under a key called 'transactions'. Here is an example:
[
    {
        transaction_date: 2025-01-24,
        description: "Walmart",
        amount: 34.24,
        transaction_type: "withdrawal"
    }
]
"""

# Options: "csv", "json", "excel"
EXPORT_TYPE = "json"

# Can be a file or directory
INPUT_PATH = ""
OUTPUT_FOLDER = ""
OUTPUT_FILE_NAME = "output"

# Define Pydantic response models for instructor:

class BankStatementEntry(BaseModel):
    # str is accepted as a fallback when the LLM emits a non-ISO date.
    transaction_date: date | None | str
    description: str | None
    amount: float | None
    #transaction_type: Literal['deposit', 'withdrawal', None]
    transaction_type: str | None

class BankStatement(BaseModel):
    transactions: List[BankStatementEntry] | None

# The model that LLM output will conform to
RESPONSE_MODEL = BankStatement
|
convert_doc_docling.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from docling.datamodel.document import ConversionResult
|
| 4 |
+
from huggingface_hub import snapshot_download
|
| 5 |
+
|
| 6 |
+
from docling.datamodel.base_models import InputFormat
|
| 7 |
+
from docling.datamodel.pipeline_options import EasyOcrOptions, OcrMacOptions, PdfPipeline, PdfPipelineOptions, PipelineOptions, RapidOcrOptions, TesseractOcrOptions
|
| 8 |
+
from docling.document_converter import DocumentConverter, ImageFormatOption, PdfFormatOption
|
| 9 |
+
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend, PyPdfiumPageBackend
|
| 10 |
+
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
|
| 11 |
+
from docling.pipeline.simple_pipeline import SimplePipeline
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
# TODO: REFACTOR LOAD OCR MODEL TO JUST EITHER USE SERVER MODELS OR MOBILE MODELS
|
| 15 |
+
def load_rapid_ocr_model(det_model: str, rec_model: str, cls_model: str) -> DocumentConverter:
    """
    Download the RapidOCR weights from Hugging Face Hub and build a converter.

    Args:
        det_model (str): Repo-relative path to the detection model.
        rec_model (str): Repo-relative path to the recognition model.
        cls_model (str): Repo-relative path to the classification model.
    Returns:
        DocumentConverter: Converter configured with RapidOCR for image input.
    """
    print("Downloading RapidOCR models")
    # snapshot_download is cached by huggingface_hub, so repeated calls are cheap.
    repo_root = snapshot_download(repo_id="SWHL/RapidOCR")

    ocr_options = RapidOcrOptions(
        det_model_path=os.path.join(repo_root, det_model),
        rec_model_path=os.path.join(repo_root, rec_model),
        cls_model_path=os.path.join(repo_root, cls_model),
    )

    # NOTE(review): unlike the other loaders in this module, only IMAGE input
    # is registered here (no PDF format option) — confirm that is intentional.
    return DocumentConverter(
        format_options={
            InputFormat.IMAGE: ImageFormatOption(
                pipeline_options=PdfPipelineOptions(ocr_options=ocr_options)
            )
        }
    )
|
| 57 |
+
|
| 58 |
+
def load_ocr_mac_model() -> DocumentConverter:
    """
    Build a converter backed by Apple's Vision OCR framework (macOS only).

    Returns:
        DocumentConverter: Converter that accepts both PDFs and images.
    """
    pipeline_options = PdfPipelineOptions(
        ocr_options=OcrMacOptions(framework='vision')
    )

    # PDFs and images both go through the pypdfium-backed standard pipeline;
    # one PdfFormatOption instance per input format, mirroring the original.
    pdf_option = PdfFormatOption(
        pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend, pipeline_options=pipeline_options
    )
    image_option = PdfFormatOption(
        pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend, pipeline_options=pipeline_options
    )
    return DocumentConverter(
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE,
        ],
        format_options={
            InputFormat.PDF: pdf_option,
            InputFormat.IMAGE: image_option,
        },
    )
|
| 88 |
+
|
| 89 |
+
def load_tesseract_model(tessdata_path: str) -> DocumentConverter:
    """
    Build a converter backed by Tesseract OCR.

    Args:
        tessdata_path (str): Path to the Tesseract data directory; exported
            via TESSDATA_PREFIX so tesseract can locate its language files.
    Returns:
        DocumentConverter: Converter that accepts both PDFs and images.
    """
    # Tesseract finds its language data through this environment variable.
    os.environ["TESSDATA_PREFIX"] = tessdata_path

    pipeline_options = PdfPipelineOptions(ocr_options=TesseractOcrOptions())

    # Both input formats use the same pypdfium-backed standard pipeline.
    format_options = {
        input_format: PdfFormatOption(
            pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend, pipeline_options=pipeline_options
        )
        for input_format in (InputFormat.PDF, InputFormat.IMAGE)
    }
    return DocumentConverter(
        allowed_formats=[InputFormat.PDF, InputFormat.IMAGE],
        format_options=format_options,
    )
|
| 121 |
+
|
| 122 |
+
def load_easy_ocr_model() -> DocumentConverter:
    """
    Build a converter backed by EasyOCR (the default backend in config.py).

    Returns:
        DocumentConverter: Converter that accepts both PDFs and images.
    """
    pipeline_options = PdfPipelineOptions(ocr_options=EasyOcrOptions())

    # Identical format option for PDFs and images: pypdfium backend driving
    # the standard PDF pipeline with EasyOCR plugged in.
    make_option = lambda: PdfFormatOption(
        pipeline_cls=StandardPdfPipeline, backend=PyPdfiumDocumentBackend, pipeline_options=pipeline_options
    )
    return DocumentConverter(
        allowed_formats=[
            InputFormat.PDF,
            InputFormat.IMAGE,
        ],
        format_options={
            InputFormat.PDF: make_option(),
            InputFormat.IMAGE: make_option(),
        },
    )
|
| 149 |
+
|
| 150 |
+
def image_to_text(document_converter: DocumentConverter, file_path: Path) -> ConversionResult:
    """
    Run the given docling converter on a single file.

    Args:
        document_converter (DocumentConverter): The document converter to use.
        file_path (Path): Path to the image (or PDF) file.
    Returns:
        ConversionResult: The docling conversion result.
    """
    return document_converter.convert(file_path)
|
export_data.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import json
|
| 4 |
+
from typing import List
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def convert_json_to_df(json_data: str) -> pd.DataFrame:
    """
    Convert a JSON string into a pandas DataFrame.

    Accepts either a top-level list of records or an object wrapping such a
    list (e.g. {"transactions": [...]}) — the first list-valued key is used.

    Args:
        json_data (str): JSON text, typically produced by the LLM.
    Returns:
        pd.DataFrame: One row per record.
    """
    data = json.loads(json_data)

    # Only objects need unwrapping; the previous version called .items()
    # unconditionally and crashed with AttributeError on a bare top-level
    # list — exactly the shape shown in the config.py prompt example.
    if isinstance(data, dict):
        for value in data.values():
            if isinstance(value, list):
                data = value
                break

    return pd.DataFrame(data)
|
| 25 |
+
|
| 26 |
+
def combine_json_data_into_df(json_data_objects: List[str]) -> pd.DataFrame:
    """
    Parse each JSON payload and stack the resulting frames vertically.

    Args:
        json_data_objects (List[str]): JSON payloads, one per document.
    Returns:
        pd.DataFrame: Concatenation of all parsed frames (indexes preserved).
    """
    return pd.concat([convert_json_to_df(payload) for payload in json_data_objects])
|
| 32 |
+
|
| 33 |
+
def export_as_csv(df: pd.DataFrame, output_folder: str, output_file_name: str) -> str:
    """
    Save a DataFrame as a CSV file, avoiding overwriting by incrementing filenames.

    Args:
        df (pd.DataFrame): Data to write.
        output_folder (str): Destination directory (created if missing).
        output_file_name (str): Base file name; a numeric suffix is appended.
    Returns:
        str: The CSV text that was written.
    """
    destination = Path(output_folder)
    if not destination.is_dir():
        print(f"Creating path {output_folder}")
        destination.mkdir(parents=True)

    # Pick the first <name><index>.csv that does not exist yet.
    index = 0
    while (destination / f"{output_file_name}{index}.csv").exists():
        index += 1
    target = destination / f"{output_file_name}{index}.csv"

    df.to_csv(target, index=False)
    print(f"Saved CSV to {target}")
    return df.to_csv(path_or_buf=None, index=False)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def export_as_json(df: pd.DataFrame, output_folder: str, output_file_name: str) -> str:
    """
    Save a DataFrame as JSON records, avoiding overwriting by incrementing filenames.

    Args:
        df (pd.DataFrame): Data to write.
        output_folder (str): Destination directory (created if missing).
        output_file_name (str): Base file name; a numeric suffix is appended.
    Returns:
        str: The JSON text that was written ("" if pandas returns None).
    """
    destination = Path(output_folder)
    if not destination.is_dir():
        print(f"Creating path {output_folder}")
        destination.mkdir(parents=True)

    # Pick the first <name><index>.json that does not exist yet.
    index = 0
    while (destination / f"{output_file_name}{index}.json").exists():
        index += 1
    target = destination / f"{output_file_name}{index}.json"

    df.to_json(target, orient='records')
    print(f"Saved JSON to {target}")
    return df.to_json(orient='records') or ""
|
instructor_llm.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import instructor
|
| 2 |
+
from openai import OpenAI
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
from typing import Type
|
| 5 |
+
|
| 6 |
+
import ollama
|
| 7 |
+
|
| 8 |
+
def pull_ollama_model(model: str):
    """
    Pull a model from ollama if it is not already downloaded.

    Args:
        model (str): Model name, with or without a ":tag" suffix;
            ":latest" is assumed when no tag is given.
    """
    # ollama lists models with an explicit tag, so normalize for comparison.
    # (`":" not in model` replaces the unidiomatic model.__contains__(":").)
    if ":" not in model:
        model += ":latest"

    for downloaded_model in ollama.list()["models"]:
        if downloaded_model['model'] == model:
            print(f"Model {downloaded_model['model']} is installed")
            return

    print(f"Model {model} is not installed")
    print(f"Downloading {model} model...")
    ollama.pull(model)
|
| 23 |
+
|
| 24 |
+
def extract_json_data_using_ollama_llm(prompt: str, text_data: str, ollama_model: str, response_model: Type[BaseModel]) -> str:
    """
    Pass prompt and data into an ollama LLM using instructor.

    Args:
        prompt (str): System prompt describing the extraction task.
        text_data (str): Document text sent as the user message.
        ollama_model (str): Name of the ollama model to query.
        response_model (Type[BaseModel]): Pydantic model the output must conform to.
    Returns:
        str: The validated response serialized as indented JSON.
    """
    # Talk to the local ollama server through its OpenAI-compatible endpoint;
    # instructor's JSON mode enforces the response_model schema.
    client = instructor.from_openai(
        OpenAI(
            base_url="http://localhost:11434/v1",
            api_key="ollama"  # ollama ignores the key, but the client requires one
        ),
        mode=instructor.Mode.JSON
    )

    resp = client.chat.completions.create(
        model=ollama_model,
        messages=[
            {
                'role': 'system',
                'content': prompt
            },
            {
                'role': 'user',
                'content': text_data
            },
        ],
        response_model=response_model,
        max_retries=3  # instructor re-asks the model when validation fails
    )

    return resp.model_dump_json(indent=4)
|
requirements.txt
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
aiofiles==24.1.0
|
| 2 |
+
aiohappyeyeballs==2.6.1
|
| 3 |
+
aiohttp==3.11.16
|
| 4 |
+
aiosignal==1.3.2
|
| 5 |
+
annotated-types==0.7.0
|
| 6 |
+
anyio==4.9.0
|
| 7 |
+
async-timeout==5.0.1
|
| 8 |
+
attrs==25.3.0
|
| 9 |
+
beautifulsoup4==4.13.4
|
| 10 |
+
certifi==2025.1.31
|
| 11 |
+
charset-normalizer==3.4.1
|
| 12 |
+
click==8.1.8
|
| 13 |
+
coloredlogs==15.0.1
|
| 14 |
+
dill==0.4.0
|
| 15 |
+
distro==1.9.0
|
| 16 |
+
docling==2.30.0
|
| 17 |
+
docling-core==2.26.4
|
| 18 |
+
docling-ibm-models==3.4.1
|
| 19 |
+
docling-parse==4.0.1
|
| 20 |
+
docstring_parser==0.16
|
| 21 |
+
easyocr==1.7.2
|
| 22 |
+
et_xmlfile==2.0.0
|
| 23 |
+
exceptiongroup==1.2.2
|
| 24 |
+
fastapi==0.115.12
|
| 25 |
+
ffmpy==0.5.0
|
| 26 |
+
filelock==3.18.0
|
| 27 |
+
filetype==1.2.0
|
| 28 |
+
flatbuffers==25.2.10
|
| 29 |
+
frozenlist==1.5.0
|
| 30 |
+
fsspec==2025.3.2
|
| 31 |
+
gradio==5.27.1
|
| 32 |
+
gradio_client==1.9.1
|
| 33 |
+
groovy==0.1.2
|
| 34 |
+
h11==0.14.0
|
| 35 |
+
httpcore==1.0.8
|
| 36 |
+
httpx==0.28.1
|
| 37 |
+
huggingface-hub==0.30.2
|
| 38 |
+
humanfriendly==10.0
|
| 39 |
+
idna==3.10
|
| 40 |
+
imageio==2.37.0
|
| 41 |
+
instructor==1.7.9
|
| 42 |
+
Jinja2==3.1.6
|
| 43 |
+
jiter==0.8.2
|
| 44 |
+
jsonlines==3.1.0
|
| 45 |
+
jsonref==1.1.0
|
| 46 |
+
jsonschema==4.23.0
|
| 47 |
+
jsonschema-specifications==2024.10.1
|
| 48 |
+
latex2mathml==3.77.0
|
| 49 |
+
lazy_loader==0.4
|
| 50 |
+
-e git+ssh://git@github.com/oronadavid/llm-document-parser.git@467ef6e3183983d82ed35a4fdc3cbdf78ab44952#egg=llm_document_parser_blueprint
|
| 51 |
+
loguru==0.7.3
|
| 52 |
+
lxml==5.3.2
|
| 53 |
+
markdown-it-py==3.0.0
|
| 54 |
+
marko==2.1.3
|
| 55 |
+
MarkupSafe==3.0.2
|
| 56 |
+
mdurl==0.1.2
|
| 57 |
+
mpire==2.10.2
|
| 58 |
+
mpmath==1.3.0
|
| 59 |
+
multidict==6.4.3
|
| 60 |
+
multiprocess==0.70.17
|
| 61 |
+
networkx==3.4.2
|
| 62 |
+
ninja==1.11.1.4
|
| 63 |
+
numpy==2.2.4
|
| 64 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 65 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 66 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 67 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 68 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 69 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 70 |
+
nvidia-curand-cu12==10.3.5.147
|
| 71 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 72 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 73 |
+
nvidia-cusparselt-cu12==0.6.2
|
| 74 |
+
nvidia-nccl-cu12==2.21.5
|
| 75 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 76 |
+
nvidia-nvtx-cu12==12.4.127
|
| 77 |
+
ollama==0.4.7
|
| 78 |
+
onnxruntime==1.21.0
|
| 79 |
+
onnxruntime-gpu==1.21.0
|
| 80 |
+
openai==1.74.0
|
| 81 |
+
opencv-python==4.11.0.86
|
| 82 |
+
opencv-python-headless==4.11.0.86
|
| 83 |
+
openpyxl==3.1.5
|
| 84 |
+
orjson==3.10.16
|
| 85 |
+
packaging==24.2
|
| 86 |
+
pandas==2.2.3
|
| 87 |
+
pillow==11.2.1
|
| 88 |
+
pluggy==1.5.0
|
| 89 |
+
propcache==0.3.1
|
| 90 |
+
protobuf==6.30.2
|
| 91 |
+
pyclipper==1.3.0.post6
|
| 92 |
+
pydantic==2.11.3
|
| 93 |
+
pydantic-settings==2.8.1
|
| 94 |
+
pydantic_core==2.33.1
|
| 95 |
+
pydub==0.25.1
|
| 96 |
+
Pygments==2.19.1
|
| 97 |
+
pylatexenc==2.10
|
| 98 |
+
pypdfium2==4.30.1
|
| 99 |
+
python-bidi==0.6.6
|
| 100 |
+
python-dateutil==2.9.0.post0
|
| 101 |
+
python-docx==1.1.2
|
| 102 |
+
python-dotenv==1.1.0
|
| 103 |
+
python-multipart==0.0.20
|
| 104 |
+
python-pptx==1.0.2
|
| 105 |
+
pytz==2025.2
|
| 106 |
+
PyYAML==6.0.2
|
| 107 |
+
rapidocr-onnxruntime==1.4.4
|
| 108 |
+
referencing==0.36.2
|
| 109 |
+
regex==2024.11.6
|
| 110 |
+
requests==2.32.3
|
| 111 |
+
rich==13.9.4
|
| 112 |
+
rpds-py==0.24.0
|
| 113 |
+
rtree==1.4.0
|
| 114 |
+
ruff==0.11.7
|
| 115 |
+
safehttpx==0.1.6
|
| 116 |
+
safetensors==0.5.3
|
| 117 |
+
scikit-image==0.25.2
|
| 118 |
+
scipy==1.15.2
|
| 119 |
+
semantic-version==2.10.0
|
| 120 |
+
semchunk==2.2.2
|
| 121 |
+
shapely==2.1.0
|
| 122 |
+
shellingham==1.5.4
|
| 123 |
+
six==1.17.0
|
| 124 |
+
sniffio==1.3.1
|
| 125 |
+
soupsieve==2.6
|
| 126 |
+
starlette==0.46.2
|
| 127 |
+
sympy==1.13.1
|
| 128 |
+
tabulate==0.9.0
|
| 129 |
+
tenacity==9.1.2
|
| 130 |
+
tesserocr==2.8.0
|
| 131 |
+
tifffile==2025.3.30
|
| 132 |
+
tokenizers==0.21.1
|
| 133 |
+
tomlkit==0.13.2
|
| 134 |
+
torch==2.6.0
|
| 135 |
+
torchvision==0.21.0
|
| 136 |
+
tqdm==4.67.1
|
| 137 |
+
transformers==4.51.3
|
| 138 |
+
triton==3.2.0
|
| 139 |
+
typer==0.15.2
|
| 140 |
+
typing-inspection==0.4.0
|
| 141 |
+
typing_extensions==4.13.2
|
| 142 |
+
tzdata==2025.2
|
| 143 |
+
urllib3==2.4.0
|
| 144 |
+
uvicorn==0.34.2
|
| 145 |
+
websockets==15.0.1
|
| 146 |
+
XlsxWriter==3.2.2
|
| 147 |
+
yarl==1.19.0
|