salverz committed on
Commit
3f7e152
·
1 Parent(s): f5bd856

Add project files

Browse files
Files changed (7) hide show
  1. __init__.py +0 -0
  2. app.py +137 -4
  3. config.py +50 -0
  4. convert_doc_docling.py +160 -0
  5. export_data.py +72 -0
  6. instructor_llm.py +52 -0
  7. requirements.txt +147 -0
__init__.py ADDED
File without changes
app.py CHANGED
@@ -1,7 +1,140 @@
1
  import gradio as gr
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ from pathlib import Path
3
+ import pandas as pd
4
+ import importlib
5
+ from docling.document_converter import DocumentConverter
6
 
7
+ import llm_document_parser.config as config
 
8
 
9
+ from llm_document_parser.instructor_llm import extract_json_data_using_ollama_llm, pull_ollama_model
10
+ from llm_document_parser.convert_doc_docling import (
11
+ load_rapid_ocr_model,
12
+ load_easy_ocr_model,
13
+ load_ocr_mac_model,
14
+ load_tesseract_model,
15
+ image_to_text
16
+ )
17
+ from llm_document_parser.export_data import export_as_csv, export_as_json, combine_json_data_into_df, convert_json_to_df
18
+
19
+ print("RUNNING gradio_app.py FROM:", __file__)
20
+
21
+ # Load OCR model based on config
22
+ def load_ocr_model_from_config(model_type: str) -> DocumentConverter:
23
+ """
24
+ Load the OCR model based on the configuration.
25
+ Args:
26
+ model_type (str): The type of OCR model to load.
27
+ Returns:
28
+ object: The loaded OCR model.
29
+ """
30
+ if model_type == "rapid":
31
+ # TODO: REFACTOR LOAD OCR MODEL TO JUST EITHER USE SERVER MODELS OR MOBILE MODELS
32
+ return load_rapid_ocr_model(
33
+ "PP-OCRv4/ch_PP-OCRv4_det_server_infer.onnx",
34
+ "PP-OCRv3/ch_PP-OCRv3_rec_infer.onnx",
35
+ "PP-OCRv3/ch_ppocr_mobile_v2.0_cls_train.onnx"
36
+ )
37
+ if model_type == "easy":
38
+ return load_easy_ocr_model()
39
+ if model_type == "ocrmac":
40
+ return load_ocr_mac_model()
41
+ if model_type == "tesseract":
42
+ return load_tesseract_model(config.TESSERACT_TESSDATA_LOCATION)
43
+
44
+ raise ValueError(f"Unknown OCR model type in config: {model_type}")
45
+
46
+
47
def save_results(export_type: str, output_file_name: str, df: pd.DataFrame, output_folder: str) -> str:
    """
    Save the results in the specified format.

    Args:
        export_type (str): The type of export ("csv" or "json"); any other
            value is ignored and an empty string is returned.
        output_file_name (str): The name of the output file (without extension).
        df (pd.DataFrame): The extracted data to save.
        output_folder (str): The folder to save the output file.
    Returns:
        str: The output data from the LLM formatted into the specified format,
        or "" when the export type is unsupported.
    """
    if export_type == "csv":
        return export_as_csv(df=df, output_folder=output_folder, output_file_name=output_file_name)
    if export_type == "json":
        return export_as_json(df=df, output_folder=output_folder, output_file_name=output_file_name)

    # Unsupported export types (e.g. "excel" — listed in config but not
    # implemented) fall through to an empty result.
    return ""
64
+
65
def process_file(input_path: Path, document_converter: DocumentConverter) -> str:
    """
    Run OCR on a single document and extract structured JSON from its text.

    Args:
        input_path (Path): The document (image/PDF) to process.
        document_converter (DocumentConverter): Pre-loaded OCR converter.
    Returns:
        str: JSON produced by the LLM, conforming to config.RESPONSE_MODEL.
    """
    markdown_text = image_to_text(document_converter, input_path).document.export_to_markdown()

    return extract_json_data_using_ollama_llm(
        prompt=config.LLM_PROMPT,
        text_data=markdown_text,
        ollama_model=config.OLLAMA_MODEL,
        response_model=config.RESPONSE_MODEL
    )
76
+
77
# Full processing pipeline
def run_full_pipeline(file_inputs):
    """
    Run OCR + LLM extraction over one path or a list of paths and save the
    combined results in the configured export format.

    Args:
        file_inputs: A single path-like value or a list of paths, as supplied
            by the gradio File component.
    Returns:
        str: The exported data returned by save_results.
    """
    document_converter = load_ocr_model_from_config(config.OCR_MODEL)
    pull_ollama_model(config.OLLAMA_MODEL)

    # isinstance (not `type(...) == list`) so list subclasses are handled too.
    if isinstance(file_inputs, list):
        json_data_objects = [process_file(file, document_converter) for file in file_inputs]
        df = combine_json_data_into_df(json_data_objects)
    else:
        df = convert_json_to_df(process_file(Path(file_inputs), document_converter))

    return save_results(export_type=config.EXPORT_TYPE, output_file_name=config.OUTPUT_FILE_NAME, df=df, output_folder=config.OUTPUT_FOLDER)
94
# NOTE(review): the disabled config-editor feature below was kept as bare
# triple-quoted string literals, which are evaluated (and discarded) at
# import/UI-build time. Converted to comments so no throwaway str objects
# are built; behavior is otherwise unchanged.
# base_dir = Path(os.path.dirname(__file__))
# config_file_path = base_dir / "src" / "llm_document_parser" / "config.py"
# config_file_path = config_file_path.resolve()
# code_contents = config_file_path.read_text()
#
# def load_config():
#     return config_file_path.read_text()
#
# def save_config(updated_config):
#     config_file_path.write_text(updated_config)
#     importlib.reload(config)
#     return "Config updated successfully!"

# Gradio UI: shows the active configuration and runs the parsing pipeline.
with gr.Blocks() as demo:
    gr.Markdown(f"""
    # LLM Document Parser
    Checkout the GitHub repo for this Blueprint: https://github.com/oronadavid/llm-document-parser

    This app extracts structured data from a document using OCR and a local LLM.\n
    Selected OCR model: `{config.OCR_MODEL}`\n
    Selected LLM model: `{config.OLLAMA_MODEL}`\n
    Export format: `{config.EXPORT_TYPE}`\n
    Response Model: `{config.RESPONSE_MODEL.__name__}`
    """)

    file_input = gr.File(file_types=["image", ".pdf"], file_count="multiple", label="Upload Document(s) (Image/PDF)")

    run_button = gr.Button("Parse Documents")
    output_text = gr.JSON(label="Extracted Data")
    run_button.click(fn=run_full_pipeline, inputs=file_input, outputs=output_text)

    # (disabled) config-editor widgets, kept for reference:
    # gr.Markdown("""# Config
    # To update the config, make changes, then click "Update Config" below
    # """)
    # config_editor = gr.Code(code_contents, language="python", label="Config")
    # save_config_button = gr.Button("Update Config")
    # status = gr.Textbox(label="Status")
    #
    # demo.load(fn=load_config, outputs=config_editor)
    # save_config_button.click(fn=save_config, inputs=config_editor, outputs=status)

if __name__ == "__main__":
    demo.launch(share=True)
config.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # config.py
2
+ from pydantic import BaseModel
3
+ from datetime import date
4
+ from typing import List
5
+
6
# Options: "rapid", "easy", "ocrmac", "tesseract"
OCR_MODEL = "easy"

# Must be set when using the tesseract OCR model
# Linux: "/usr/share/tesseract-ocr/4.00/tessdata"
# Windows: "C:\\Program Files\\Tesseract-OCR\\tessdata"
# Mac: "/usr/local/share/tessdata" or "/opt/homebrew/share/tessdata"
TESSERACT_TESSDATA_LOCATION = "/usr/share/tesseract-ocr/4.00/tessdata"

OLLAMA_MODEL = "llama3:instruct"

# System prompt sent to the LLM. The example previously misspelled
# "withdrawal" as "withdrawl", contradicting the allowed values it states.
LLM_PROMPT = """
Extract all transactions from the following statement. Each transaction must be returned as a JSON object with the fields: transaction_date (YYYY-MM-DD), description, amount, and transaction_type ('deposit' or 'withdrawal'). All of these must be returned as a list of JSON objects under a key called 'transactions'. Here is an example:
[
    {
        transaction_date: 2025-01-24,
        description: "Walmart",
        amount: 34.24,
        transaction_type: "withdrawal"
    }
]
"""

# Options: "csv", "json", "excel"
EXPORT_TYPE = "json"

# Can be a file or directory
INPUT_PATH = ""
OUTPUT_FOLDER = ""
OUTPUT_FILE_NAME = "output"
36
+
37
# Define Pydantic response models for instructor:

class BankStatementEntry(BaseModel):
    """A single transaction row extracted from a bank statement."""
    # str is also accepted — presumably so non-ISO date strings from the
    # LLM still validate instead of failing; TODO confirm against callers.
    transaction_date: date | None | str
    description: str | None
    amount: float | None
    #transaction_type: Literal['deposit', 'withdrawal', None]
    # Free-form on purpose; expected values are "deposit"/"withdrawal"
    # per LLM_PROMPT, but the model may emit variants.
    transaction_type: str | None

class BankStatement(BaseModel):
    """Container for all transactions parsed from one document."""
    transactions: List[BankStatementEntry] | None

# The model that LLM output will conform to
RESPONSE_MODEL = BankStatement
convert_doc_docling.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+ from docling.datamodel.document import ConversionResult
4
+ from huggingface_hub import snapshot_download
5
+
6
+ from docling.datamodel.base_models import InputFormat
7
+ from docling.datamodel.pipeline_options import EasyOcrOptions, OcrMacOptions, PdfPipeline, PdfPipelineOptions, PipelineOptions, RapidOcrOptions, TesseractOcrOptions
8
+ from docling.document_converter import DocumentConverter, ImageFormatOption, PdfFormatOption
9
+ from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend, PyPdfiumPageBackend
10
+ from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
11
+ from docling.pipeline.simple_pipeline import SimplePipeline
12
+
13
+
14
# TODO: REFACTOR LOAD OCR MODEL TO JUST EITHER USE SERVER MODELS OR MOBILE MODELS
def load_rapid_ocr_model(det_model: str, rec_model: str, cls_model: str) -> DocumentConverter:
    """
    Load the RapidOCR model from Hugging Face Hub.

    Args:
        det_model (str): Path to the detection model.
        rec_model (str): Path to the recognition model.
        cls_model (str): Path to the classification model.
    Returns:
        DocumentConverter: The loaded RapidOCR model.
    """
    print("Downloading RapidOCR models")
    # Model paths are relative to the snapshot downloaded from the Hub.
    repo_root = snapshot_download(repo_id="SWHL/RapidOCR")

    rapid_options = RapidOcrOptions(
        det_model_path=os.path.join(repo_root, det_model),
        rec_model_path=os.path.join(repo_root, rec_model),
        cls_model_path=os.path.join(repo_root, cls_model),
    )

    image_option = ImageFormatOption(
        pipeline_options=PdfPipelineOptions(ocr_options=rapid_options)
    )

    return DocumentConverter(
        format_options={InputFormat.IMAGE: image_option}
    )
57
+
58
def load_ocr_mac_model() -> DocumentConverter:
    """
    Load the OCR Mac model.

    Returns:
        DocumentConverter: The loaded OCR Mac model.
    """
    pipeline_options = PdfPipelineOptions(
        ocr_options=OcrMacOptions(framework='vision')
    )

    def _format_option() -> PdfFormatOption:
        # PDFs and images both go through the pypdfium-backed standard pipeline.
        return PdfFormatOption(
            pipeline_cls=StandardPdfPipeline,
            backend=PyPdfiumDocumentBackend,
            pipeline_options=pipeline_options,
        )

    return DocumentConverter(
        allowed_formats=[InputFormat.PDF, InputFormat.IMAGE],
        format_options={
            InputFormat.PDF: _format_option(),
            InputFormat.IMAGE: _format_option(),
        },
    )
88
+
89
def load_tesseract_model(tessdata_path: str) -> DocumentConverter:
    """
    Load the Tesseract OCR model.

    Args:
        tessdata_path (str): Path to the Tesseract data directory.
    Returns:
        DocumentConverter: The loaded Tesseract OCR model.
    """
    # Tesseract locates its language data through this environment variable.
    os.environ["TESSDATA_PREFIX"] = tessdata_path

    pipeline_options = PdfPipelineOptions(ocr_options=TesseractOcrOptions())

    def _format_option() -> PdfFormatOption:
        # PDFs and images both go through the pypdfium-backed standard pipeline.
        return PdfFormatOption(
            pipeline_cls=StandardPdfPipeline,
            backend=PyPdfiumDocumentBackend,
            pipeline_options=pipeline_options,
        )

    return DocumentConverter(
        allowed_formats=[InputFormat.PDF, InputFormat.IMAGE],
        format_options={
            InputFormat.PDF: _format_option(),
            InputFormat.IMAGE: _format_option(),
        },
    )
121
+
122
def load_easy_ocr_model() -> DocumentConverter:
    """
    Load the EasyOCR model.

    Returns:
        DocumentConverter: The loaded EasyOCR model.
    """
    pipeline_options = PdfPipelineOptions(ocr_options=EasyOcrOptions())

    def _format_option() -> PdfFormatOption:
        # PDFs and images both go through the pypdfium-backed standard pipeline.
        return PdfFormatOption(
            pipeline_cls=StandardPdfPipeline,
            backend=PyPdfiumDocumentBackend,
            pipeline_options=pipeline_options,
        )

    return DocumentConverter(
        allowed_formats=[InputFormat.PDF, InputFormat.IMAGE],
        format_options={
            InputFormat.PDF: _format_option(),
            InputFormat.IMAGE: _format_option(),
        },
    )
149
+
150
def image_to_text(document_converter: DocumentConverter, file_path: Path) -> ConversionResult:
    """
    Convert an image to text using the specified document converter.

    Args:
        document_converter (DocumentConverter): The document converter to use.
        file_path (Path): Path to the image file.
    Returns:
        ConversionResult: The result of the conversion.
    """
    return document_converter.convert(file_path)
export_data.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from pathlib import Path
3
+ import json
4
+ from typing import List
5
+
6
+
7
def convert_json_to_df(json_data: str) -> pd.DataFrame:
    """
    Convert a JSON string into a pandas DataFrame.

    Automatically extracts the first top-level list when the payload is a
    JSON object wrapping one (e.g. {"transactions": [...]}); a payload that
    is already a JSON list is used as-is.

    Args:
        json_data (str): JSON text — either a list of records, or an object
            containing a list of records under some key.
    Returns:
        pd.DataFrame: One row per record.
    """
    data = json.loads(json_data)

    # Previous version assumed a dict and raised AttributeError on a
    # top-level JSON list — the exact shape the LLM prompt's example shows.
    if isinstance(data, dict):
        for value in data.values():
            if isinstance(value, list):
                data = value
                break

    return pd.DataFrame(data)
25
+
26
def combine_json_data_into_df(json_data_objects: List[str]) -> pd.DataFrame:
    """
    Parse each JSON string and concatenate the results into one DataFrame.

    Args:
        json_data_objects (List[str]): JSON payloads, one per document.
    Returns:
        pd.DataFrame: All records combined, with a fresh 0..n-1 index.
    """
    # Guard: pd.concat([]) raises "No objects to concatenate".
    if not json_data_objects:
        return pd.DataFrame()

    frames = [convert_json_to_df(json_object) for json_object in json_data_objects]
    # ignore_index avoids duplicated index labels across documents.
    return pd.concat(frames, ignore_index=True)
32
+
33
def export_as_csv(df: pd.DataFrame, output_folder: str, output_file_name: str) -> str:
    """
    Save a DataFrame as a CSV file, avoiding overwriting by incrementing filenames.

    Args:
        df (pd.DataFrame): Data to write.
        output_folder (str): Destination directory (created if missing).
        output_file_name (str): Base file name; an index suffix is appended.
    Returns:
        str: The CSV content that was written, as a string.
    """
    target_dir = Path(output_folder)
    if not target_dir.is_dir():
        print(f"Creating path {output_folder}")
        target_dir.mkdir(parents=True)

    # Pick the first output0.csv, output1.csv, ... that does not exist yet.
    suffix = 0
    candidate = target_dir / f"{output_file_name}{suffix}.csv"
    while candidate.exists():
        suffix += 1
        candidate = target_dir / f"{output_file_name}{suffix}.csv"

    df.to_csv(candidate, index=False)
    print(f"Saved CSV to {candidate}")
    return df.to_csv(path_or_buf=None, index=False)
52
+
53
+
54
def export_as_json(df: pd.DataFrame, output_folder: str, output_file_name: str) -> str:
    """
    Save raw JSON string to a file, avoiding overwriting by incrementing filenames.

    Args:
        df (pd.DataFrame): Data to write.
        output_folder (str): Destination directory (created if missing).
        output_file_name (str): Base file name; an index suffix is appended.
    Returns:
        str: The JSON content (records orientation), or "" if serialization
        returned None.
    """
    destination = Path(output_folder)
    if not destination.is_dir():
        print(f"Creating path {output_folder}")
        destination.mkdir(parents=True)

    # Pick the first output0.json, output1.json, ... that does not exist yet.
    counter = 0
    out_path = destination / f"{output_file_name}{counter}.json"
    while out_path.exists():
        counter += 1
        out_path = destination / f"{output_file_name}{counter}.json"

    df.to_json(out_path, orient='records')
    print(f"Saved JSON to {out_path}")
    return df.to_json(orient='records') or ""
instructor_llm.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import instructor
2
+ from openai import OpenAI
3
+ from pydantic import BaseModel
4
+ from typing import Type
5
+
6
+ import ollama
7
+
8
def pull_ollama_model(model: str):
    """
    Pull a model from ollama if it is not already downloaded.

    Args:
        model (str): Model name; ":latest" is appended when no tag is given.
    """
    # Idiomatic membership test (was model.__contains__(":")).
    if ":" not in model:
        model += ":latest"

    for downloaded_model in ollama.list()["models"]:
        if downloaded_model['model'] == model:
            print(f"Model {downloaded_model['model']} is installed")
            return

    print(f"Model {model} is not installed")
    print(f"Downloading {model} model...")
    ollama.pull(model)
23
+
24
def extract_json_data_using_ollama_llm(prompt: str, text_data: str, ollama_model: str, response_model: Type[BaseModel]) -> str:
    """
    Pass prompt and data into an ollama LLM using instructor.

    Args:
        prompt (str): System prompt describing the extraction task.
        text_data (str): Document text passed as the user message.
        ollama_model (str): Name of the ollama model to query.
        response_model (Type[BaseModel]): Pydantic model the reply must match.
    Returns:
        str: The validated response serialized as indented JSON.
    """
    # ollama exposes an OpenAI-compatible endpoint; the API key is unused.
    ollama_client = OpenAI(
        base_url="http://localhost:11434/v1",
        api_key="ollama"
    )
    client = instructor.from_openai(ollama_client, mode=instructor.Mode.JSON)

    messages = [
        {'role': 'system', 'content': prompt},
        {'role': 'user', 'content': text_data},
    ]
    result = client.chat.completions.create(
        model=ollama_model,
        messages=messages,
        response_model=response_model,
        max_retries=3
    )

    return result.model_dump_json(indent=4)
requirements.txt ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==24.1.0
2
+ aiohappyeyeballs==2.6.1
3
+ aiohttp==3.11.16
4
+ aiosignal==1.3.2
5
+ annotated-types==0.7.0
6
+ anyio==4.9.0
7
+ async-timeout==5.0.1
8
+ attrs==25.3.0
9
+ beautifulsoup4==4.13.4
10
+ certifi==2025.1.31
11
+ charset-normalizer==3.4.1
12
+ click==8.1.8
13
+ coloredlogs==15.0.1
14
+ dill==0.4.0
15
+ distro==1.9.0
16
+ docling==2.30.0
17
+ docling-core==2.26.4
18
+ docling-ibm-models==3.4.1
19
+ docling-parse==4.0.1
20
+ docstring_parser==0.16
21
+ easyocr==1.7.2
22
+ et_xmlfile==2.0.0
23
+ exceptiongroup==1.2.2
24
+ fastapi==0.115.12
25
+ ffmpy==0.5.0
26
+ filelock==3.18.0
27
+ filetype==1.2.0
28
+ flatbuffers==25.2.10
29
+ frozenlist==1.5.0
30
+ fsspec==2025.3.2
31
+ gradio==5.27.1
32
+ gradio_client==1.9.1
33
+ groovy==0.1.2
34
+ h11==0.14.0
35
+ httpcore==1.0.8
36
+ httpx==0.28.1
37
+ huggingface-hub==0.30.2
38
+ humanfriendly==10.0
39
+ idna==3.10
40
+ imageio==2.37.0
41
+ instructor==1.7.9
42
+ Jinja2==3.1.6
43
+ jiter==0.8.2
44
+ jsonlines==3.1.0
45
+ jsonref==1.1.0
46
+ jsonschema==4.23.0
47
+ jsonschema-specifications==2024.10.1
48
+ latex2mathml==3.77.0
49
+ lazy_loader==0.4
50
+ -e git+ssh://git@github.com/oronadavid/llm-document-parser.git@467ef6e3183983d82ed35a4fdc3cbdf78ab44952#egg=llm_document_parser_blueprint
51
+ loguru==0.7.3
52
+ lxml==5.3.2
53
+ markdown-it-py==3.0.0
54
+ marko==2.1.3
55
+ MarkupSafe==3.0.2
56
+ mdurl==0.1.2
57
+ mpire==2.10.2
58
+ mpmath==1.3.0
59
+ multidict==6.4.3
60
+ multiprocess==0.70.17
61
+ networkx==3.4.2
62
+ ninja==1.11.1.4
63
+ numpy==2.2.4
64
+ nvidia-cublas-cu12==12.4.5.8
65
+ nvidia-cuda-cupti-cu12==12.4.127
66
+ nvidia-cuda-nvrtc-cu12==12.4.127
67
+ nvidia-cuda-runtime-cu12==12.4.127
68
+ nvidia-cudnn-cu12==9.1.0.70
69
+ nvidia-cufft-cu12==11.2.1.3
70
+ nvidia-curand-cu12==10.3.5.147
71
+ nvidia-cusolver-cu12==11.6.1.9
72
+ nvidia-cusparse-cu12==12.3.1.170
73
+ nvidia-cusparselt-cu12==0.6.2
74
+ nvidia-nccl-cu12==2.21.5
75
+ nvidia-nvjitlink-cu12==12.4.127
76
+ nvidia-nvtx-cu12==12.4.127
77
+ ollama==0.4.7
78
+ onnxruntime==1.21.0
79
+ onnxruntime-gpu==1.21.0
80
+ openai==1.74.0
81
+ opencv-python==4.11.0.86
82
+ opencv-python-headless==4.11.0.86
83
+ openpyxl==3.1.5
84
+ orjson==3.10.16
85
+ packaging==24.2
86
+ pandas==2.2.3
87
+ pillow==11.2.1
88
+ pluggy==1.5.0
89
+ propcache==0.3.1
90
+ protobuf==6.30.2
91
+ pyclipper==1.3.0.post6
92
+ pydantic==2.11.3
93
+ pydantic-settings==2.8.1
94
+ pydantic_core==2.33.1
95
+ pydub==0.25.1
96
+ Pygments==2.19.1
97
+ pylatexenc==2.10
98
+ pypdfium2==4.30.1
99
+ python-bidi==0.6.6
100
+ python-dateutil==2.9.0.post0
101
+ python-docx==1.1.2
102
+ python-dotenv==1.1.0
103
+ python-multipart==0.0.20
104
+ python-pptx==1.0.2
105
+ pytz==2025.2
106
+ PyYAML==6.0.2
107
+ rapidocr-onnxruntime==1.4.4
108
+ referencing==0.36.2
109
+ regex==2024.11.6
110
+ requests==2.32.3
111
+ rich==13.9.4
112
+ rpds-py==0.24.0
113
+ rtree==1.4.0
114
+ ruff==0.11.7
115
+ safehttpx==0.1.6
116
+ safetensors==0.5.3
117
+ scikit-image==0.25.2
118
+ scipy==1.15.2
119
+ semantic-version==2.10.0
120
+ semchunk==2.2.2
121
+ shapely==2.1.0
122
+ shellingham==1.5.4
123
+ six==1.17.0
124
+ sniffio==1.3.1
125
+ soupsieve==2.6
126
+ starlette==0.46.2
127
+ sympy==1.13.1
128
+ tabulate==0.9.0
129
+ tenacity==9.1.2
130
+ tesserocr==2.8.0
131
+ tifffile==2025.3.30
132
+ tokenizers==0.21.1
133
+ tomlkit==0.13.2
134
+ torch==2.6.0
135
+ torchvision==0.21.0
136
+ tqdm==4.67.1
137
+ transformers==4.51.3
138
+ triton==3.2.0
139
+ typer==0.15.2
140
+ typing-inspection==0.4.0
141
+ typing_extensions==4.13.2
142
+ tzdata==2025.2
143
+ urllib3==2.4.0
144
+ uvicorn==0.34.2
145
+ websockets==15.0.1
146
+ XlsxWriter==3.2.2
147
+ yarl==1.19.0