|
|
from fastapi import FastAPI, UploadFile, File |
|
|
|
|
|
import time |
|
|
import tempfile |
|
|
import os |
|
|
from docling.document_converter import DocumentConverter, PdfFormatOption |
|
|
from docling.datamodel.base_models import InputFormat |
|
|
from docling.datamodel.pipeline_options import PdfPipelineOptions |
|
|
from docling_core.transforms.chunker import HierarchicalChunker |
|
|
|
|
|
|
|
|
# Application instance; the route decorators in this module register their
# handlers on it.
app = FastAPI() |
|
|
|
|
|
@app.get("/")
def greet_json():
    """Serve the root route with a fixed greeting payload."""
    greeting = {"Hello": "World!"}
    return greeting
|
|
|
|
|
@app.post("/upload-file")
def upload_file(file: UploadFile = File(...)):
    """Convert an uploaded PDF with Docling and return its text chunks.

    The upload is copied to a temporary ``.pdf`` file, converted with OCR
    disabled and table-structure recognition (with cell matching) enabled,
    then split into chunks with ``HierarchicalChunker``. The temporary file
    is removed before the function returns, whether or not processing
    succeeded.

    Args:
        file: The required upload (declared with ``File(...)``).

    Returns:
        A dict with the keys ``filename`` (name supplied with the upload),
        ``chunks`` (list of chunk texts), ``processing_time`` (elapsed
        seconds as a float) and ``number_of_chunks``.

    Raises:
        Any exception raised while copying, converting or chunking is
        propagated unchanged after the temporary file has been cleaned up.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments the way time.time() can.
    start = time.perf_counter()
    print("Processing file started")

    # delete=False because the converter is handed the path after this
    # handle has been closed; removal is done explicitly in `finally`.
    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    tmp_file_path = tmp_file.name

    # The try starts right after the file is created so that a failure
    # while copying the upload cannot leave the temporary file behind.
    try:
        copy_buffer_size = 1024 * 1024
        with tmp_file:
            # Copy in fixed-size pieces instead of loading the whole upload
            # into memory with a single read().
            for block in iter(lambda: file.file.read(copy_buffer_size), b""):
                tmp_file.write(block)

        print(f"File saved to {tmp_file_path}")

        pipeline_options = PdfPipelineOptions(
            do_ocr=False,
            do_table_structure=True,
            table_structure_options={
                "do_cell_matching": True,
            },
        )

        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                ),
            },
        )

        result = converter.convert(tmp_file_path)
        dl_doc = result.document

        chunker = HierarchicalChunker(
            merge_list_items=True,
        )
        chunks = list(chunker.chunk(dl_doc=dl_doc))

        elapsed = time.perf_counter() - start
        print(f"Processing time: {elapsed:.2f} seconds")
        print(f"Number of chunks: {len(chunks)}")

        response = {
            "filename": file.filename,
            "chunks": [chunk.text for chunk in chunks],
            "processing_time": elapsed,
            "number_of_chunks": len(chunks),
        }

        print(f"Response: {response}")

        return response
    finally:
        print("Cleaning up temporary file")
        try:
            os.unlink(tmp_file_path)
        except FileNotFoundError:
            # Already gone; nothing left to clean up (same outcome as the
            # former os.path.exists() guard, without the check-then-act gap).
            pass
|
|
|
|
|
|