"""FastAPI service that converts an uploaded PDF with docling and returns its text chunks."""

import contextlib
import os
import shutil
import tempfile
import time

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.transforms.chunker import HierarchicalChunker
from fastapi import FastAPI, File, UploadFile

app = FastAPI()


@app.get("/")
def greet_json():
    """Return a static greeting payload."""
    return {"Hello": "World!"}


def _build_converter() -> DocumentConverter:
    """Create a DocumentConverter whose PDF pipeline skips OCR and extracts tables."""
    # OCR is disabled; table structure recognition is enabled with cell matching.
    pipeline_options = PdfPipelineOptions(
        do_ocr=False,
        do_table_structure=True,
        table_structure_options={
            "do_cell_matching": True,
        },
    )
    # No backend is specified, so docling's default PDF backend is used.
    return DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
        }
    )


@app.post("/upload-file")
def upload_file(file: UploadFile = File(...)):
    """Convert an uploaded file with docling and return its hierarchical chunks.

    The upload is copied to a temporary ``.pdf`` file, converted, chunked with
    ``HierarchicalChunker``, and the temporary file is removed afterwards,
    whether or not processing succeeded.

    Args:
        file: The uploaded file (multipart form field ``file``).

    Returns:
        A dict with the keys ``filename``, ``chunks`` (list of chunk texts),
        ``processing_time`` (elapsed seconds) and ``number_of_chunks``.
    """
    # perf_counter is monotonic, so the elapsed time is immune to clock changes.
    start = time.perf_counter()
    print("Processing file started")

    tmp_file_path = None
    try:
        # Persist the upload to disk inside the try block so that a failure
        # while copying still reaches the cleanup in `finally`.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file_path = tmp_file.name
            # Stream the upload instead of loading it fully into memory.
            shutil.copyfileobj(file.file, tmp_file)
        print(f"File saved to {tmp_file_path}")

        # A converter is built per request, as in the original implementation.
        converter = _build_converter()

        # Convert the document from the temporary file.
        result = converter.convert(tmp_file_path)
        dl_doc = result.document

        # Chunking
        chunker = HierarchicalChunker(
            merge_list_items=True,
        )
        chunks = list(chunker.chunk(dl_doc=dl_doc))

        elapsed = time.perf_counter() - start
        print(f"Processing time: {elapsed:.2f} seconds")
        print(f"Number of chunks: {len(chunks)}")

        response = {
            "filename": file.filename,
            "chunks": [chunk.text for chunk in chunks],
            "processing_time": elapsed,
            "number_of_chunks": len(chunks),
        }
        print(f"Response: {response}")
        return response
    finally:
        print("Cleaning up temporary file")
        # tmp_file_path stays None if the temporary file was never created.
        if tmp_file_path is not None:
            with contextlib.suppress(FileNotFoundError):
                os.unlink(tmp_file_path)