# File size: 2,405 Bytes
from fastapi import FastAPI, UploadFile, File
import time
import tempfile
import os
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_core.transforms.chunker import HierarchicalChunker
# FastAPI application instance on which the route handlers in this file are registered.
app = FastAPI()
@app.get("/")
def greet_json():
    """Return the fixed greeting payload served at the root route."""
    greeting = {"Hello": "World!"}
    return greeting
def _build_pdf_converter():
    """Create a docling converter for PDFs with OCR off and table structure on."""
    pipeline_options = PdfPipelineOptions(
        do_ocr=False,
        do_table_structure=True,
        table_structure_options={
            "do_cell_matching": True,
        },
    )
    # No backend is passed to PdfFormatOption, so docling uses its default one.
    return DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options),
        }
    )


@app.post("/upload-file")
def upload_file(file: UploadFile = File(...)):
    """Convert an uploaded document with docling and return its text chunks.

    The upload is copied to a temporary ``.pdf`` file, converted with a
    ``DocumentConverter``, split with ``HierarchicalChunker``, and the
    temporary file is removed again before the function exits.

    Args:
        file: The uploaded file received from the multipart form.

    Returns:
        A dict with the keys ``filename`` (the upload's original name),
        ``chunks`` (the ``text`` of every chunk), ``processing_time``
        (elapsed seconds) and ``number_of_chunks``.

    Raises:
        Nothing is caught here: any exception raised while saving, converting
        or chunking propagates to the caller after the temporary file has
        been cleaned up.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments that happen while the request is running.
    start = time.perf_counter()
    print("Processing file started")

    tmp_file_path = None
    try:
        # The file is created with delete=False so it survives the ``with``
        # block; creating it inside the ``try`` guarantees it is removed even
        # when reading or writing the upload fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file_path = tmp_file.name
            tmp_file.write(file.file.read())
        print(f"File saved to {tmp_file_path}")

        converter = _build_pdf_converter()

        # Convert document using the uploaded file
        result = converter.convert(tmp_file_path)
        dl_doc = result.document

        # Chunking
        chunker = HierarchicalChunker(
            merge_list_items=True,
        )
        chunks = list(chunker.chunk(dl_doc=dl_doc))

        elapsed = time.perf_counter() - start
        print(f"Processing time: {elapsed:.2f} seconds")
        print(f"Number of chunks: {len(chunks)}")

        response = {
            "filename": file.filename,
            "chunks": [chunk.text for chunk in chunks],  # Extract text from chunks
            "processing_time": elapsed,
            "number_of_chunks": len(chunks),
        }
        print(f"Response: {response}")
        return response
    finally:
        if tmp_file_path is not None:
            print("Cleaning up temporary file")
            try:
                os.unlink(tmp_file_path)
            except FileNotFoundError:
                # Already gone; nothing left to clean up.
                pass
|