# adentic / app.py
# seynath
# jj
# 04b9456
from fastapi import FastAPI, UploadFile, File
import time
import tempfile
import os
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling_core.transforms.chunker import HierarchicalChunker
# FastAPI application instance; the route decorators below register handlers on it.
app = FastAPI()
@app.get("/")
def greet_json():
    """Return a fixed greeting payload for GET requests to the root path."""
    greeting = {"Hello": "World!"}
    return greeting
@app.post("/upload-file")
def upload_file(file: UploadFile = File(...)):
    """Convert an uploaded document with docling and return its text chunks.

    The upload is written to a temporary ``.pdf`` file, converted with a
    ``DocumentConverter`` (OCR disabled, table-structure recognition
    enabled), and split with ``HierarchicalChunker``. The temporary file is
    removed before the function exits, whether or not processing succeeded.

    Args:
        file: The uploaded file from the multipart form field ``file``.

    Returns:
        A dict with the keys ``filename``, ``chunks`` (list of chunk texts),
        ``processing_time`` (elapsed seconds) and ``number_of_chunks``.

    Raises:
        Any exception raised while saving, converting or chunking propagates
        to the caller after the temporary file has been cleaned up.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments the way time.time() can.
    start = time.perf_counter()
    print("Processing file started")

    # Stays None until the temporary file actually exists, so the cleanup in
    # `finally` knows whether there is anything to remove.
    tmp_file_path = None
    try:
        # delete=False keeps the file on disk after the handle is closed so
        # the converter can reopen it by path; removal happens in `finally`.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            # Record the path first: if reading or writing the upload fails,
            # the partially written file is still cleaned up.
            tmp_file_path = tmp_file.name
            tmp_file.write(file.file.read())
        print(f"File saved to {tmp_file_path}")

        # PDF pipeline: skip OCR, keep table-structure recognition with
        # cell matching enabled.
        pipeline_options = PdfPipelineOptions(
            do_ocr=False,
            do_table_structure=True,
            table_structure_options={
                "do_cell_matching": True,
            },
        )

        # No explicit backend is passed; the converter uses its default one.
        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options,
                ),
            },
        )

        # Convert the saved upload and take the resulting document.
        result = converter.convert(tmp_file_path)
        dl_doc = result.document

        # Split the converted document into chunks.
        chunker = HierarchicalChunker(
            merge_list_items=True,
        )
        chunks = list(chunker.chunk(dl_doc=dl_doc))

        elapsed = time.perf_counter() - start
        print(f"Processing time: {elapsed:.2f} seconds")
        print(f"Number of chunks: {len(chunks)}")

        response = {
            "filename": file.filename,
            "chunks": [chunk.text for chunk in chunks],
            "processing_time": elapsed,
            "number_of_chunks": len(chunks),
        }
        # NOTE(review): this logs the text of every chunk; consider trimming
        # it if large documents make the output unwieldy.
        print(f"Response: {response}")
        return response
    finally:
        print("Cleaning up temporary file")
        if tmp_file_path is not None:
            # Attempt the removal directly rather than checking existence
            # first, which would leave a window between check and unlink.
            try:
                os.unlink(tmp_file_path)
            except FileNotFoundError:
                pass