Spaces:

seynath
/

adentic

Sleeping

adentic / app.py

seynath

04b9456 9 months ago

2.41 kB

	from fastapi import FastAPI, UploadFile, File

	import time
	import tempfile
	import os
	from docling.document_converter import DocumentConverter, PdfFormatOption
	from docling.datamodel.base_models import InputFormat
	from docling.datamodel.pipeline_options import PdfPipelineOptions
	from docling_core.transforms.chunker import HierarchicalChunker


	# Application instance; the route decorators below register their handlers on it.
	app = FastAPI()

	@app.get("/")
	def greet_json():
	    """Handle GET requests on the root path with a fixed greeting mapping."""
	    greeting = {"Hello": "World!"}
	    return greeting

	@app.post("/upload-file")
	def upload_file(file: UploadFile = File(...)):
	    """Convert an uploaded document with docling and return its text chunks.

	    The upload is copied to a temporary ``.pdf`` file, converted with a
	    ``DocumentConverter`` (OCR off, table-structure recognition on with cell
	    matching), and split with ``HierarchicalChunker``. The temporary file is
	    removed before returning, whether or not processing succeeded.

	    Args:
	        file: The uploaded file from the multipart request body.

	    Returns:
	        A dict with the keys ``filename`` (the client-supplied name),
	        ``chunks`` (list of each chunk's ``text``), ``processing_time``
	        (elapsed seconds as a float) and ``number_of_chunks``.

	    Raises:
	        Nothing is caught here: errors from reading the upload, writing the
	        temporary file, or from docling propagate to the caller.
	    """
	    # perf_counter is monotonic, so the elapsed time cannot be skewed by
	    # wall-clock adjustments the way time.time() can.
	    start = time.perf_counter()
	    print("Processing file started")

	    # delete=False: the file must outlive the handle because the converter
	    # is given the path, not the open file object.
	    tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
	    tmp_file_path = tmp_file.name
	    try:
	        # The copy happens inside the try so that a failed read/write still
	        # reaches the cleanup in ``finally`` instead of leaking the file.
	        copy_block_size = 1024 * 1024
	        with tmp_file:
	            # Copy in fixed-size pieces rather than loading the whole upload
	            # into memory at once.
	            for block in iter(lambda: file.file.read(copy_block_size), b""):
	                tmp_file.write(block)

	        print(f"File saved to {tmp_file_path}")

	        # PDF pipeline: skip OCR, keep table-structure recognition with
	        # cell matching enabled.
	        pipeline_options = PdfPipelineOptions(
	            do_ocr=False,
	            do_table_structure=True,
	            table_structure_options={
	                "do_cell_matching": True,
	            },
	        )

	        # No explicit backend is passed; the PDF format option only carries
	        # the pipeline options above.
	        # NOTE(review): the converter is rebuilt on every request; consider
	        # reusing one instance if docling allows sharing it - confirm.
	        converter = DocumentConverter(
	            format_options={
	                InputFormat.PDF: PdfFormatOption(
	                    pipeline_options=pipeline_options,
	                ),
	            },
	        )

	        # Convert the document from the temporary path.
	        result = converter.convert(tmp_file_path)
	        dl_doc = result.document

	        # Chunking
	        chunker = HierarchicalChunker(
	            merge_list_items=True,
	        )
	        chunks = list(chunker.chunk(dl_doc=dl_doc))

	        elapsed = time.perf_counter() - start
	        print(f"Processing time: {elapsed:.2f} seconds")
	        print(f"Number of chunks: {len(chunks)}")

	        response = {
	            "filename": file.filename,
	            "chunks": [chunk.text for chunk in chunks],  # Extract text from chunks
	            "processing_time": elapsed,
	            "number_of_chunks": len(chunks),
	        }

	        print(f"Response: {response}")

	        return response

	    finally:
	        print("Cleaning up temporary file")
	        # Unlink directly instead of exists()-then-unlink, which is racy; an
	        # already-missing file is not an error for cleanup purposes.
	        try:
	            os.unlink(tmp_file_path)
	        except FileNotFoundError:
	            pass