File size: 2,405 Bytes
14f1ed8
 
 
 
 
 
 
 
 
 
201b062
 
 
 
 
 
14f1ed8
 
 
 
04b9456
14f1ed8
 
 
 
 
 
 
04b9456
14f1ed8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
04b9456
 
14f1ed8
 
 
04b9456
14f1ed8
 
 
04b9456
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import os
import shutil
import tempfile
import time

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling_core.transforms.chunker import HierarchicalChunker
from fastapi import FastAPI, UploadFile, File


# Application instance; the route decorators in this module register on it.
app = FastAPI()

@app.get("/")
def greet_json():
    """Return the fixed greeting payload served at the root route."""
    greeting = {"Hello": "World!"}
    return greeting

@app.post("/upload-file")
def upload_file(file: UploadFile = File(...)):
    """Convert an uploaded PDF with docling and return its text chunks.

    The upload is streamed to a temporary ``.pdf`` file, converted with a
    ``DocumentConverter`` (OCR disabled, table-structure recognition
    enabled), and split with ``HierarchicalChunker``. The temporary file is
    removed before the function exits, whether or not processing succeeded.

    Args:
        file: The multipart upload to process.

    Returns:
        A dict with the keys ``filename``, ``chunks`` (list of chunk texts),
        ``processing_time`` (elapsed seconds as a float) and
        ``number_of_chunks``.

    Raises:
        Exception: Any error raised while saving, converting, or chunking
            propagates unchanged once the temporary file has been cleaned up.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments during a long conversion.
    start = time.perf_counter()
    print("Processing file started")

    tmp_file_path = None
    try:
        # Create the temp file inside the try so that a failed copy still
        # reaches the cleanup in ``finally`` instead of leaking the file.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp_file:
            tmp_file_path = tmp_file.name
            # Stream in chunks rather than loading the whole upload in memory.
            shutil.copyfileobj(file.file, tmp_file)

        print(f"File saved to {tmp_file_path}")

        # PDF pipeline: skip OCR, keep table-structure recognition with
        # cell matching enabled.
        pipeline_options = PdfPipelineOptions(
            do_ocr=False,
            do_table_structure=True,
            table_structure_options={
                "do_cell_matching": True,
            }
        )

        # NOTE(review): the converter is rebuilt on every request; sharing one
        # instance may be cheaper, but confirm it is safe across the worker
        # threads that serve sync endpoints before doing so.
        converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(
                    pipeline_options=pipeline_options
                )
            }
        )

        # Convert the saved upload and take the parsed document.
        result = converter.convert(tmp_file_path)
        dl_doc = result.document

        # Chunking
        chunker = HierarchicalChunker(
            merge_list_items=True,
        )
        chunks = list(chunker.chunk(dl_doc=dl_doc))

        elapsed = time.perf_counter() - start
        print(f"Processing time: {elapsed:.2f} seconds")
        print(f"Number of chunks: {len(chunks)}")

        # The full response is intentionally not printed: it contains the
        # entire document text and would flood the logs on every request.
        return {
            "filename": file.filename,
            "chunks": [chunk.text for chunk in chunks],
            "processing_time": elapsed,
            "number_of_chunks": len(chunks),
        }

    finally:
        print("Cleaning up temporary file")
        if tmp_file_path is not None:
            try:
                os.unlink(tmp_file_path)
            except FileNotFoundError:
                # Already gone; nothing left to clean up.
                pass