"""FastAPI service exposing endpoints that extract per-page text from PDF files using gmft."""
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
import base64
import re

# Application instance; the route decorators further down register their handlers on it.
app = FastAPI()

from io import BytesIO
from gmft.pdf_bindings import PyPDFium2Document
from utils import get_page_text_with_tables, detector, formatter

def extract_text_from_pdf(pdf_bytes: bytes, page_numbers=None) -> list:
    """
    Extract per-page text from PDF bytes using gmft without temporary files.

    Args:
        pdf_bytes: Raw content of the PDF document.
        page_numbers: Optional iterable of zero-based page indices to
            extract. ``None`` or an empty collection selects every page.
            Duplicates are collapsed and indices that do not exist in the
            document are simply never matched.

    Returns:
        A list with one entry per selected page, in document order. Each
        entry is whatever ``get_page_text_with_tables`` returns for that page.

    Raises:
        Any exception raised while opening or processing the document is
        propagated to the caller; the document is closed first.
    """
    # Create a PyPDFium2Document directly from bytes
    doc = PyPDFium2Document(pdf_bytes)
    pages = []

    try:
        # ``None`` means "no filter". Built inside the try block so the
        # document is released even if page_numbers cannot be turned into a set.
        wanted = set(page_numbers) if page_numbers else None

        for page_num, page in enumerate(doc):
            try:
                if wanted is not None and page_num not in wanted:
                    continue
                tables = detector.extract(page)
                fmt_tables = [
                    formatter.extract(table, margin=(0, 0, 0, 0))
                    for table in tables
                ]
                pages.append(get_page_text_with_tables(page, fmt_tables))
            finally:
                # Runs for skipped pages as well, so every page handed out by
                # the iterator is closed before the document is.
                page.close()
    finally:
        doc.close()

    return pages

@app.get("/")
def greet_json():
    """Root endpoint: reply with a fixed hello-world JSON object."""
    greeting = {"Hello": "World!"}
    return greeting

@app.post("/extract-text")
async def extract_pdf_text(file: UploadFile = File(...), page_numbers: str = None):
    """
    Endpoint to extract text from an uploaded PDF file.

    Args:
        file: Uploaded file; its filename must end with ".pdf"
            (case-insensitive).
        page_numbers: Optional comma-separated integers (blank items are
            skipped), e.g. "0,2,5". They are forwarded to
            ``extract_text_from_pdf``, which compares them with zero-based
            page positions. When omitted or empty, all pages are extracted.

    Returns:
        On success, a dict with "filename" and "text" (the list of per-page
        results). A 400 JSONResponse when the file is not named like a PDF or
        page_numbers is malformed; a 500 JSONResponse when extraction fails.
    """
    # An upload may carry no filename at all; treat that like any other
    # non-PDF name instead of failing on ``None.lower()``.
    upload_name = file.filename or ""
    if not upload_name.lower().endswith('.pdf'):
        return JSONResponse(
            status_code=400,
            content={"error": "Only PDF files are supported"}
        )

    # Validate the cheap query parameter before reading the whole upload.
    parsed_page_numbers = None
    if page_numbers:
        try:
            # Convert comma-separated string to list of integers
            parsed_page_numbers = [int(p.strip()) for p in page_numbers.split(',') if p.strip()]
        except ValueError:
            return JSONResponse(
                status_code=400,
                content={"error": "Invalid page_numbers format. Use comma-separated integers."}
            )

    # Read the file content
    content = await file.read()

    try:
        # Extract text from PDF
        extracted_text = extract_text_from_pdf(content, parsed_page_numbers)
    except Exception as e:
        # Endpoint boundary: report any extraction failure as a JSON error.
        return JSONResponse(
            status_code=500,
            content={"error": f"Failed to extract text: {str(e)}"}
        )

    return {
        "filename": file.filename,
        "text": extracted_text
    }

@app.post("/extract-text-base64")
async def extract_pdf_text_base64(data: dict):
    """
    Endpoint to extract text from a PDF provided as a base64 encoded string.

    Request body keys:
        file: Required. Base64 text, either bare or as a data URL
            ("data:application/pdf;base64,...").
        filename: Optional name echoed back in the response
            (defaults to "unknown.pdf").
        page_numbers: Optional comma-separated string or array of integers,
            forwarded to ``extract_text_from_pdf``, which compares them with
            zero-based page positions. Array items that are neither int nor
            str are ignored.

    Returns:
        On success, a dict with "filename" and "text" (the list of per-page
        results). A 400 JSONResponse for a missing or non-string 'file',
        malformed page_numbers, a data URL without a base64 section, or
        undecodable base64; a 500 JSONResponse when extraction fails.
    """
    # Check if 'file' key exists in request
    if 'file' not in data:
        return JSONResponse(
            status_code=400,
            content={"error": "Missing 'file' field in request body"}
        )

    # Get the base64 encoded string
    base64_string = data['file']
    if not isinstance(base64_string, str):
        # A JSON number/array/null here is a client error, not a server fault.
        return JSONResponse(
            status_code=400,
            content={"error": "'file' must be a base64 encoded string"}
        )

    # Extract filename if provided
    filename = data.get('filename', 'unknown.pdf')

    # Extract page_numbers if provided
    page_numbers = data.get('page_numbers')
    parsed_page_numbers = None
    if page_numbers:
        try:
            # Handle both string and list formats
            if isinstance(page_numbers, str):
                parsed_page_numbers = [int(p.strip()) for p in page_numbers.split(',') if p.strip()]
            elif isinstance(page_numbers, list):
                parsed_page_numbers = [int(p) for p in page_numbers if isinstance(p, (int, str))]
            else:
                return JSONResponse(
                    status_code=400,
                    content={"error": "Invalid page_numbers format. Use comma-separated integers or array."}
                )
        except (ValueError, TypeError):
            return JSONResponse(
                status_code=400,
                content={"error": "Invalid page_numbers format. Use comma-separated integers or array."}
            )

    # Handle data URL format (e.g., "data:application/pdf;base64,...")
    if base64_string.startswith('data:'):
        # Keep everything after the first "base64," marker. partition() is
        # used instead of a "(.*)" regex because "." stops at the first
        # newline and would truncate line-wrapped payloads.
        _, marker, payload = base64_string.partition('base64,')
        if not marker:
            return JSONResponse(
                status_code=400,
                content={"error": "Invalid data URL format"}
            )
        base64_string = payload

    try:
        pdf_bytes = base64.b64decode(base64_string)
    except ValueError:
        # binascii.Error (bad padding) is a ValueError subclass; non-ASCII
        # input raises ValueError directly. Both are malformed client input.
        return JSONResponse(
            status_code=400,
            content={"error": "Invalid base64 data"}
        )

    try:
        # Extract text from PDF
        extracted_text = extract_text_from_pdf(pdf_bytes, parsed_page_numbers)
    except Exception as e:
        # Endpoint boundary: report any extraction failure as a JSON error.
        return JSONResponse(
            status_code=500,
            content={"error": f"Failed to process base64 PDF: {str(e)}"}
        )

    return {
        "filename": filename,
        "text": extracted_text
    }