File size: 2,443 Bytes
a479f18
59f237a
a479f18
59f237a
 
8809763
59f237a
8809763
59f237a
 
8809763
a479f18
 
 
 
59f237a
a479f18
 
 
 
 
 
 
 
8809763
a479f18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59f237a
a479f18
 
 
 
 
 
 
 
 
 
59f237a
a479f18
 
 
 
 
 
 
 
 
 
 
 
 
59f237a
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from fastapi import FastAPI, File, HTTPException
from fastapi.responses import JSONResponse
import pdfplumber
from io import BytesIO
import base64

# Application instance; the @app.post route in this module is registered on it.
app = FastAPI()

def _svg_path_from_points(points):
    """Build a simplified SVG path ("M x,y L x,y ...") from curve points.

    Each point may be an ``(x, y)`` sequence or a mapping with ``"x"`` and
    ``"y"`` keys. An empty point list yields an empty string.
    """
    commands = []
    for index, point in enumerate(points):
        if isinstance(point, dict):
            x, y = point["x"], point["y"]
        else:
            x, y = point[0], point[1]
        # Only the first point is a move-to; repeating "M" for every point
        # would never draw a segment.
        commands.append(f"{'M' if index == 0 else 'L'} {x},{y}")
    return " ".join(commands)


def _extract_texts(page):
    """Return one text entry per character object on the page."""
    texts = []
    for char in page.chars:
        # Subset fonts are named like "ABCDEF+RealName"; keep the part after "+".
        font_name = (char.get("fontname") or "").split("+")[-1] or "Arial"
        texts.append({
            "content": char["text"],
            "x": char["x0"],
            "y": char["y0"],
            "font_family": font_name,
            "font_style": "Regular",  # style is not derived from the char object here
            "font_size": char["size"],
            # Simplified: text is always reported as black.
            "color": {"r": 0, "g": 0, "b": 0},
        })
    return texts


def _extract_images(page):
    """Return one entry per image on the page, with base64-encoded stream data."""
    images = []
    for img in page.images:
        # NOTE(review): these are the stream's bytes as returned by get_data();
        # whether they form a directly displayable image format is not checked.
        img_data = img["stream"].get_data()
        images.append({
            "data": base64.b64encode(img_data).decode("utf-8"),
            "x": img["x0"],
            "y": img["y0"],
            "width": img["width"],
            "height": img["height"],
        })
    return images


def _extract_shapes(page):
    """Return one entry per curve on the page, with a simplified SVG path."""
    shapes = []
    for curve in page.curves:
        # The point list is looked up under "pts" first, then "points".
        # NOTE(review): which key is present depends on the installed
        # pdfplumber release — confirm against the pinned version.
        points = curve.get("pts") or curve.get("points") or []
        shapes.append({
            "path": _svg_path_from_points(points),
            "x": curve["x0"],
            "y": curve["y0"],
            # Simplified: shapes are always reported as black.
            "color": {"r": 0, "g": 0, "b": 0},
        })
    return shapes


@app.post("/api/convert")
async def convert_pdf(file: bytes = File(...)):
    """Convert the first page of an uploaded PDF into a JSON description.

    Args:
        file: Raw bytes of the uploaded PDF.

    Returns:
        A JSONResponse whose body has the page ``width`` and ``height`` plus
        three lists: ``texts`` (one entry per character), ``images``
        (base64-encoded stream data with position and size) and ``shapes``
        (simplified SVG paths for curves).

    Raises:
        HTTPException: 400 when the upload is empty or the PDF has no pages;
            500 (with the underlying error message as detail) for any other
            failure while parsing or serializing.
    """
    if not file:
        raise HTTPException(status_code=400, detail="Uploaded file is empty")

    # NOTE(review): parsing below is synchronous inside an async handler;
    # consider offloading it if large PDFs are expected.
    try:
        with pdfplumber.open(BytesIO(file)) as pdf:
            if not pdf.pages:
                raise HTTPException(status_code=400, detail="PDF contains no pages")

            page = pdf.pages[0]  # Only the first page is processed
            result = {
                "width": page.width,
                "height": page.height,
                "texts": _extract_texts(page),
                "images": _extract_images(page),
                "shapes": _extract_shapes(page),
            }

        return JSONResponse(content=result)
    except HTTPException:
        # Deliberate client-error responses must not be rewrapped as 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e