Spaces:
Sleeping
Sleeping
File size: 2,443 Bytes
a479f18 59f237a a479f18 59f237a 8809763 59f237a 8809763 59f237a 8809763 a479f18 59f237a a479f18 8809763 a479f18 59f237a a479f18 59f237a a479f18 59f237a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 |
from fastapi import FastAPI, File, HTTPException
from fastapi.responses import JSONResponse
import pdfplumber
from io import BytesIO
import base64
# ASGI application instance; the route handler below registers on it via @app.post.
app = FastAPI()
def _curve_points(curve):
    """Return the list of points of a pdfplumber curve object.

    Current pdfplumber releases document the key as ``pts``; ``points`` is
    accepted as a fallback (presumably used by older releases -- verify
    against the installed version). A curve without points yields ``[]``.
    """
    return curve.get("pts") or curve.get("points") or []


def _svg_path(points):
    """Build an SVG path string that connects ``points`` in order.

    Each point may be an ``(x, y)`` sequence or a mapping with ``"x"`` and
    ``"y"`` keys. The first point becomes a move-to (``M``) and every later
    point a line-to (``L``); a path made only of ``M`` commands draws nothing.
    """
    commands = []
    for index, point in enumerate(points):
        if isinstance(point, dict):
            x, y = point["x"], point["y"]
        else:
            x, y = point[0], point[1]
        commands.append(f"{'M' if index == 0 else 'L'} {x},{y}")
    return " ".join(commands)


@app.post("/api/convert")
async def convert_pdf(file: bytes = File(...)):
    """Convert the first page of an uploaded PDF into a JSON description.

    Args:
        file: Raw bytes of the uploaded PDF.

    Returns:
        A ``JSONResponse`` whose body holds the page ``width`` and ``height``
        plus three lists: ``texts`` (one entry per character), ``images``
        (base64-encoded stream data with position and size) and ``shapes``
        (one SVG path per curve). Coordinates are pdfplumber's ``x0``/``y0``
        values, passed through unchanged.

    Raises:
        HTTPException: 400 when the PDF has no pages; 500 with the error text
            as ``detail`` when parsing or extraction fails for any other
            reason.
    """
    # NOTE(review): pdfplumber parsing is synchronous, so it runs on the event
    # loop inside this coroutine; consider offloading if large files are expected.
    try:
        with pdfplumber.open(BytesIO(file)) as pdf:
            if not pdf.pages:
                # Explicit client error instead of an opaque IndexError -> 500.
                raise HTTPException(status_code=400, detail="PDF contains no pages")

            page = pdf.pages[0]  # Only the first page is processed.

            texts = [
                {
                    "content": char["text"],
                    "x": char["x0"],
                    "y": char["y0"],
                    # Drop the subset prefix ("ABCDEF+Name" -> "Name").
                    "font_family": char["fontname"].split("+")[-1] or "Arial",
                    "font_style": "Regular",  # Style is not extracted here.
                    "font_size": char["size"],
                    # Simplified: text colour is always reported as black.
                    "color": {"r": 0, "g": 0, "b": 0},
                }
                for char in page.chars
            ]

            images = [
                {
                    # Raw stream bytes, base64-encoded so they fit in JSON.
                    "data": base64.b64encode(img["stream"].get_data()).decode("utf-8"),
                    "x": img["x0"],
                    "y": img["y0"],
                    "width": img["width"],
                    "height": img["height"],
                }
                for img in page.images
            ]

            shapes = [
                {
                    "path": _svg_path(_curve_points(curve)),
                    "x": curve["x0"],
                    "y": curve["y0"],
                    # Simplified: shape colour is always reported as black.
                    "color": {"r": 0, "g": 0, "b": 0},
                }
                for curve in page.curves
            ]

            result = {
                "width": page.width,
                "height": page.height,
                "texts": texts,
                "images": images,
                "shapes": shapes,
            }
        return JSONResponse(content=result)
    except HTTPException:
        # Deliberate HTTP errors raised above must not be rewrapped as 500s.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e