Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, File, HTTPException | |
| from fastapi.responses import JSONResponse | |
| import pdfplumber | |
| from io import BytesIO | |
| import base64 | |
# Module-level FastAPI application instance, constructed with no arguments.
| app = FastAPI() | |
def _point_xy(point):
    """Return the ``(x, y)`` pair for a single curve point.

    Accepts both a mapping with ``"x"``/``"y"`` keys (the shape the previous
    implementation indexed) and a two-item sequence.
    """
    if isinstance(point, dict):
        return point["x"], point["y"]
    return point[0], point[1]


def _curve_to_svg_path(curve):
    """Build a simplified SVG path string from a curve's points.

    The first point becomes a move-to (``M``) and every following point a
    line-to (``L``); a path made only of ``M`` commands draws nothing.
    Returns an empty string when the curve carries no points.
    """
    # NOTE(review): pdfplumber documents curve points under "pts" as (x, y)
    # tuples; "points" is kept as a fallback because the previous code read
    # that key -- confirm against the installed pdfplumber version.
    points = curve.get("pts")
    if points is None:
        points = curve.get("points") or []

    commands = []
    for index, point in enumerate(points):
        x, y = _point_xy(point)
        commands.append(f"{'M' if index == 0 else 'L'} {x},{y}")
    return " ".join(commands)


# NOTE(review): no route decorator is visible on this function in the reviewed
# text -- confirm that it is registered on `app` (e.g. via @app.post).
async def convert_pdf(file: bytes = File(...)):
    """Extract text, images and curves from the first page of a PDF.

    Args:
        file: Raw bytes of the uploaded PDF.

    Returns:
        A ``JSONResponse`` whose body holds the page ``width`` and ``height``
        plus three lists:

        * ``texts``: one entry per character (content, ``x0``/``y0``
          position, font family, font size, and a hard-coded black color).
        * ``images``: one entry per image (base64-encoded stream data,
          ``x0``/``y0`` position, width, height).
        * ``shapes``: one entry per curve (simplified SVG path, ``x0``/``y0``
          position, and a hard-coded black color).

    Raises:
        HTTPException: 400 when the PDF has no pages; 500 (with the
            underlying error message as ``detail``) for any other failure
            while parsing or extracting.
    """
    try:
        with pdfplumber.open(BytesIO(file)) as pdf:
            # Guard the index below so an empty document yields a clear
            # client error instead of "list index out of range".
            if not pdf.pages:
                raise HTTPException(
                    status_code=400, detail="PDF contains no pages"
                )

            # Only the first page is processed, for simplicity.
            page = pdf.pages[0]

            texts = [
                {
                    "content": char["text"],
                    "x": char["x0"],
                    "y": char["y0"],
                    # Keep only the part after the last "+" in the font
                    # name; fall back to "Arial" when that part is empty.
                    "font_family": char["fontname"].split("+")[-1] or "Arial",
                    # Style is not derived here; always reported as Regular.
                    "font_style": "Regular",
                    "font_size": char["size"],
                    # Simplified: text is assumed black (no color extraction).
                    "color": {"r": 0, "g": 0, "b": 0},
                }
                for char in page.chars
            ]

            images = [
                {
                    # Raw stream bytes, base64-encoded so they fit in JSON.
                    "data": base64.b64encode(
                        img["stream"].get_data()
                    ).decode("utf-8"),
                    "x": img["x0"],
                    "y": img["y0"],
                    "width": img["width"],
                    "height": img["height"],
                }
                for img in page.images
            ]

            shapes = [
                {
                    "path": _curve_to_svg_path(curve),
                    "x": curve["x0"],
                    "y": curve["y0"],
                    # Simplified: shapes are assumed black.
                    "color": {"r": 0, "g": 0, "b": 0},
                }
                for curve in page.curves
            ]

            result = {
                "width": page.width,
                "height": page.height,
                "texts": texts,
                "images": images,
                "shapes": shapes,
            }

        return JSONResponse(content=result)
    except HTTPException:
        # Deliberate HTTP errors raised above must not be rewrapped as 500.
        raise
    except Exception as e:
        # Endpoint boundary: surface any parsing/extraction failure as a 500
        # while keeping the original exception chained for debugging.
        raise HTTPException(status_code=500, detail=str(e)) from e