AkashKumarave committed on
Commit
a479f18
·
verified ·
1 Parent(s): 2f699bc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -41
app.py CHANGED
@@ -1,6 +1,6 @@
1
- from fastapi import FastAPI, HTTPException
2
  from fastapi.responses import JSONResponse
3
- from pdf2json import Pdf2Json
4
  from io import BytesIO
5
  import base64
6
 
@@ -9,49 +9,60 @@ app = FastAPI()
9
  @app.post("/api/convert")
10
  async def convert_pdf(file: bytes = File(...)):
11
  try:
12
- # Parse PDF
13
- pdf_parser = Pdf2Json(BytesIO(file))
14
- pdf_data = pdf_parser.get_json()
 
15
 
16
- # Process PDF data
17
- result = {
18
- "width": pdf_data["width"], # Page width in pixels
19
- "height": pdf_data["height"], # Page height in pixels
20
- "texts": [],
21
- "images": [],
22
- "shapes": []
23
- }
24
 
25
- # Extract text
26
- for text in pdf_data["texts"]:
27
- result["texts"].append({
28
- "content": text["content"],
29
- "x": text["x"],
30
- "y": text["y"],
31
- "fontFamily": text["font"] or "Arial",
32
- "fontStyle": text["style"] or "Regular",
33
- "fontSize": text["size"],
34
- "color": {"r": text["color"]["r"]/255, "g": text["color"]["g"]/255, "b": text["color"]["b"]/255}
35
- })
 
 
 
 
36
 
37
- # Extract images
38
- for img in pdf_data["images"]:
39
- result["images"].append({
40
- "data": base64.b64encode(img["data"]).decode('utf-8'),
41
- "x": img["x"],
42
- "y": img["y"],
43
- "width": img["width"],
44
- "height": img["height"]
45
- })
 
46
 
47
- # Extract shapes
48
- for shape in pdf_data["shapes"]:
49
- result["shapes"].append({
50
- "path": shape["path"],
51
- "x": shape["x"],
52
- "y": shape["y"],
53
- "color": {"r": shape["color"]["r"]/255, "g": shape["color"]["g"]/255, "b": shape["color"]["b"]/255}
54
- })
 
 
 
 
 
55
 
56
  return JSONResponse(content=result)
57
  except Exception as e:
 
1
+ from fastapi import FastAPI, File, HTTPException
2
  from fastapi.responses import JSONResponse
3
+ import pdfplumber
4
  from io import BytesIO
5
  import base64
6
 
 
9
  @app.post("/api/convert")
10
  async def convert_pdf(file: bytes = File(...)):
11
  try:
12
+ # Parse PDF with pdfplumber
13
+ with pdfplumber.open(BytesIO(file)) as pdf:
14
+ page = pdf.pages[0] # Process first page for simplicity
15
+ width, height = page.width, page.height
16
 
17
+ # Initialize result
18
+ result = {
19
+ "width": width,
20
+ "height": height,
21
+ "texts": [],
22
+ "images": [],
23
+ "shapes": []
24
+ }
25
 
26
+ # Extract text
27
+ for char in page.chars:
28
+ result["texts"].append({
29
+ "content": char["text"],
30
+ "x": char["x0"],
31
+ "y": char["y0"],
32
+ "font_family": char["fontname"].split("+")[-1] or "Arial",
33
+ "font_style": "Regular", # pdfplumber doesn't provide style directly
34
+ "font_size": char["size"],
35
+ "color": {
36
+ "r": 0, # Simplified: assume black text (enhance with actual color extraction if needed)
37
+ "g": 0,
38
+ "b": 0
39
+ }
40
+ })
41
 
42
+ # Extract images
43
+ for img in page.images:
44
+ img_data = img["stream"].get_data() # Raw image data
45
+ result["images"].append({
46
+ "data": base64.b64encode(img_data).decode('utf-8'),
47
+ "x": img["x0"],
48
+ "y": img["y0"],
49
+ "width": img["width"],
50
+ "height": img["height"]
51
+ })
52
 
53
+ # Extract shapes (basic lines/curves)
54
+ for curve in page.curves:
55
+ path = " ".join([f"M {p['x']},{p['y']}" for p in curve["points"]]) # Simplified SVG path
56
+ result["shapes"].append({
57
+ "path": path,
58
+ "x": curve["x0"],
59
+ "y": curve["y0"],
60
+ "color": {
61
+ "r": 0,
62
+ "g": 0,
63
+ "b": 0
64
+ }
65
+ })
66
 
67
  return JSONResponse(content=result)
68
  except Exception as e: