AkashKumarave committed on
Commit
69f734d
·
verified ·
1 Parent(s): 4240c72

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -57
app.py CHANGED
@@ -1,72 +1,111 @@
1
  import fitz # PyMuPDF
2
  import base64
 
3
  from fastapi import FastAPI, File, UploadFile
4
  from fastapi.middleware.cors import CORSMiddleware
5
 
 
 
 
 
6
  app = FastAPI()
7
- app.add_middleware(CORSMiddleware, allow_origins=["*"], allow_methods=["*"], allow_headers=["*"])
 
 
 
 
 
 
 
8
 
9
  def normalize_color(color):
10
- if not color: return {"r": 0, "g": 0, "b": 0}
11
- # PyMuPDF colors can be 1 (gray), 3 (rgb), or 4 (cmyk) components
12
- if len(color) == 1: return {"r": color[0], "g": color[0], "b": color[0]}
13
- return {"r": color[0], "g": color[1], "b": color[2]}
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  @app.post("/convert")
16
  async def convert_pdf(file: UploadFile = File(...)):
17
- content = await file.read()
18
- doc = fitz.open(stream=content, filetype="pdf")
19
- pages_data = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- for page in doc:
22
- page_dict = {
23
- "width": page.rect.width,
24
- "height": page.rect.height,
25
- "elements": []
26
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
- # 1. Extract Text and Images using "dict"
29
- raw = page.get_text("dict")
30
- for block in raw["blocks"]:
31
- if block["type"] == 0: # TEXT
32
- for line in block["lines"]:
33
- for span in line["spans"]:
34
- page_dict["elements"].append({
35
- "type": "TEXT",
36
- "content": span["text"],
37
- "x": span["bbox"][0],
38
- "y": span["bbox"][1],
39
- "size": span["size"],
40
- "color": normalize_color(fitz.utils.getColor(span["color"]))
41
- })
42
- elif block["type"] == 1: # IMAGE
43
- page_dict["elements"].append({
44
- "type": "IMAGE",
45
- "bytes": base64.b64encode(block["image"]).decode("utf-8"),
46
- "x": block["bbox"][0], "y": block["bbox"][1],
47
- "width": block["bbox"][2] - block["bbox"][0],
48
- "height": block["bbox"][3] - block["bbox"][1]
49
- })
50
 
51
- # 2. Extract Vector Drawings
52
- for path in page.get_drawings():
53
- # Convert paths to SVG-like data for Figma's vectorPaths
54
- svg_path = ""
55
- for item in path["items"]:
56
- if item[0] == "l": svg_path += f"M {item[1].x} {item[1].y} L {item[2].x} {item[2].y} "
57
- elif item[0] == "c": svg_path += f"M {item[1].x} {item[1].y} C {item[2].x} {item[2].y} {item[3].x} {item[3].y} {item[4].x} {item[4].y} "
58
- elif item[0] == "re":
59
- r = item[1]
60
- svg_path += f"M {r.x0} {r.y0} L {r.x1} {r.y0} L {r.x1} {r.y1} L {r.x0} {r.y1} Z "
61
-
62
- if svg_path:
63
- page_dict["elements"].append({
64
- "type": "VECTOR",
65
- "path": svg_path.strip(),
66
- "fill": normalize_color(path.get("fill")),
67
- "stroke": normalize_color(path.get("color")),
68
- "strokeWeight": path.get("width", 1)
69
- })
70
 
71
- pages_data.append(page_dict)
72
- return {"pages": pages_data}
 
 
1
  import fitz # PyMuPDF
2
  import base64
3
+ import logging
4
  from fastapi import FastAPI, File, UploadFile
5
  from fastapi.middleware.cors import CORSMiddleware
6
 
7
# Configure logging so errors are visible in the Hugging Face Log tab.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI()

# Open CORS policy: any origin, method and header may call this API.
# Credentials stay disabled on purpose: a wildcard origin must not be
# combined with credentialed requests, and nothing in this service reads
# cookies or Authorization headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
20
 
21
  def normalize_color(color):
22
+ """Ensures color is always a 0-1 RGB dict for Figma"""
23
+ try:
24
+ if color is None: return {"r": 0, "g": 0, "b": 0}
25
+ # If color is an integer (from span['color'])
26
+ if isinstance(color, int):
27
+ color = fitz.utils.getColor(color)
28
+
29
+ if len(color) == 1: # Grayscale
30
+ return {"r": float(color[0]), "g": float(color[0]), "b": float(color[0])}
31
+ return {"r": float(color[0]), "g": float(color[1]), "b": float(color[2])}
32
+ except:
33
+ return {"r": 0, "g": 0, "b": 0}
34
+
35
@app.get("/")
async def root():
    """Report that the service is running.

    Returns:
        A dict with a single ``"status"`` message.
    """
    status_payload = {"status": "PDF Converter is Online"}
    return status_payload
38
 
39
  @app.post("/convert")
40
  async def convert_pdf(file: UploadFile = File(...)):
41
+ try:
42
+ logger.info(f"Processing file: {file.filename}")
43
+ content = await file.read()
44
+ doc = fitz.open(stream=content, filetype="pdf")
45
+ pages_data = []
46
+
47
+ for page in doc:
48
+ page_dict = {
49
+ "width": float(page.rect.width),
50
+ "height": float(page.rect.height),
51
+ "elements": []
52
+ }
53
+
54
+ # 1. Extract Text and Images
55
+ # Using "dict" captures font size, location, and images
56
+ raw_dict = page.get_text("dict")
57
+ for block in raw_dict["blocks"]:
58
+ if block["type"] == 0: # Text Block
59
+ for line in block["lines"]:
60
+ for span in line["spans"]:
61
+ page_dict["elements"].append({
62
+ "type": "TEXT",
63
+ "content": span["text"],
64
+ "x": float(span["bbox"][0]),
65
+ "y": float(span["bbox"][1]),
66
+ "size": float(span["size"]),
67
+ "color": normalize_color(span["color"])
68
+ })
69
+ elif block["type"] == 1: # Image Block
70
+ page_dict["elements"].append({
71
+ "type": "IMAGE",
72
+ "bytes": base64.b64encode(block["image"]).decode("utf-8"),
73
+ "x": float(block["bbox"][0]),
74
+ "y": float(block["bbox"][1]),
75
+ "width": float(block["bbox"][2] - block["bbox"][0]),
76
+ "height": float(block["bbox"][3] - block["bbox"][1])
77
+ })
78
 
79
+ # 2. Extract Vector Drawings (Paths)
80
+ for path in page.get_drawings():
81
+ svg_path = ""
82
+ for item in path["items"]:
83
+ if item[0] == "l": # line
84
+ svg_path += f"M {item[1].x} {item[1].y} L {item[2].x} {item[2].y} "
85
+ elif item[0] == "c": # curve
86
+ svg_path += f"M {item[1].x} {item[1].y} C {item[2].x} {item[2].y} {item[3].x} {item[3].y} {item[4].x} {item[4].y} "
87
+ elif item[0] == "re": # rectangle
88
+ r = item[1]
89
+ svg_path += f"M {r.x0} {r.y0} L {r.x1} {r.y0} L {r.x1} {r.y1} L {r.x0} {r.y1} Z "
90
+
91
+ if svg_path:
92
+ page_dict["elements"].append({
93
+ "type": "VECTOR",
94
+ "path": svg_path.strip(),
95
+ "fill": normalize_color(path.get("fill")),
96
+ "stroke": normalize_color(path.get("color")),
97
+ "strokeWeight": float(path.get("width", 1))
98
+ })
99
 
100
+ pages_data.append(page_dict)
101
+
102
+ doc.close()
103
+ return {"pages": pages_data}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
+ except Exception as e:
106
+ logger.error(f"Conversion Error: {str(e)}")
107
+ return {"error": str(e)}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn on all interfaces.
    import uvicorn

    uvicorn.run(
        app,
        host="0.0.0.0",
        port=7860,
    )