chipling committed on
Commit
277f5b9
·
verified ·
1 Parent(s): ea88869

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -65
app.py CHANGED
@@ -1,50 +1,26 @@
1
  import os
2
-
3
- # --- CPU OPTIMIZATION FLAGS (Must be set before importing Paddle) ---
4
- # Hugging Face Free Tier has 2 vCPUs. Limiting threads prevents overhead.
5
- os.environ["OMP_NUM_THREADS"] = "2"
6
-
7
- # DISABLE MKLDNN: PaddleOCR-VL transformer ops are currently incompatible
8
- # with the oneDNN instruction converter in Paddle's new executor.
9
- os.environ["FLAGS_use_mkldnn"] = "0"
10
-
11
  import shutil
12
- import json
13
  import tempfile
 
 
14
  from fastapi import FastAPI, UploadFile, File, HTTPException
15
- from paddleocr import PaddleOCRVL
16
- from PIL import Image
 
 
 
 
17
 
18
  app = FastAPI(
19
  title="Document Ingestion API",
20
- description="Optimized PaddleOCR-VL extraction"
21
  )
22
 
23
- print("Initializing PaddleOCR-VL Pipeline (MKLDNN Disabled)...")
24
- pipeline = PaddleOCRVL()
 
25
  print("Pipeline ready!")
26
 
27
- def optimize_image_for_vlm(file_path, max_dimension=1200):
28
- """
29
- Downscales large images to reduce the number of visual tokens the VLM
30
- has to process. This creates a massive speedup on CPU.
31
- """
32
- try:
33
- # Only attempt to resize if it's a standard image (ignore PDFs)
34
- if file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.webp')):
35
- with Image.open(file_path) as img:
36
- # Calculate the scaling factor if the image is too large
37
- if max(img.size) > max_dimension:
38
- ratio = max_dimension / max(img.size)
39
- new_size = (int(img.width * ratio), int(img.height * ratio))
40
-
41
- # Resize and overwrite the temporary file
42
- img = img.resize(new_size, Image.Resampling.LANCZOS)
43
- img.save(file_path)
44
- print(f"Image downscaled to {new_size} for faster CPU inference.")
45
- except Exception as e:
46
- print(f"Skipping image optimization: {e}")
47
-
48
  @app.post("/ingest")
49
  async def ingest_document(file: UploadFile = File(...)):
50
  if not file.filename:
@@ -58,41 +34,41 @@ async def ingest_document(file: UploadFile = File(...)):
58
  with open(file_path, "wb") as buffer:
59
  shutil.copyfileobj(file.file, buffer)
60
 
61
- # 2. Optimize image (massive CPU speedup)
62
- optimize_image_for_vlm(file_path)
 
 
63
 
64
- # 3. Predict
65
- output = pipeline.predict(file_path)
66
 
67
- parsed_pages = []
68
- for page_num, res in enumerate(output):
69
- md_path = os.path.join(temp_dir, f"page_{page_num + 1}.md")
70
- json_path = os.path.join(temp_dir, f"page_{page_num + 1}.json")
 
71
 
72
- res.save_to_markdown(save_path=md_path)
73
- res.save_to_json(save_path=json_path)
74
-
75
- with open(md_path, "r", encoding="utf-8") as f:
76
- md_content = f.read()
77
-
78
- with open(json_path, "r", encoding="utf-8") as f:
79
- json_content = json.load(f)
80
-
81
- parsed_pages.append({
82
- "page": page_num + 1,
83
- "markdown": md_content,
84
- "json_data": json_content
 
85
  })
86
-
87
  return {
88
  "status": "success",
89
  "filename": file.filename,
90
- "data": parsed_pages
91
  }
92
-
93
- except Exception as e:
94
- raise HTTPException(status_code=500, detail=str(e))
95
 
96
- @app.get("/")
97
- def health_check():
98
- return {"status": "active", "model": "PaddleOCR-VL (Optimized)"}
 
1
  import os
 
 
 
 
 
 
 
 
 
2
  import shutil
 
3
  import tempfile
4
+ import cv2
5
+ import numpy as np
6
  from fastapi import FastAPI, UploadFile, File, HTTPException
7
+
8
+ # --- CPU OPTIMIZATION FLAGS ---
9
+ os.environ["OMP_NUM_THREADS"] = "2"
10
+ os.environ["FLAGS_use_mkldnn"] = "1" # Back on! Works perfectly with PP-Structure
11
+
12
+ from paddleocr import PPStructure
13
 
14
  app = FastAPI(
15
  title="Document Ingestion API",
16
+ description="Lightweight PP-StructureV2 extraction"
17
  )
18
 
19
+ print("Initializing PP-Structure (MKLDNN Enabled)...")
20
+ # recovery=True helps fix layout issues, layout=True enables section detection
21
+ table_engine = PPStructure(show_log=False, layout=True, recovery=True)
22
  print("Pipeline ready!")
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  @app.post("/ingest")
25
  async def ingest_document(file: UploadFile = File(...)):
26
  if not file.filename:
 
34
  with open(file_path, "wb") as buffer:
35
  shutil.copyfileobj(file.file, buffer)
36
 
37
+ # 2. Read image for PP-Structure (expects cv2 numpy array)
38
+ img = cv2.imread(file_path)
39
+ if img is None:
40
+ raise ValueError("Could not read image file.")
41
 
42
+ # 3. Predict layout and extract
43
+ result = table_engine(img)
44
 
45
+ # 4. Format the output cleanly
46
+ structured_data = []
47
+ for region in result:
48
+ # region is a dict with keys like: 'type', 'bbox', 'res'
49
+ block_type = region.get('type') # e.g., 'text', 'title', 'table', 'figure'
50
 
51
+ content = ""
52
+ if block_type == 'table':
53
+ content = region.get('res', {}).get('html', '')
54
+ elif block_type in ['text', 'title', 'list']:
55
+ # Extract joined text lines
56
+ lines = region.get('res', [])
57
+ content = "\n".join([line.get('text', '') for line in lines if 'text' in line])
58
+ elif block_type == 'figure':
59
+ content = "[Image cropped by layout engine]"
60
+
61
+ structured_data.append({
62
+ "type": block_type,
63
+ "bounding_box": region.get('bbox'),
64
+ "content": content
65
  })
66
+
67
  return {
68
  "status": "success",
69
  "filename": file.filename,
70
+ "data": structured_data
71
  }
 
 
 
72
 
73
+ except Exception as e:
74
+ raise HTTPException(status_code=500, detail=str(e))