chipling committed on
Commit
3865d02
·
verified ·
1 Parent(s): 277f5b9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -36
app.py CHANGED
@@ -1,24 +1,25 @@
1
  import os
2
  import shutil
3
  import tempfile
4
- import cv2
5
- import numpy as np
6
  from fastapi import FastAPI, UploadFile, File, HTTPException
7
 
8
  # --- CPU OPTIMIZATION FLAGS ---
 
9
  os.environ["OMP_NUM_THREADS"] = "2"
10
- os.environ["FLAGS_use_mkldnn"] = "1" # Back on! Works perfectly with PP-Structure
 
11
 
12
- from paddleocr import PPStructure
 
13
 
14
  app = FastAPI(
15
  title="Document Ingestion API",
16
- description="Lightweight PP-StructureV2 extraction"
17
  )
18
 
19
- print("Initializing PP-Structure (MKLDNN Enabled)...")
20
- # recovery=True helps fix layout issues, layout=True enables section detection
21
- table_engine = PPStructure(show_log=False, layout=True, recovery=True)
22
  print("Pipeline ready!")
23
 
24
  @app.post("/ingest")
@@ -34,40 +35,34 @@ async def ingest_document(file: UploadFile = File(...)):
34
  with open(file_path, "wb") as buffer:
35
  shutil.copyfileobj(file.file, buffer)
36
 
37
- # 2. Read image for PP-Structure (expects cv2 numpy array)
38
- img = cv2.imread(file_path)
39
- if img is None:
40
- raise ValueError("Could not read image file.")
41
-
42
- # 3. Predict layout and extract
43
- result = table_engine(img)
44
 
45
- # 4. Format the output cleanly
46
- structured_data = []
47
- for region in result:
48
- # region is a dict with keys like: 'type', 'bbox', 'res'
49
- block_type = region.get('type') # e.g., 'text', 'title', 'table', 'figure'
50
 
51
- content = ""
52
- if block_type == 'table':
53
- content = region.get('res', {}).get('html', '')
54
- elif block_type in ['text', 'title', 'list']:
55
- # Extract joined text lines
56
- lines = region.get('res', [])
57
- content = "\n".join([line.get('text', '') for line in lines if 'text' in line])
58
- elif block_type == 'figure':
59
- content = "[Image cropped by layout engine]"
60
-
61
- structured_data.append({
62
- "type": block_type,
63
- "bounding_box": region.get('bbox'),
64
- "content": content
65
  })
66
-
67
  return {
68
  "status": "success",
69
  "filename": file.filename,
70
- "data": structured_data
71
  }
72
 
73
  except Exception as e:
 
1
  import os
2
  import shutil
3
  import tempfile
4
+ import json
 
5
  from fastapi import FastAPI, UploadFile, File, HTTPException
6
 
7
  # --- CPU OPTIMIZATION FLAGS ---
8
+ # Limit threads for HF Free Tier (2 vCPUs)
9
  os.environ["OMP_NUM_THREADS"] = "2"
10
+ # MKLDNN works perfectly with PPStructureV3 for a massive Intel speedup
11
+ os.environ["FLAGS_use_mkldnn"] = "1"
12
 
13
+ # IMPORT THE NEW V3 PIPELINE
14
+ from paddleocr import PPStructureV3
15
 
16
  app = FastAPI(
17
  title="Document Ingestion API",
18
+ description="Lightweight PP-StructureV3 extraction"
19
  )
20
 
21
+ print("Initializing PP-StructureV3 (MKLDNN Enabled)...")
22
+ pipeline = PPStructureV3()
 
23
  print("Pipeline ready!")
24
 
25
  @app.post("/ingest")
 
35
  with open(file_path, "wb") as buffer:
36
  shutil.copyfileobj(file.file, buffer)
37
 
38
+ # 2. Predict (takes the file path directly, no cv2 needed!)
39
+ output = pipeline.predict(file_path)
 
 
 
 
 
40
 
41
+ parsed_pages = []
42
+ for page_num, res in enumerate(output):
43
+ # We can save to the temp directory exactly like the VLM pipeline
44
+ md_path = os.path.join(temp_dir, f"page_{page_num + 1}.md")
45
+ json_path = os.path.join(temp_dir, f"page_{page_num + 1}.json")
46
 
47
+ res.save_to_markdown(save_path=md_path)
48
+ res.save_to_json(save_path=json_path)
49
+
50
+ with open(md_path, "r", encoding="utf-8") as f:
51
+ md_content = f.read()
52
+
53
+ with open(json_path, "r", encoding="utf-8") as f:
54
+ json_content = json.load(f)
55
+
56
+ parsed_pages.append({
57
+ "page": page_num + 1,
58
+ "markdown": md_content,
59
+ "json_data": json_content
 
60
  })
61
+
62
  return {
63
  "status": "success",
64
  "filename": file.filename,
65
+ "data": parsed_pages
66
  }
67
 
68
  except Exception as e: