chipling committed on
Commit
1bcd2bb
·
verified ·
1 Parent(s): d03efa1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -13
app.py CHANGED
@@ -1,49 +1,75 @@
1
  import os
 
 
 
 
 
 
 
2
  import shutil
3
  import json
4
  import tempfile
5
  from fastapi import FastAPI, UploadFile, File, HTTPException
6
  from paddleocr import PaddleOCRVL
 
7
 
8
- # Initialize FastAPI
9
  app = FastAPI(
10
  title="Document Ingestion API",
11
- description="PaddleOCR-VL Markdown/JSON extraction"
12
  )
13
 
14
- # Load the model once globally when the server starts
15
- print("Initializing PaddleOCR-VL Pipeline...")
16
  pipeline = PaddleOCRVL()
17
  print("Pipeline ready!")
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  @app.post("/ingest")
20
  async def ingest_document(file: UploadFile = File(...)):
21
  if not file.filename:
22
  raise HTTPException(status_code=400, detail="No file provided")
23
 
24
- # Use a TemporaryDirectory to handle creation and auto-cleanup safely
25
  with tempfile.TemporaryDirectory() as temp_dir:
26
  file_path = os.path.join(temp_dir, file.filename)
27
 
28
  try:
29
- # Save the uploaded file temporarily
30
  with open(file_path, "wb") as buffer:
31
  shutil.copyfileobj(file.file, buffer)
 
 
 
32
 
33
- # Run the VLM prediction
34
  output = pipeline.predict(file_path)
35
 
36
  parsed_pages = []
37
-
38
  for page_num, res in enumerate(output):
39
  md_path = os.path.join(temp_dir, f"page_{page_num + 1}.md")
40
  json_path = os.path.join(temp_dir, f"page_{page_num + 1}.json")
41
 
42
- # Save using the model's native methods
43
  res.save_to_markdown(save_path=md_path)
44
  res.save_to_json(save_path=json_path)
45
 
46
- # Read the contents back to send in the HTTP response
47
  with open(md_path, "r", encoding="utf-8") as f:
48
  md_content = f.read()
49
 
@@ -59,14 +85,12 @@ async def ingest_document(file: UploadFile = File(...)):
59
  return {
60
  "status": "success",
61
  "filename": file.filename,
62
- "total_pages": len(parsed_pages),
63
  "data": parsed_pages
64
  }
65
 
66
  except Exception as e:
67
  raise HTTPException(status_code=500, detail=str(e))
68
 
69
- # Hugging Face Spaces routes health checks to the root
70
  @app.get("/")
71
  def health_check():
72
- return {"status": "active", "model": "PaddleOCR-VL"}
 
import os

# --- CPU OPTIMIZATION FLAGS (must be set BEFORE paddle is imported) ---
# Hugging Face Free Tier has 2 vCPUs; capping OpenMP threads prevents
# thread-oversubscription overhead.
os.environ["OMP_NUM_THREADS"] = "2"
# Enables Intel oneDNN (MKL-DNN) CPU math acceleration in Paddle.
os.environ["FLAGS_use_mkldnn"] = "1"

import shutil
import json
import tempfile
from fastapi import FastAPI, UploadFile, File, HTTPException
from paddleocr import PaddleOCRVL
from PIL import Image

# Application object; the route decorators further down attach to it.
app = FastAPI(
    title="Document Ingestion API",
    description="Optimized PaddleOCR-VL extraction"
)

# Build the OCR pipeline once at import time so every request reuses the
# (expensive to construct) model instead of re-initializing per call.
print("Initializing PaddleOCR-VL Pipeline (MKLDNN Enabled)...")
pipeline = PaddleOCRVL()
print("Pipeline ready!")
24
 
25
def optimize_image_for_vlm(file_path, max_dimension=1200):
    """Downscale an oversized raster image in place to speed up CPU inference.

    Fewer pixels means fewer visual tokens for the VLM to process, which is
    the dominant cost on a CPU-only host. Non-raster files (e.g. PDFs) are
    left untouched, and any failure is logged and swallowed so ingestion
    always proceeds with the original file (best-effort optimization).

    Args:
        file_path: Path to the uploaded file; overwritten when resized.
        max_dimension: Longest allowed side in pixels; larger images are
            scaled down proportionally.
    """
    try:
        # Only attempt to resize standard raster formats (ignore PDFs etc.).
        if not file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.webp')):
            return
        with Image.open(file_path) as img:
            longest_side = max(img.size)
            if longest_side <= max_dimension:
                return
            ratio = max_dimension / longest_side
            # max(1, ...) guards extreme aspect ratios where int() would
            # round a side down to 0 and make resize() raise ValueError.
            new_size = (
                max(1, int(img.width * ratio)),
                max(1, int(img.height * ratio)),
            )
            resized = img.resize(new_size, Image.Resampling.LANCZOS)
        # Save only after the source handle is closed by the `with` block,
        # so overwriting the same path is safe on every platform.
        resized.save(file_path)
        print(f"Image downscaled to {new_size} for faster CPU inference.")
    except Exception as e:
        # Best-effort: a failed optimization must never block ingestion.
        print(f"Skipping image optimization: {e}")
45
+
46
  @app.post("/ingest")
47
  async def ingest_document(file: UploadFile = File(...)):
48
  if not file.filename:
49
  raise HTTPException(status_code=400, detail="No file provided")
50
 
 
51
  with tempfile.TemporaryDirectory() as temp_dir:
52
  file_path = os.path.join(temp_dir, file.filename)
53
 
54
  try:
55
+ # 1. Save file
56
  with open(file_path, "wb") as buffer:
57
  shutil.copyfileobj(file.file, buffer)
58
+
59
+ # 2. Optimize image (massive CPU speedup)
60
+ optimize_image_for_vlm(file_path)
61
 
62
+ # 3. Predict
63
  output = pipeline.predict(file_path)
64
 
65
  parsed_pages = []
 
66
  for page_num, res in enumerate(output):
67
  md_path = os.path.join(temp_dir, f"page_{page_num + 1}.md")
68
  json_path = os.path.join(temp_dir, f"page_{page_num + 1}.json")
69
 
 
70
  res.save_to_markdown(save_path=md_path)
71
  res.save_to_json(save_path=json_path)
72
 
 
73
  with open(md_path, "r", encoding="utf-8") as f:
74
  md_content = f.read()
75
 
 
85
  return {
86
  "status": "success",
87
  "filename": file.filename,
 
88
  "data": parsed_pages
89
  }
90
 
91
  except Exception as e:
92
  raise HTTPException(status_code=500, detail=str(e))
93
 
 
94
  @app.get("/")
95
def health_check():
    """Report service liveness; Hugging Face Spaces pings the root route."""
    status_payload = {
        "status": "active",
        "model": "PaddleOCR-VL (Optimized)",
    }
    return status_payload