chipling committed on
Commit
d03efa1
·
verified ·
1 Parent(s): e24fdcc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -43
app.py CHANGED
@@ -1,11 +1,15 @@
1
  import os
2
  import shutil
3
  import json
 
4
  from fastapi import FastAPI, UploadFile, File, HTTPException
5
  from paddleocr import PaddleOCRVL
6
 
7
  # Initialize FastAPI
8
- app = FastAPI(title="Document Ingestion API", description="PaddleOCR-VL Markdown/JSON extraction")
 
 
 
9
 
10
  # Load the model once globally when the server starts
11
  print("Initializing PaddleOCR-VL Pipeline...")
@@ -17,54 +21,50 @@ async def ingest_document(file: UploadFile = File(...)):
17
  if not file.filename:
18
  raise HTTPException(status_code=400, detail="No file provided")
19
 
20
- temp_dir = "temp_workspace"
21
- os.makedirs(temp_dir, exist_ok=True)
22
- file_path = os.path.join(temp_dir, file.filename)
23
-
24
- try:
25
- # Save the uploaded file temporarily
26
- with open(file_path, "wb") as buffer:
27
- shutil.copyfileobj(file.file, buffer)
28
-
29
- # Run the VLM prediction
30
- output = pipeline.predict(file_path)
31
-
32
- parsed_pages = []
33
 
34
- for page_num, res in enumerate(output):
35
- md_path = os.path.join(temp_dir, f"page_{page_num + 1}.md")
36
- json_path = os.path.join(temp_dir, f"page_{page_num + 1}.json")
 
 
 
 
37
 
38
- # Save using the model's native methods
39
- res.save_to_markdown(save_path=md_path)
40
- res.save_to_json(save_path=json_path)
41
 
42
- # Read the contents back to send in the HTTP response
43
- with open(md_path, "r", encoding="utf-8") as f:
44
- md_content = f.read()
45
 
46
- with open(json_path, "r", encoding="utf-8") as f:
47
- json_content = json.load(f)
 
48
 
49
- parsed_pages.append({
50
- "page": page_num + 1,
51
- "markdown": md_content,
52
- "json_data": json_content
53
- })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
- return {
56
- "status": "success",
57
- "filename": file.filename,
58
- "total_pages": len(parsed_pages),
59
- "data": parsed_pages
60
- }
61
-
62
- except Exception as e:
63
- raise HTTPException(status_code=500, detail=str(e))
64
-
65
- finally:
66
- # Clean up the temporary files so the server doesn't run out of storage
67
- shutil.rmtree(temp_dir, ignore_errors=True)
68
 
69
  # Hugging Face Spaces routes health checks to the root
70
  @app.get("/")
 
1
  import os
2
  import shutil
3
  import json
4
+ import tempfile
5
  from fastapi import FastAPI, UploadFile, File, HTTPException
6
  from paddleocr import PaddleOCRVL
7
 
8
  # Initialize FastAPI
9
+ app = FastAPI(
10
+ title="Document Ingestion API",
11
+ description="PaddleOCR-VL Markdown/JSON extraction"
12
+ )
13
 
14
  # Load the model once globally when the server starts
15
  print("Initializing PaddleOCR-VL Pipeline...")
 
21
  if not file.filename:
22
  raise HTTPException(status_code=400, detail="No file provided")
23
 
24
+ # Use a TemporaryDirectory to handle creation and auto-cleanup safely
25
+ with tempfile.TemporaryDirectory() as temp_dir:
26
+ file_path = os.path.join(temp_dir, file.filename)
 
 
 
 
 
 
 
 
 
 
27
 
28
+ try:
29
+ # Save the uploaded file temporarily
30
+ with open(file_path, "wb") as buffer:
31
+ shutil.copyfileobj(file.file, buffer)
32
+
33
+ # Run the VLM prediction
34
+ output = pipeline.predict(file_path)
35
 
36
+ parsed_pages = []
 
 
37
 
38
+ for page_num, res in enumerate(output):
39
+ md_path = os.path.join(temp_dir, f"page_{page_num + 1}.md")
40
+ json_path = os.path.join(temp_dir, f"page_{page_num + 1}.json")
41
 
42
+ # Save using the model's native methods
43
+ res.save_to_markdown(save_path=md_path)
44
+ res.save_to_json(save_path=json_path)
45
 
46
+ # Read the contents back to send in the HTTP response
47
+ with open(md_path, "r", encoding="utf-8") as f:
48
+ md_content = f.read()
49
+
50
+ with open(json_path, "r", encoding="utf-8") as f:
51
+ json_content = json.load(f)
52
+
53
+ parsed_pages.append({
54
+ "page": page_num + 1,
55
+ "markdown": md_content,
56
+ "json_data": json_content
57
+ })
58
+
59
+ return {
60
+ "status": "success",
61
+ "filename": file.filename,
62
+ "total_pages": len(parsed_pages),
63
+ "data": parsed_pages
64
+ }
65
 
66
+ except Exception as e:
67
+ raise HTTPException(status_code=500, detail=str(e))
 
 
 
 
 
 
 
 
 
 
 
68
 
69
  # Hugging Face Spaces routes health checks to the root
70
  @app.get("/")