adAstra144 committed on
Commit
57228d5
·
verified ·
1 Parent(s): febefdd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -3
app.py CHANGED
@@ -1,10 +1,15 @@
 
1
  from fastapi import FastAPI, UploadFile
2
  from doctr.models import ocr_predictor
3
  from doctr.io import DocumentFile
4
 
 
 
 
 
5
  app = FastAPI()
6
 
7
- # Initialize DocTR OCR model
8
  model = ocr_predictor(pretrained=True)
9
 
10
  @app.post("/ocr")
@@ -14,11 +19,11 @@ async def extract_text(file: UploadFile):
14
  with open(image_path, "wb") as f:
15
  f.write(await file.read())
16
 
17
- # Read the document
18
  doc = DocumentFile.from_images(image_path)
19
  result = model(doc)
20
 
21
- # Extract text from all pages, blocks, lines, words
22
  text = " ".join([
23
  word.value
24
  for page in result.pages
 
1
+ import os
2
  from fastapi import FastAPI, UploadFile
3
  from doctr.models import ocr_predictor
4
  from doctr.io import DocumentFile
5
 
6
+ # Set DocTR cache directory before the pretrained model is loaded
7
+ os.environ["DOCTR_CACHE_DIR"] = "/app/.cache"
8
+ os.makedirs("/app/.cache", exist_ok=True)
9
+
10
  app = FastAPI()
11
 
12
+ # Initialize DocTR OCR model once at startup
13
  model = ocr_predictor(pretrained=True)
14
 
15
  @app.post("/ocr")
 
19
  with open(image_path, "wb") as f:
20
  f.write(await file.read())
21
 
22
+ # Read document and run OCR
23
  doc = DocumentFile.from_images(image_path)
24
  result = model(doc)
25
 
26
+ # Extract all words as a single string
27
  text = " ".join([
28
  word.value
29
  for page in result.pages