Spaces:

vaibreact
/

audiolens-backend

Build error

App Files Files Community

Vaibhav Gaikwad committed on Mar 23

Commit

cf220c1

1 Parent(s): 374083e

optimise: ocr on cpu, reduce tts gpu reservation

Browse files

Files changed (1) hide show

app.py +13 -13

app.py CHANGED Viewed

@@ -64,9 +64,9 @@ dit_model     = AutoModelForImageClassification.from_pretrained('microsoft/dit-b
 dit_model.eval()
 print('dit-base loaded.')
-# -- ocr: easyocr (lazy-init inside gpu function so it binds to cuda) --
 ocr_reader = None
-print('easyocr will lazy-init on first ocr request.')
 # -- tts: kokoro --
 import soundfile as sf
@@ -87,9 +87,9 @@ def pil_to_cv2(pil_image):
     return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
-# ============================================================
-# -- gpu functions --
-# ============================================================
 @spaces.GPU
 def classify_fn(image):
@@ -122,17 +122,17 @@ def classify_fn(image):
         return {'error': str(e)}
-@spaces.GPU
 def ocr_gpu(clean_image):
     """
     runs easyocr on a preprocessed image.
-    easyocr lazy-inits on first call so it binds to cuda.
     """
     global ocr_reader
     if ocr_reader is None:
         import easyocr
-        ocr_reader = easyocr.Reader(['en'], gpu=True, verbose=False)
-        print('easyocr initialised on gpu.')
     results = ocr_reader.readtext(clean_image, detail=0)
     return ' '.join(results)
@@ -143,8 +143,8 @@ def ocr_fn(image):
     extracts text from a document image.
     called via gradio api: /call/ocr
-    preprocessing (deskew, denoise, contrast, binarise) runs on cpu
-    before the gpu function is called for ocr inference.
     input:  pil image (gradio Image component with type="pil")
     output: extracted text string
@@ -159,7 +159,7 @@ def ocr_fn(image):
         # preprocessing runs on cpu — outside the gpu function
         clean = preprocess(cv2_image)
-        # ocr inference on gpu
         text = ocr_gpu(clean)
         return text
@@ -167,7 +167,7 @@ def ocr_fn(image):
         return f'error: {str(e)}'
-@spaces.GPU(duration=30)
 def speak_fn(text, voice):
     """
     converts text to speech using kokoro.

 dit_model.eval()
 print('dit-base loaded.')
+# -- ocr: easyocr (lazy-init on first call, runs on cpu to save gpu quota) --
 ocr_reader = None
+print('easyocr will lazy-init on first ocr request (cpu).')
 # -- tts: kokoro --
 import soundfile as sf
     return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
+# -- endpoint functions --
+# classify and tts run on gpu; ocr (preprocessing + easyocr) runs on cpu
+# to save gpu quota.
 @spaces.GPU
 def classify_fn(image):
         return {'error': str(e)}
 def ocr_gpu(clean_image):
     """
     runs easyocr on a preprocessed image.
+    despite the name, runs on cpu to save gpu quota — easyocr is fast enough on cpu.
+    the reader lazy-inits on first call and is reused afterwards.
     """
     global ocr_reader
     if ocr_reader is None:
         import easyocr
+        ocr_reader = easyocr.Reader(['en'], gpu=False, verbose=False)
+        print('easyocr initialised on cpu.')
     results = ocr_reader.readtext(clean_image, detail=0)
     return ' '.join(results)
     extracts text from a document image.
     called via gradio api: /call/ocr
+    both preprocessing and ocr run on cpu to save gpu quota.
+    easyocr is fast enough on cpu for document-sized images.
     input:  pil image (gradio Image component with type="pil")
     output: extracted text string
         # preprocessing runs on cpu — outside the gpu function
         clean = preprocess(cv2_image)
+        # ocr inference on cpu
         text = ocr_gpu(clean)
         return text
         return f'error: {str(e)}'
+@spaces.GPU(duration=15)
 def speak_fn(text, voice):
     """
     converts text to speech using kokoro.