abdullahmak committed on
Commit
a00eb68
·
1 Parent(s): f93c592
Files changed (2) hide show
  1. Dockerfile +2 -0
  2. app.py +4 -5
Dockerfile CHANGED
@@ -13,6 +13,8 @@ WORKDIR /app
13
 
14
  COPY --chown=user ./requirements.txt requirements.txt
15
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
 
 
16
 
17
  COPY --chown=user . .
18
 
 
13
 
14
  COPY --chown=user ./requirements.txt requirements.txt
15
  RUN pip install --no-cache-dir --upgrade -r requirements.txt
16
+ RUN apt-get update && \
17
+ apt-get install -y tesseract-ocr tesseract-ocr-eng libtesseract-dev
18
 
19
  COPY --chown=user . .
20
 
app.py CHANGED
@@ -619,13 +619,12 @@ async def download_and_process_file(url: str) -> Dict[str, Any]:
619
  from PIL import Image
620
  img = Image.open(io.BytesIO(content))
621
  text = pytesseract.image_to_string(img)
622
- b64 = base64.b64encode(content).decode('utf-8')
623
- logger.info(f"[FILE] Image OCR: {len(text)} chars")
624
  return {'type': 'image', 'ocr_text': text, 'size': img.size}
625
  except Exception as e:
626
- logger.warning(f"[FILE] Image OCR error: {e}")
627
- b64 = base64.b64encode(content[:1000]).decode('utf-8')
628
- return {'type': 'image', 'error': str(e), 'base64_sample': b64}
 
629
 
630
  elif ext in ['mp3', 'wav']:
631
  b64 = base64.b64encode(content[:5000]).decode('utf-8')
 
619
  from PIL import Image
620
  img = Image.open(io.BytesIO(content))
621
  text = pytesseract.image_to_string(img)
 
 
622
  return {'type': 'image', 'ocr_text': text, 'size': img.size}
623
  except Exception as e:
624
+ logger.warning(f"[FILE] OCR unavailable, sending base64 sample instead")
625
+ b64 = base64.b64encode(content).decode('utf-8')
626
+ return {'type': 'image', 'base64_sample': b64, 'ocr_error': str(e)}
627
+
628
 
629
  elif ext in ['mp3', 'wav']:
630
  b64 = base64.b64encode(content[:5000]).decode('utf-8')