Spaces:
Sleeping
Sleeping
Commit ·
a00eb68
1
Parent(s): f93c592
heh
Browse files
- Dockerfile +2 -0
- app.py +4 -5
Dockerfile
CHANGED
|
@@ -13,6 +13,8 @@ WORKDIR /app
|
|
| 13 |
|
| 14 |
COPY --chown=user ./requirements.txt requirements.txt
|
| 15 |
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
|
|
|
|
|
|
| 16 |
|
| 17 |
COPY --chown=user . .
|
| 18 |
|
|
|
|
| 13 |
|
| 14 |
COPY --chown=user ./requirements.txt requirements.txt
|
| 15 |
RUN pip install --no-cache-dir --upgrade -r requirements.txt
|
| 16 |
+
RUN apt-get update && \
|
| 17 |
+
apt-get install -y tesseract-ocr tesseract-ocr-eng libtesseract-dev
|
| 18 |
|
| 19 |
COPY --chown=user . .
|
| 20 |
|
app.py
CHANGED
|
@@ -619,13 +619,12 @@ async def download_and_process_file(url: str) -> Dict[str, Any]:
|
|
| 619 |
from PIL import Image
|
| 620 |
img = Image.open(io.BytesIO(content))
|
| 621 |
text = pytesseract.image_to_string(img)
|
| 622 |
-
b64 = base64.b64encode(content).decode('utf-8')
|
| 623 |
-
logger.info(f"[FILE] Image OCR: {len(text)} chars")
|
| 624 |
return {'type': 'image', 'ocr_text': text, 'size': img.size}
|
| 625 |
except Exception as e:
|
| 626 |
-
logger.warning(f"[FILE]
|
| 627 |
-
b64 = base64.b64encode(content
|
| 628 |
-
return {'type': 'image', '
|
|
|
|
| 629 |
|
| 630 |
elif ext in ['mp3', 'wav']:
|
| 631 |
b64 = base64.b64encode(content[:5000]).decode('utf-8')
|
|
|
|
| 619 |
from PIL import Image
|
| 620 |
img = Image.open(io.BytesIO(content))
|
| 621 |
text = pytesseract.image_to_string(img)
|
|
|
|
|
|
|
| 622 |
return {'type': 'image', 'ocr_text': text, 'size': img.size}
|
| 623 |
except Exception as e:
|
| 624 |
+
logger.warning(f"[FILE] OCR unavailable, sending base64 sample instead")
|
| 625 |
+
b64 = base64.b64encode(content).decode('utf-8')
|
| 626 |
+
return {'type': 'image', 'base64_sample': b64, 'ocr_error': str(e)}
|
| 627 |
+
|
| 628 |
|
| 629 |
elif ext in ['mp3', 'wav']:
|
| 630 |
b64 = base64.b64encode(content[:5000]).decode('utf-8')
|