# ocr-minerva / app.py
# Shads229's picture
# Upload app.py
# 257344e verified
from fastapi import FastAPI, File, UploadFile, HTTPException
from fastapi.responses import JSONResponse
from paddleocr import PaddleOCR
import numpy as np
from PIL import Image
import fitz # PyMuPDF
import io
import os
# FastAPI application object that the route decorators in this file attach to.
app = FastAPI(
    title="OCR API",
    description="API d'extraction de texte pour PDF et images",
)

# One PaddleOCR engine per supported language (French and English), created
# once when the module is imported.
ocr_fr = PaddleOCR(lang='fr', use_angle_cls=True)
ocr_en = PaddleOCR(lang='en', use_angle_cls=True)
def extract_text_from_image(image_bytes, lang='fr'):
    """Run OCR on an encoded image and return the recognised text.

    Args:
        image_bytes: Raw bytes of an image file that Pillow can decode.
        lang: ``'fr'`` selects the French engine; any other value selects
            the English one.

    Returns:
        The recognised text lines joined with ``"\\n"``, or ``""`` when the
        OCR engine reports no result.

    Raises:
        HTTPException: Status 500 when decoding or OCR fails for any reason;
            the original exception is attached as the cause.
    """
    try:
        # The context manager releases the decoder once pixels are copied.
        with Image.open(io.BytesIO(image_bytes)) as image:
            has_alpha = (
                image.mode in ("RGBA", "LA") or "transparency" in image.info
            )
            if has_alpha:
                # Flatten transparency onto white: simply dropping the alpha
                # channel would expose whatever colour sits under transparent
                # pixels (often black), hiding dark text.
                rgba = image.convert("RGBA")
                rgb_image = Image.new("RGB", rgba.size, (255, 255, 255))
                rgb_image.paste(rgba, mask=rgba.split()[-1])
            else:
                # Palette, CMYK, 1-bit, ... would otherwise become index,
                # 4-channel or boolean arrays instead of regular pixels.
                rgb_image = image.convert("RGB")
            image_np = np.array(rgb_image)

        # Pick the OCR engine matching the requested language.
        ocr = ocr_fr if lang == 'fr' else ocr_en
        result = ocr.ocr(image_np)

        # Nothing detected: empty/None result, or a None first page.
        if not result or result[0] is None:
            return ""

        # Keep only the recognised string of each detected line.
        text_lines = [line[1][0] for line in result[0]]
        return "\n".join(text_lines)
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Erreur OCR image: {str(e)}"
        ) from e
def extract_text_from_pdf(pdf_bytes, lang='fr'):
    """Extract the text of every page of a PDF, using OCR where needed.

    Pages that expose a native text layer are read directly. Pages whose
    text layer is empty (or whitespace only) are rendered at 2x zoom to a
    PNG and passed to ``extract_text_from_image``.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        lang: Language code forwarded to ``extract_text_from_image`` for
            pages that need OCR.

    Returns:
        The per-page texts joined with a blank line (``"\\n\\n"``).

    Raises:
        HTTPException: Status 500 when the PDF cannot be read; an
            ``HTTPException`` raised by the OCR step is propagated as is.
    """
    try:
        all_text = []
        # The context manager closes the document even when a page fails,
        # which a trailing doc.close() would not.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            for page in doc:
                # Prefer the native text layer when the page has one.
                text = page.get_text()
                if not text.strip():
                    # No usable text layer: rasterise the page and OCR it.
                    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                    text = extract_text_from_image(pix.tobytes("png"), lang)
                all_text.append(text)
        return "\n\n".join(all_text)
    except HTTPException:
        # Already a complete API error from the OCR step; wrapping it again
        # would bury its message under a second prefix.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Erreur lecture PDF: {str(e)}"
        ) from e
@app.get("/")
async def root():
    # Landing endpoint: tells clients where to send files and lists the
    # accepted file formats and languages.
    formats = ["pdf", "jpg", "jpeg", "png", "bmp", "tiff", "webp"]
    languages = ["fr", "en"]
    payload = {
        "message": "OCR API - Envoyez vos fichiers à /extract",
        "supported_formats": formats,
        "languages": languages,
    }
    return payload
@app.post("/extract")
async def extract_text(
    file: UploadFile = File(...),
    lang: str = "fr"
):
    """
    Extract the text of an uploaded file (PDF or image)
    - **file**: File to process (PDF, JPG, PNG, etc.)
    - **lang**: Language (fr or en, default: fr)
    """
    # Validate the cheap inputs first so a rejected request never has its
    # upload read into memory.
    if lang not in ('fr', 'en'):
        raise HTTPException(status_code=400, detail="Langue non supportée. Utilisez 'fr' ou 'en'")

    # Guard against a missing filename instead of failing on None.lower().
    filename = file.filename or ""
    file_extension = filename.lower().split('.')[-1]
    is_pdf = file_extension == 'pdf'
    is_image = file_extension in ('jpg', 'jpeg', 'png', 'bmp', 'tiff', 'webp')
    if not (is_pdf or is_image):
        raise HTTPException(
            status_code=400,
            detail=f"Format non supporté: {file_extension}"
        )

    file_bytes = await file.read()
    if not file_bytes:
        # An empty upload is a client error, not a decoder failure (500).
        raise HTTPException(status_code=400, detail="Fichier vide")

    # Dispatch on the file type.
    if is_pdf:
        text = extract_text_from_pdf(file_bytes, lang)
    else:
        text = extract_text_from_image(file_bytes, lang)
    return JSONResponse(content={"text": text})
@app.get("/health")
async def health():
    # Health-check endpoint: always answers with the same fixed payload.
    status_payload = dict(status="ok")
    return status_payload