Spaces:
Running
Running
| from fastapi import FastAPI, File, UploadFile, HTTPException | |
| from fastapi.responses import JSONResponse | |
| from paddleocr import PaddleOCR | |
| import numpy as np | |
| from PIL import Image | |
| import fitz # PyMuPDF | |
| import io | |
| import os | |
app = FastAPI(
    title="OCR API",
    description="API d'extraction de texte pour PDF et images",
)

# Build one PaddleOCR engine per supported language (French first, then
# English) at import time so every request reuses the already-loaded models.
ocr_fr, ocr_en = (
    PaddleOCR(use_angle_cls=True, lang=language_code)
    for language_code in ('fr', 'en')
)
def _to_rgb_array(image):
    """Return a PIL image as an RGB NumPy array, flattening transparency onto white."""
    # Palette images may declare transparency in their metadata; promote them
    # to RGBA first so the alpha channel can be composited below.
    if image.mode == "P" and "transparency" in image.info:
        image = image.convert("RGBA")
    if image.mode in ("RGBA", "LA"):
        rgba = image.convert("RGBA")
        canvas = Image.new("RGB", rgba.size, (255, 255, 255))
        # Use the alpha band as the paste mask so transparent pixels stay white
        # instead of exposing whatever colour is stored underneath them.
        canvas.paste(rgba, mask=rgba.split()[3])
        return np.array(canvas)
    # Every other mode (L, 1, P, CMYK, ...) is normalised to three channels.
    return np.array(image.convert("RGB"))


def extract_text_from_image(image_bytes, lang='fr'):
    """Run OCR on an encoded image and return the recognised text.

    Args:
        image_bytes: Raw bytes of an image file that PIL can decode.
        lang: ``'fr'`` selects the French engine; any other value selects
            the English engine.

    Returns:
        The recognised lines joined with newlines, or an empty string when
        the engine reports no result.

    Raises:
        HTTPException: Status 500 if decoding or OCR fails for any reason;
            the original exception is attached as the cause.
    """
    try:
        # The context manager releases the decoder as soon as the pixel data
        # has been copied into the array.
        with Image.open(io.BytesIO(image_bytes)) as image:
            image_np = _to_rgb_array(image)

        # Pick the OCR engine matching the requested language.
        ocr = ocr_fr if lang == 'fr' else ocr_en
        result = ocr.ocr(image_np)

        if not result or result[0] is None:
            return ""

        # Keep only the text component of each detected line.
        return "\n".join(line[1][0] for line in result[0])
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur OCR image: {str(e)}") from e
def extract_text_from_pdf(pdf_bytes, lang='fr'):
    """Extract the text of every page of a PDF, using OCR where needed.

    Pages that expose a native text layer are read directly. Pages whose
    native text is empty or whitespace-only are rendered to a PNG image and
    passed to ``extract_text_from_image``.

    Args:
        pdf_bytes: Raw bytes of the PDF document.
        lang: Language code forwarded to the OCR step.

    Returns:
        The text of all pages, in page order, separated by blank lines.

    Raises:
        HTTPException: Status 500 if the PDF cannot be opened or read.
            An ``HTTPException`` raised by the OCR step is propagated
            unchanged.
    """
    try:
        all_text = []
        # The context manager closes the document even when a page fails.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            for page in doc:
                # Prefer the native text layer.
                text = page.get_text()
                if not text.strip():
                    # No usable text: render the page at 2x scale and OCR it.
                    pix = page.get_pixmap(matrix=fitz.Matrix(2, 2))
                    text = extract_text_from_image(pix.tobytes("png"), lang)
                all_text.append(text)
        return "\n\n".join(all_text)
    except HTTPException:
        # Already a well-formed HTTP error (e.g. from the OCR step); do not
        # re-wrap it as a PDF read error.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Erreur lecture PDF: {str(e)}") from e
async def root():
    """Return a short description of the service and what it accepts."""
    # NOTE(review): no route decorator is visible in this view; confirm that
    # this coroutine is registered on the application.
    accepted_formats = ["pdf", "jpg", "jpeg", "png", "bmp", "tiff", "webp"]
    accepted_languages = ["fr", "en"]
    return dict(
        message="OCR API - Envoyez vos fichiers à /extract",
        supported_formats=accepted_formats,
        languages=accepted_languages,
    )
# Image extensions routed to the image OCR path.
_IMAGE_EXTENSIONS = frozenset({'jpg', 'jpeg', 'png', 'bmp', 'tiff', 'webp'})


async def extract_text(
    file: UploadFile = File(...),
    lang: str = "fr"
):
    """
    Extract the text of an uploaded file (PDF or image).

    - **file**: File to process (PDF, JPG, PNG, etc.)
    - **lang**: Language (fr or en, default: fr)

    Returns a JSON response of the form ``{"text": ...}``.

    Raises ``HTTPException`` with status 400 when the language is not
    supported, when the file extension is missing or not supported, or when
    the upload is empty. Errors raised by the extraction helpers propagate
    unchanged.
    """
    # NOTE(review): no route decorator is visible in this view; confirm that
    # this coroutine is registered on the application.
    # NOTE(review): the extraction helpers are called synchronously, so the
    # event loop is blocked while a file is being processed.

    # Validate the language.
    if lang not in ('fr', 'en'):
        raise HTTPException(status_code=400, detail="Langue non supportée. Utilisez 'fr' ou 'en'")

    # The filename may be absent, and a name without a dot has no extension
    # (a file literally named "pdf" must not be treated as a PDF).
    filename = file.filename or ""
    _, dot, suffix = filename.rpartition('.')
    file_extension = suffix.lower() if dot else ""

    # Reject unsupported formats before reading the upload into memory.
    if file_extension != 'pdf' and file_extension not in _IMAGE_EXTENSIONS:
        raise HTTPException(
            status_code=400,
            detail=f"Format non supporté: {file_extension}"
        )

    file_bytes = await file.read()
    if not file_bytes:
        # An empty body is a client error, not a decoder failure.
        raise HTTPException(status_code=400, detail="Fichier vide")

    # Dispatch on the file type.
    if file_extension == 'pdf':
        text = extract_text_from_pdf(file_bytes, lang)
    else:
        text = extract_text_from_image(file_bytes, lang)

    return JSONResponse(content={"text": text})
async def health():
    """Report that the service is up."""
    # NOTE(review): no route decorator is visible in this view; confirm that
    # this coroutine is registered on the application.
    return dict(status="ok")