"""Gradio front-end that converts uploaded documents with Docling.

The module builds one shared ``DocumentConverter`` (PDF input goes through
the standard PDF pipeline with Tesseract CLI OCR enabled) and exposes a
small web UI where the user uploads a document, picks an output format and
downloads the converted file.
"""

import json
import os
import shutil
import tempfile
from pathlib import Path

import gradio as gr
import yaml
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    TesseractCliOcrOptions,
)
from docling.document_converter import (
    DocumentConverter,
    PdfFormatOption,
    WordFormatOption,
)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline

# Initialize the document converter with advanced options.
# It is created once at import time and reused by every conversion request.
doc_converter = DocumentConverter(
    allowed_formats=[
        InputFormat.PDF,
        InputFormat.IMAGE,
        InputFormat.DOCX,
        InputFormat.HTML,
        InputFormat.PPTX,
        InputFormat.ASCIIDOC,
        InputFormat.MD,
        InputFormat.XLSX,
    ],
    format_options={
        InputFormat.PDF: PdfFormatOption(
            pipeline_cls=StandardPdfPipeline,
            backend=PyPdfiumDocumentBackend,
            pipeline_options=PdfPipelineOptions(
                do_ocr=True,
                ocr_options=TesseractCliOcrOptions(lang=["auto"]),
            ),
        ),
        InputFormat.DOCX: WordFormatOption(
            pipeline_cls=SimplePipeline,
        ),
    },
)


def _write_markdown(document, handle):
    """Write the Markdown export of *document* to the open text *handle*."""
    handle.write(document.export_to_markdown())


# Maps a lower-cased output format name to a writer ``(document, handle)``.
# "md" and "markdown" are aliases for the same exporter.
_EXPORTERS = {
    "html": lambda document, handle: handle.write(document.export_to_html()),
    "text": lambda document, handle: handle.write(document.export_to_text()),
    "md": _write_markdown,
    "markdown": _write_markdown,
    "json": lambda document, handle: json.dump(
        document.export_to_dict(), handle, indent=2
    ),
    "yaml": lambda document, handle: yaml.safe_dump(
        document.export_to_dict(), handle
    ),
    "doctags": lambda document, handle: handle.write(
        document.export_to_document_tokens()
    ),
}


def convert_document(input_file, output_format):
    """Convert an uploaded document and return the path of the result.

    Args:
        input_file: The value delivered by the ``gr.File`` input. Either a
            filesystem path (``str`` / ``os.PathLike``) or an object whose
            ``name`` attribute holds the path of the uploaded file.
        output_format: Name of the target format, matched case-insensitively
            against the supported exporters (e.g. ``"HTML"``, ``"Markdown"``).

    Returns:
        The path, as a string, of the converted file. It is written to a
        freshly created temporary directory as
        ``converted_document.<output_format lower-cased>``.

    Raises:
        gr.Error: If no file was provided, the output format is not
            supported, or the conversion/export fails. Raising (instead of
            returning the message) keeps the error text from being handed to
            the ``gr.File`` output as if it were a file path.
    """
    if input_file is None:
        raise gr.Error("Erreur lors de la conversion: aucun document fourni")

    fmt = str(output_format or "").lower()
    exporter = _EXPORTERS.get(fmt)
    if exporter is None:
        raise gr.Error(
            "Erreur lors de la conversion: "
            f"Format de sortie non supporté: {output_format}"
        )

    # Accept a plain path as well as a tempfile-like wrapper. A Path object
    # must not go through ``.name`` because that is only its basename.
    if isinstance(input_file, (str, os.PathLike)):
        input_path = Path(input_file)
    else:
        input_path = Path(input_file.name)

    # Create the output directory only after validation so rejected requests
    # do not leave empty temporary directories behind.
    temp_dir = tempfile.mkdtemp()
    output_path = Path(temp_dir) / f"converted_document.{fmt}"

    try:
        result = doc_converter.convert(str(input_path))
        with open(output_path, "w", encoding="utf-8") as handle:
            exporter(result.document, handle)
    except Exception as exc:
        # Nothing useful was produced; drop the directory before reporting.
        shutil.rmtree(temp_dir, ignore_errors=True)
        raise gr.Error(f"Erreur lors de la conversion: {exc}") from exc

    return str(output_path)


# Define available formats
SUPPORTED_OUTPUT_FORMATS = [
    "HTML",
    "Markdown",
    "JSON",
    "Text",
    "YAML",
    "Doctags",
]

# Markdown shown above the form. Kept at module level so the literal is not
# indented along with the surrounding UI code.
_HEADER_MARKDOWN = """
# 📄 Document Converter Pro
Convertissez facilement vos documents dans différents formats avec support OCR automatique.
"""

# Markdown shown below the form (usage instructions and format overview).
_HELP_MARKDOWN = """
### 📝 Instructions
1. Téléchargez votre document
2. Choisissez le format de sortie souhaité
3. Cliquez sur "Convertir le document"

### ℹ️ Formats supportés
#### Formats d'entrée
- Documents Office : PDF, DOCX, XLSX, PPTX
- Documents Web : HTML, XHTML
- Documents texte : Markdown, AsciiDoc, TXT, RTF
- Images : PNG, JPEG, TIFF, BMP

#### Formats de sortie
- HTML (avec support des images intégrées)
- Markdown
- Text (texte brut sans formatage)
- JSON (sérialisation sans perte)
- YAML
- Doctags

### 🔍 Fonctionnalités
- Détection automatique de la langue pour l'OCR
- Support complet des tableaux
- Extraction des images
- Conversion multi-format
"""

# Create the Gradio interface
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.Markdown(_HEADER_MARKDOWN)

    with gr.Row():
        with gr.Column(scale=1):
            input_file = gr.File(
                label="Télécharger un document",
                # Extensions carry a leading dot, the form Gradio documents
                # for file extensions (bare names denote type categories).
                file_types=[
                    ".pdf",
                    ".docx",
                    ".doc",
                    ".pptx",
                    ".xlsx",
                    ".md",
                    ".html",
                    ".htm",
                    ".txt",
                    ".rtf",
                    ".png",
                    ".jpg",
                    ".jpeg",
                    ".tiff",
                    ".bmp",
                    ".adoc",
                    ".asciidoc",
                ],
            )
            output_format = gr.Dropdown(
                choices=SUPPORTED_OUTPUT_FORMATS,
                value="HTML",
                label="Convertir en",
                info="Sélectionnez le format de sortie",
            )
            convert_btn = gr.Button("Convertir le document", variant="primary")

        with gr.Column(scale=1):
            output = gr.File(label="Document converti")

    convert_btn.click(
        fn=convert_document,
        inputs=[input_file, output_format],
        outputs=output,
    )

    gr.Markdown(_HELP_MARKDOWN)

if __name__ == "__main__":
    demo.launch(share=False)