Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import tempfile | |
| import os | |
| import json | |
| import yaml | |
| from pathlib import Path | |
| from docling.datamodel.base_models import InputFormat | |
| from docling.datamodel.pipeline_options import ( | |
| PdfPipelineOptions, | |
| TesseractCliOcrOptions | |
| ) | |
| from docling.document_converter import ( | |
| DocumentConverter, | |
| PdfFormatOption, | |
| WordFormatOption | |
| ) | |
| from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline | |
| from docling.pipeline.simple_pipeline import SimplePipeline | |
| from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend | |
# Shared converter, built once at import time and reused by every request.

# PDF input: standard PDF pipeline on the pypdfium2 backend, with OCR switched
# on and the Tesseract CLI engine configured with lang=["auto"].
_PDF_FORMAT_OPTION = PdfFormatOption(
    pipeline_cls=StandardPdfPipeline,
    backend=PyPdfiumDocumentBackend,
    pipeline_options=PdfPipelineOptions(
        do_ocr=True,
        ocr_options=TesseractCliOcrOptions(lang=["auto"]),
    ),
)

# DOCX input: handled by the simple pipeline.
_DOCX_FORMAT_OPTION = WordFormatOption(pipeline_cls=SimplePipeline)

# Input formats the converter is allowed to process.
_ALLOWED_INPUT_FORMATS = [
    InputFormat.PDF,
    InputFormat.IMAGE,
    InputFormat.DOCX,
    InputFormat.HTML,
    InputFormat.PPTX,
    InputFormat.ASCIIDOC,
    InputFormat.MD,
    InputFormat.XLSX,
]

doc_converter = DocumentConverter(
    allowed_formats=_ALLOWED_INPUT_FORMATS,
    format_options={
        InputFormat.PDF: _PDF_FORMAT_OPTION,
        InputFormat.DOCX: _DOCX_FORMAT_OPTION,
    },
)
def convert_document(input_file, output_format):
    """Convert an uploaded document and return the path of the exported file.

    Args:
        input_file: Upload received from the ``gr.File`` input. May be a plain
            path string or an object exposing the path as ``.name``.
        output_format: Label of the target format, matched case-insensitively:
            "HTML", "Text", "MD"/"Markdown", "JSON", "YAML" or "Doctags".

    Returns:
        The path (as ``str``) of the exported file, written as
        ``converted_document.<format>`` inside a fresh temporary directory.

    Raises:
        gr.Error: If no file or format was supplied, the format is not
            supported, or the conversion/export fails. Raising (instead of
            returning the message) matters because the return value is wired
            to a ``gr.File`` output, which would treat a message string as a
            file path.
    """
    import shutil  # local import: only needed for cleanup on failure

    if input_file is None:
        raise gr.Error("Erreur lors de la conversion: aucun document fourni")

    # Each renderer turns the converted document into the text to be saved.
    renderers = {
        "html": lambda doc: doc.export_to_html(),
        "text": lambda doc: doc.export_to_text(),
        "md": lambda doc: doc.export_to_markdown(),
        "markdown": lambda doc: doc.export_to_markdown(),
        "json": lambda doc: json.dumps(doc.export_to_dict(), indent=2),
        "yaml": lambda doc: yaml.safe_dump(doc.export_to_dict()),
        "doctags": lambda doc: doc.export_to_document_tokens(),
    }

    format_key = output_format.lower() if output_format else ""
    render = renderers.get(format_key)
    if render is None:
        # Reject unknown formats before doing any conversion work.
        raise gr.Error(
            "Erreur lors de la conversion: "
            f"Format de sortie non supporté: {output_format}"
        )

    # Accept both a bare path and a file-like wrapper carrying `.name`.
    input_path = Path(getattr(input_file, "name", input_file))

    temp_dir = None
    try:
        result = doc_converter.convert(str(input_path))
        content = render(result.document)

        # Create the output directory only once there is something to write,
        # so failed conversions do not leave empty directories behind.
        temp_dir = tempfile.mkdtemp()
        output_path = Path(temp_dir) / f"converted_document.{format_key}"
        output_path.write_text(content, encoding="utf-8")
    except Exception as exc:
        if temp_dir is not None:
            shutil.rmtree(temp_dir, ignore_errors=True)
        # Top-level UI boundary: surface the failure to the user.
        raise gr.Error(f"Erreur lors de la conversion: {exc}") from exc

    return str(output_path)
# Output format labels offered in the UI dropdown, in display order.
SUPPORTED_OUTPUT_FORMATS = [
    "HTML",
    "Markdown",
    "JSON",
    "Text",
    "YAML",
    "Doctags",
]
# Gradio interface: an upload + format picker on the left, the converted file
# on the right, and a static help section underneath.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
    gr.Markdown(
        """
        # 📄 Document Converter Pro
        Convertissez facilement vos documents dans différents formats avec support OCR automatique.
        """
    )
    with gr.Row():
        with gr.Column(scale=1):
            input_file = gr.File(
                label="Télécharger un document",
                # Extensions need a leading dot; dot-less entries are read by
                # Gradio as type categories ("image", "audio", ...) instead.
                # NOTE(review): .doc, .txt and .rtf are accepted here but have
                # no matching InputFormat in doc_converter's allowed_formats —
                # confirm these uploads can actually be converted.
                file_types=[
                    ".pdf", ".docx", ".doc", ".pptx", ".xlsx",
                    ".md", ".html", ".htm", ".txt", ".rtf",
                    ".png", ".jpg", ".jpeg", ".tiff", ".bmp",
                    ".adoc", ".asciidoc",
                ],
            )
            output_format = gr.Dropdown(
                choices=SUPPORTED_OUTPUT_FORMATS,
                value="HTML",
                label="Convertir en",
                info="Sélectionnez le format de sortie",
            )
            convert_btn = gr.Button("Convertir le document", variant="primary")
        with gr.Column(scale=1):
            output = gr.File(label="Document converti")

    # Run the conversion on click and hand the resulting file to the output.
    convert_btn.click(
        fn=convert_document,
        inputs=[input_file, output_format],
        outputs=output,
    )
    gr.Markdown(
        """
        ### 📝 Instructions
        1. Téléchargez votre document
        2. Choisissez le format de sortie souhaité
        3. Cliquez sur "Convertir le document"
        ### ℹ️ Formats supportés
        #### Formats d'entrée
        - Documents Office : PDF, DOCX, XLSX, PPTX
        - Documents Web : HTML, XHTML
        - Documents texte : Markdown, AsciiDoc, TXT, RTF
        - Images : PNG, JPEG, TIFF, BMP
        #### Formats de sortie
        - HTML (avec support des images intégrées)
        - Markdown
        - Text (texte brut sans formatage)
        - JSON (sérialisation sans perte)
        - YAML
        - Doctags
        ### 🔍 Fonctionnalités
        - Détection automatique de la langue pour l'OCR
        - Support complet des tableaux
        - Extraction des images
        - Conversion multi-format
        """
    )

# Start the server only when run as a script, without a public share link.
if __name__ == "__main__":
    demo.launch(share=False)