Spaces:

gthai
/

convert-anything

Sleeping

App Files Files Community

gth commited on Feb 1, 2025

Commit

038cc08

0 Parent(s):

init

Browse files

Files changed (3) hide show

README.md +60 -0
app.py +162 -0
requirements.txt +4 -0

README.md ADDED Viewed

	@@ -0,0 +1,60 @@

+# Document Converter Pro
+Une application web moderne et efficace pour convertir vos documents dans différents formats.
+## Fonctionnalités
+- Interface utilisateur intuitive et ergonomique
+- Support de multiples formats de documents
+- Conversion rapide et efficace
+- Gestion des erreurs robuste
+## Installation
+1. Clonez ce dépôt :
+```bash
+git clone <repository-url>
+cd convert-anything
+```
+2. Installez les dépendances :
+```bash
+pip install -r requirements.txt
+```
+## Utilisation
+1. Lancez l'application :
+```bash
+python app.py
+```
+2. Ouvrez votre navigateur et accédez à l'URL locale affichée dans le terminal
+3. Utilisez l'interface pour :
+   - Télécharger votre document
+   - Sélectionner le format de sortie souhaité
+   - Cliquer sur "Convertir le document"
+   - Télécharger le document converti
+## Formats supportés
+### Documents
+- PDF
+- DOCX
+- DOC
+- ODT
+- RTF
+- TXT
+### Images
+- PNG
+- JPEG
+- TIFF
+- BMP
+## Technologies utilisées
+- Gradio : Interface utilisateur web
+- Docling : Moteur de conversion de documents
+- Python-Magic : Détection des types de fichiers

app.py ADDED Viewed

	@@ -0,0 +1,162 @@

+import gradio as gr
+import tempfile
+import os
+import json
+import yaml
+from pathlib import Path
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import (
+    PdfPipelineOptions,
+    TesseractCliOcrOptions
+)
+from docling.document_converter import (
+    DocumentConverter,
+    PdfFormatOption,
+    WordFormatOption
+)
+from docling.pipeline.standard_pdf_pipeline import StandardPdfPipeline
+from docling.pipeline.simple_pipeline import SimplePipeline
+from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
+# Initialize the document converter with advanced options
+doc_converter = DocumentConverter(
+    allowed_formats=[
+        InputFormat.PDF,
+        InputFormat.IMAGE,
+        InputFormat.DOCX,
+        InputFormat.HTML,
+        InputFormat.PPTX,
+        InputFormat.ASCIIDOC,
+        InputFormat.MD,
+        InputFormat.XLSX,
+    ],
+    format_options={
+        InputFormat.PDF: PdfFormatOption(
+            pipeline_cls=StandardPdfPipeline,
+            backend=PyPdfiumDocumentBackend,
+            pipeline_options=PdfPipelineOptions(
+                do_ocr=True,
+                ocr_options=TesseractCliOcrOptions(lang=["auto"])
+            )
+        ),
+        InputFormat.DOCX: WordFormatOption(
+            pipeline_cls=SimplePipeline
+        ),
+    }
+)
+def convert_document(input_file, output_format):
+    try:
+        # Get the input file path
+        temp_dir = tempfile.mkdtemp()
+        input_path = Path(input_file.name)
+        # Generate output path
+        output_filename = f"converted_document.{output_format.lower()}"
+        output_path = Path(temp_dir) / output_filename
+        # Convert the document
+        result = doc_converter.convert(str(input_path))
+        # Export to desired format
+        if output_format.lower() == "html":
+            with open(output_path, "w", encoding="utf-8") as f:
+                f.write(result.document.export_to_html())
+        elif output_format.lower() == "text":
+            with open(output_path, "w", encoding="utf-8") as f:
+                f.write(result.document.export_to_text())
+        elif output_format.lower() in ["md", "markdown"]:
+            with open(output_path, "w", encoding="utf-8") as f:
+                f.write(result.document.export_to_markdown())
+        elif output_format.lower() == "json":
+            with open(output_path, "w", encoding="utf-8") as f:
+                json.dump(result.document.export_to_dict(), f, indent=2)
+        elif output_format.lower() == "yaml":
+            with open(output_path, "w", encoding="utf-8") as f:
+                yaml.safe_dump(result.document.export_to_dict(), f)
+        elif output_format.lower() == "doctags":
+            with open(output_path, "w", encoding="utf-8") as f:
+                f.write(result.document.export_to_document_tokens())
+        else:
+            raise ValueError(f"Format de sortie non supporté: {output_format}")
+        return str(output_path)
+    except Exception as e:
+        return f"Erreur lors de la conversion: {str(e)}"
+# Define available formats
+SUPPORTED_OUTPUT_FORMATS = [
+    "HTML", "Markdown", "JSON", "Text", "YAML", "Doctags"
+]
+# Create the Gradio interface
+with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as demo:
+    gr.Markdown(
+        """
+        # 📄 Document Converter Pro
+        Convertissez facilement vos documents dans différents formats avec support OCR automatique.
+        """
+    )
+    with gr.Row():
+        with gr.Column(scale=1):
+            input_file = gr.File(
+                label="Télécharger un document",
+                file_types=[
+                    "pdf", "docx", "doc", "pptx", "xlsx",
+                    "md", "html", "htm", "txt", "rtf",
+                    "png", "jpg", "jpeg", "tiff", "bmp",
+                    "adoc", "asciidoc"
+                ],
+            )
+            output_format = gr.Dropdown(
+                choices=SUPPORTED_OUTPUT_FORMATS,
+                value="HTML",
+                label="Convertir en",
+                info="Sélectionnez le format de sortie"
+            )
+            convert_btn = gr.Button("Convertir le document", variant="primary")
+        with gr.Column(scale=1):
+            output = gr.File(label="Document converti")
+    convert_btn.click(
+        fn=convert_document,
+        inputs=[input_file, output_format],
+        outputs=output
+    )
+    gr.Markdown(
+        """
+        ### 📝 Instructions
+        1. Téléchargez votre document
+        2. Choisissez le format de sortie souhaité
+        3. Cliquez sur "Convertir le document"
+        ### ℹ️ Formats supportés
+        #### Formats d'entrée
+        - Documents Office : PDF, DOCX, XLSX, PPTX
+        - Documents Web : HTML, XHTML
+        - Documents texte : Markdown, AsciiDoc, TXT, RTF
+        - Images : PNG, JPEG, TIFF, BMP
+        #### Formats de sortie
+        - HTML (avec support des images intégrées)
+        - Markdown
+        - Text (texte brut sans formatage)
+        - JSON (sérialisation sans perte)
+        - YAML
+        - Doctags
+        ### 🔍 Fonctionnalités
+        - Détection automatique de la langue pour l'OCR
+        - Support complet des tableaux
+        - Extraction des images
+        - Conversion multi-format
+        """
+    )
+if __name__ == "__main__":
+    demo.launch(share=False)

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio>=4.19.2
+docling>=2.0.0
+pyyaml>=6.0.1
+tesseract-ocr>=5.3.3