from __future__ import annotations from typing import List, Tuple from docling.document_converter import DocumentConverter import gradio as gr import shutil, tempfile from pathlib import Path def convert_directory( directory: Path | str, theme: str, objectif: str, output_markdown: Path | str = "dossier_documentaire.md", ) -> None: """Parcourt *directory* et consolide les documents en un document Markdown consolidé. """ EXTENSIONS = { "pdf", "docx", "xlsx", "pptx", "md", "adoc", "asciidoc", "html", "xhtml", "csv", "png", "jpeg", "jpg", "tiff", "bmp" } directory = Path(directory) if not directory.is_dir(): raise NotADirectoryError(f"{directory} n'est pas un répertoire valide") doc_paths = sorted( p for p in directory.rglob("*") if p.is_file() and p.suffix.lstrip(".").lower() in EXTENSIONS ) if not doc_paths: raise FileNotFoundError("Aucun fichier PDF trouvé dans le répertoire") output_markdown = Path(output_markdown) converter = DocumentConverter() # Accumulate markdown blocks, index entries and temp‑PDFs to merge. markdown_blocks: List[str] = ["# Dossier documentaire\n", f"**Thème : {theme}**\n", f"**Objectif du dossier : {objectif}**\n" ] index_entries: List[str] = ["\n## Index des documents \n"] current_global_page = 1 # First pass – convert each PDF, capture markdown pages and produce a temp PDF. all_docs_pages: List[Tuple[str, List[str]]] = [] for doc_path in doc_paths: result = converter.convert(str(doc_path)) pages_md: List[str] = [ result.document.export_to_markdown(page_no=i) for i in range(len(result.document.pages)+1) ] all_docs_pages.append((doc_path.name, pages_md)) # Index index_entries.append(f"p. {current_global_page} \t : \t {doc_path.name} \n") current_global_page += len(pages_md) # Add index to markdown. markdown_blocks.extend(index_entries) # Second pass – build content section. for file_name, pages_md in all_docs_pages: markdown_blocks.append("\n\n---\n\n") # Saut de page dans le markdown markdown_blocks.append(f"*Début du document : {file_name}*\n") for page_number, page_md in enumerate(pages_md, start=0): markdown_blocks.append(f"*Début de la page {page_number} du doc : {file_name}*\n") markdown_blocks.append(page_md.strip()) markdown_blocks.append(f"\n*Fin de la page {page_number} du doc : {file_name}**\n") markdown_blocks.append("\n\n---\n\n") # Saut de page dans le markdown markdown_blocks.append(f"*Fin du document : {file_name}*\n") # Écriture du markdown consolidé. output_markdown.write_text("\n".join(markdown_blocks), encoding="utf-8") print(f"✅ Markdown écrit → {output_markdown.resolve()}") # --- petit wrappeur multifile --- def run_convert(files: list[str], theme: str, objectif: str, output_markdown: str = "dossier_documentaire.md") -> str: """ Copie les fichiers uploadés dans un dossier temporaire, appelle convert_directory, puis renvoie le chemin du markdown. """ with tempfile.TemporaryDirectory() as tmpdir: tmp_path = Path(tmpdir) for f in files: # f est un chemin vers le fichier uploadé shutil.copy(f, tmp_path / Path(f).name) convert_directory(tmp_path, theme, objectif, output_markdown) return str(Path(output_markdown).resolve()) # Gradio générera le téléchargement # --- IHM Gradio multifile --- with gr.Blocks(title="Dossier documentaire") as demo: gr.Markdown("## Générer un dossier documentaire à partir de fichiers") files_in = gr.Files(label="Fichiers source (plusieurs); extensions PDF, DOCX, PNG…", file_count="multiple") theme_in = gr.Textbox(label="Thème") obj_in = gr.Textbox(label="Objectif") out_name_in = gr.Textbox(label="Nom du fichier markdown de sortie", value="dossier_documentaire.md") launch_btn = gr.Button("Convertir 📄➡️📝") md_out = gr.File(label="Markdown généré") launch_btn.click(run_convert, inputs=[files_in, theme_in, obj_in, out_name_in], outputs=md_out) if __name__ == "__main__": demo.launch()