Spaces:

Adibrino
/

Document-Format-Converter

Sleeping

App Files Files Community

Aditya DN commited on Jan 28, 2025

Commit

3d40c3e

verified ·

1 Parent(s): 929af28

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -21

app.py CHANGED Viewed

@@ -1,13 +1,26 @@
 import gradio as gr
 import pypandoc
 import os
-# from pdf2docx import Converter
-try: pypandoc.get_pandoc_version()
-except OSError: pypandoc.download_pandoc()
 # Daftar format yang didukung
-input_supported_formats = [data.upper() for data in sorted(list(pypandoc.get_pandoc_formats()[0]) or [
     'BIBLATEX', 'BIBTEX', 'BITS', 'COMMONMARK', 'COMMONMARK_X', 'CREOLE', 'CSLJSON', 'CSV',
     'DJOT', 'DOCBOOK', 'DOCX', 'DOKUWIKI', 'ENDNOTEXML', 'EPUB', 'FB2', 'GFM', 'HADDOCK',
     'HTML', 'IPYNB', 'JATS', 'JIRA', 'JSON', 'LATEX', 'MAN', 'MARKDOWN', 'MARKDOWN_GITHUB',
@@ -25,26 +38,26 @@ output_supported_formats = [data.upper() for data in sorted([
     "MARKDOWN_MMD", "MARKDOWN_PHPEXTRA", "MARKDOWN_STRICT", "MARKUA", "MEDIAWIKI", "MS",
     "MUSE", "NATIVE", "ODT", "OPENDOCUMENT", "OPML", "ORG", "PDF", "PLAIN", "PPTX", "REVEALJS",
     "RST", "RTF", "S5", "SLIDEOUS", "SLIDY", "TEI", "TEXINFO", "TEXTILE", "TYPST", "XWIKI", "ZIMWIKI"
-]) if data not in ['PDF']]
-# def convert_pdf_to_docx(pdf_file):
-#     """Konversi PDF ke DOCX menggunakan pdf2docx"""
-#     output_docx = f"{os.path.splitext(pdf_file.name)[0]}.docx"
-#     cv = Converter(pdf_file.name)
-#     cv.convert(output_docx, start=0, end=None)
-#     return output_docx
 def convert_document(doc_file, target_format):
     try:
         target_format = target_format.lower()
-        # # If the file is a PDF, convert it to DOCX first
-        # if isinstance(doc_file, str) and doc_file.lower().endswith('.pdf'):
-        #     print("Converting PDF to DOCX...")
-        #     doc_file = convert_pdf_to_docx(doc_file)  # Pass the file path directly
-        #     print("PDF converted to DOCX.")
-        # elif hasattr(doc_file, 'name'):  # If it's a file-like object
-        doc_file = doc_file.name  # Get the file path from the file-like object
         # Get the base name of the file (without extension)
         base_name = os.path.splitext(os.path.basename(doc_file))[0]
@@ -56,7 +69,10 @@ def convert_document(doc_file, target_format):
         pypandoc.convert_file(
             doc_file,
             target_format.lower(),  # Convert the format to lowercase
-            outputfile=output_file
         )
         return output_file
@@ -74,4 +90,9 @@ interface = gr.Interface(
     title="Document Format Converter",
     description="Upload a document and select any target format for conversion.",
     css="footer {visibility: hidden}"
-)

 import gradio as gr
 import pypandoc
 import os
+from pdf2docx import Converter
+os.system('sudo apt-get install texlive')
+def ensure_pandoc_installed():
+    try:
+        # Periksa apakah pandoc sudah ada
+        pypandoc.get_pandoc_version()
+        print("Pandoc is already installed and accessible.")
+    except OSError:
+        # Unduh pandoc jika belum ada
+        print("Pandoc not found, downloading...")
+        pypandoc.download_pandoc()
+        print("Pandoc downloaded successfully.")
+# Pastikan Pandoc terpasang
+ensure_pandoc_installed()
 # Daftar format yang didukung
+input_supported_formats = [data.upper() for data in sorted(list(pypandoc.get_pandoc_formats()[0]).append('PDF') or [
     'BIBLATEX', 'BIBTEX', 'BITS', 'COMMONMARK', 'COMMONMARK_X', 'CREOLE', 'CSLJSON', 'CSV',
     'DJOT', 'DOCBOOK', 'DOCX', 'DOKUWIKI', 'ENDNOTEXML', 'EPUB', 'FB2', 'GFM', 'HADDOCK',
     'HTML', 'IPYNB', 'JATS', 'JIRA', 'JSON', 'LATEX', 'MAN', 'MARKDOWN', 'MARKDOWN_GITHUB',
     "MARKDOWN_MMD", "MARKDOWN_PHPEXTRA", "MARKDOWN_STRICT", "MARKUA", "MEDIAWIKI", "MS",
     "MUSE", "NATIVE", "ODT", "OPENDOCUMENT", "OPML", "ORG", "PDF", "PLAIN", "PPTX", "REVEALJS",
     "RST", "RTF", "S5", "SLIDEOUS", "SLIDY", "TEI", "TEXINFO", "TEXTILE", "TYPST", "XWIKI", "ZIMWIKI"
+])]
+def convert_pdf_to_docx(pdf_file):
+    """Konversi PDF ke DOCX menggunakan pdf2docx"""
+    output_docx = f"{os.path.splitext(pdf_file.name)[0]}.docx"
+    cv = Converter(pdf_file.name)
+    cv.convert(output_docx, start=0, end=None)
+    return output_docx
 def convert_document(doc_file, target_format):
     try:
         target_format = target_format.lower()
+        # If the file is a PDF, convert it to DOCX first
+        if isinstance(doc_file, str) and doc_file.lower().endswith('.pdf'):
+            print("Converting PDF to DOCX...")
+            doc_file = convert_pdf_to_docx(doc_file)  # Pass the file path directly
+            print("PDF converted to DOCX.")
+        elif hasattr(doc_file, 'name'):  # If it's a file-like object
+            doc_file = doc_file.name  # Get the file path from the file-like object
         # Get the base name of the file (without extension)
         base_name = os.path.splitext(os.path.basename(doc_file))[0]
         pypandoc.convert_file(
             doc_file,
             target_format.lower(),  # Convert the format to lowercase
+            outputfile=output_file,
+            extra_args=['-V geometry:margin=1.5cm',
+                        # '--pdf-engine=/usr/bin/xelatex',
+                        '--metadata', 'title="Converted Document by Flowly AI"']
         )
         return output_file
     title="Document Format Converter",
     description="Upload a document and select any target format for conversion.",
     css="footer {visibility: hidden}"
+)
+# Jalankan aplikasi
+if __name__ == "__main__":
+    interface.launch()