Spaces:
Sleeping
Sleeping
Aditya DN committed on
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,13 +1,26 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import pypandoc
|
| 3 |
import os
|
| 4 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
# Probe for a usable pandoc binary; if the probe raises OSError, let pypandoc
# download one instead.
try:
    pypandoc.get_pandoc_version()
except OSError:
    pypandoc.download_pandoc()
|
| 8 |
-
|
| 9 |
# List of supported formats
|
| 10 |
-
input_supported_formats = [data.upper() for data in sorted(list(pypandoc.get_pandoc_formats()[0]) or [
|
| 11 |
'BIBLATEX', 'BIBTEX', 'BITS', 'COMMONMARK', 'COMMONMARK_X', 'CREOLE', 'CSLJSON', 'CSV',
|
| 12 |
'DJOT', 'DOCBOOK', 'DOCX', 'DOKUWIKI', 'ENDNOTEXML', 'EPUB', 'FB2', 'GFM', 'HADDOCK',
|
| 13 |
'HTML', 'IPYNB', 'JATS', 'JIRA', 'JSON', 'LATEX', 'MAN', 'MARKDOWN', 'MARKDOWN_GITHUB',
|
|
@@ -25,26 +38,26 @@ output_supported_formats = [data.upper() for data in sorted([
|
|
| 25 |
"MARKDOWN_MMD", "MARKDOWN_PHPEXTRA", "MARKDOWN_STRICT", "MARKUA", "MEDIAWIKI", "MS",
|
| 26 |
"MUSE", "NATIVE", "ODT", "OPENDOCUMENT", "OPML", "ORG", "PDF", "PLAIN", "PPTX", "REVEALJS",
|
| 27 |
"RST", "RTF", "S5", "SLIDEOUS", "SLIDY", "TEI", "TEXINFO", "TEXTILE", "TYPST", "XWIKI", "ZIMWIKI"
|
| 28 |
-
])
|
| 29 |
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
|
| 37 |
def convert_document(doc_file, target_format):
|
| 38 |
try:
|
| 39 |
target_format = target_format.lower()
|
| 40 |
|
| 41 |
-
#
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
|
| 49 |
# Get the base name of the file (without extension)
|
| 50 |
base_name = os.path.splitext(os.path.basename(doc_file))[0]
|
|
@@ -56,7 +69,10 @@ def convert_document(doc_file, target_format):
|
|
| 56 |
pypandoc.convert_file(
|
| 57 |
doc_file,
|
| 58 |
target_format.lower(), # Convert the format to lowercase
|
| 59 |
-
outputfile=output_file
|
|
|
|
|
|
|
|
|
|
| 60 |
)
|
| 61 |
|
| 62 |
return output_file
|
|
@@ -74,4 +90,9 @@ interface = gr.Interface(
|
|
| 74 |
title="Document Format Converter",
|
| 75 |
description="Upload a document and select any target format for conversion.",
|
| 76 |
css="footer {visibility: hidden}"
|
| 77 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import pypandoc
|
| 3 |
import os
|
| 4 |
+
from pdf2docx import Converter
|
| 5 |
+
|
| 6 |
+
# Install TeX Live at import time (presumably for pandoc's LaTeX-based PDF
# output -- confirm). `-y` answers apt-get's confirmation prompt; without it
# the install aborts when no interactive stdin is attached.
os.system('sudo apt-get install -y texlive')
|
| 7 |
+
|
| 8 |
+
def ensure_pandoc_installed():
    """Make sure pypandoc can reach a pandoc binary, downloading one if not.

    Asks pypandoc for the pandoc version as a probe. If that raises
    ``OSError``, pandoc is treated as missing and is downloaded through
    ``pypandoc.download_pandoc()``. Progress is reported on stdout.
    """
    try:
        # Check whether pandoc is already available.
        pypandoc.get_pandoc_version()
        print("Pandoc is already installed and accessible.")
        return
    except OSError:
        # The version probe failed, so download pandoc.
        print("Pandoc not found, downloading...")
        pypandoc.download_pandoc()
        print("Pandoc downloaded successfully.")
|
| 18 |
+
|
| 19 |
+
# Make sure Pandoc is installed
|
| 20 |
+
ensure_pandoc_installed()
|
| 21 |
|
|
|
|
|
|
|
|
|
|
| 22 |
# List of supported formats
|
| 23 |
+
input_supported_formats = [data.upper() for data in sorted(list(pypandoc.get_pandoc_formats()[0]).append('PDF') or [
|
| 24 |
'BIBLATEX', 'BIBTEX', 'BITS', 'COMMONMARK', 'COMMONMARK_X', 'CREOLE', 'CSLJSON', 'CSV',
|
| 25 |
'DJOT', 'DOCBOOK', 'DOCX', 'DOKUWIKI', 'ENDNOTEXML', 'EPUB', 'FB2', 'GFM', 'HADDOCK',
|
| 26 |
'HTML', 'IPYNB', 'JATS', 'JIRA', 'JSON', 'LATEX', 'MAN', 'MARKDOWN', 'MARKDOWN_GITHUB',
|
|
|
|
| 38 |
"MARKDOWN_MMD", "MARKDOWN_PHPEXTRA", "MARKDOWN_STRICT", "MARKUA", "MEDIAWIKI", "MS",
|
| 39 |
"MUSE", "NATIVE", "ODT", "OPENDOCUMENT", "OPML", "ORG", "PDF", "PLAIN", "PPTX", "REVEALJS",
|
| 40 |
"RST", "RTF", "S5", "SLIDEOUS", "SLIDY", "TEI", "TEXINFO", "TEXTILE", "TYPST", "XWIKI", "ZIMWIKI"
|
| 41 |
+
])]
|
| 42 |
|
| 43 |
+
def convert_pdf_to_docx(pdf_file):
    """Convert a PDF to DOCX using pdf2docx and return the DOCX path.

    Args:
        pdf_file: Either a filesystem path (``str`` or ``os.PathLike``) or a
            file-like object that exposes its path through ``.name``.

    Returns:
        The path of the generated DOCX file: the input path with its
        extension replaced by ``.docx``.
    """
    # convert_document passes a plain path string, which has no ``.name``;
    # path-like inputs are checked first because ``pathlib.Path.name`` is only
    # the base name, not the full path.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = os.fspath(pdf_file)
    else:
        pdf_path = pdf_file.name

    output_docx = f"{os.path.splitext(pdf_path)[0]}.docx"
    cv = Converter(pdf_path)
    try:
        cv.convert(output_docx, start=0, end=None)
    finally:
        # Always release the underlying PDF handle, even if conversion fails.
        cv.close()
    return output_docx
|
| 49 |
|
| 50 |
def convert_document(doc_file, target_format):
|
| 51 |
try:
|
| 52 |
target_format = target_format.lower()
|
| 53 |
|
| 54 |
+
# If the file is a PDF, convert it to DOCX first
|
| 55 |
+
if isinstance(doc_file, str) and doc_file.lower().endswith('.pdf'):
|
| 56 |
+
print("Converting PDF to DOCX...")
|
| 57 |
+
doc_file = convert_pdf_to_docx(doc_file) # Pass the file path directly
|
| 58 |
+
print("PDF converted to DOCX.")
|
| 59 |
+
elif hasattr(doc_file, 'name'): # If it's a file-like object
|
| 60 |
+
doc_file = doc_file.name # Get the file path from the file-like object
|
| 61 |
|
| 62 |
# Get the base name of the file (without extension)
|
| 63 |
base_name = os.path.splitext(os.path.basename(doc_file))[0]
|
|
|
|
| 69 |
pypandoc.convert_file(
|
| 70 |
doc_file,
|
| 71 |
target_format.lower(), # Convert the format to lowercase
|
| 72 |
+
outputfile=output_file,
|
| 73 |
+
extra_args=['-V geometry:margin=1.5cm',
|
| 74 |
+
# '--pdf-engine=/usr/bin/xelatex',
|
| 75 |
+
'--metadata', 'title="Converted Document by Flowly AI"']
|
| 76 |
)
|
| 77 |
|
| 78 |
return output_file
|
|
|
|
| 90 |
title="Document Format Converter",
|
| 91 |
description="Upload a document and select any target format for conversion.",
|
| 92 |
css="footer {visibility: hidden}"
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
# Run the application
|
| 96 |
+
if __name__ == "__main__":
    # Launch the interface only when this file is executed as a script.
    interface.launch()
|
| 98 |
+
|