# docgen/app.py — Document Format Converter
# Author: Juan Palomino
# Deploy commit: 225be25
import base64
import hashlib
import html
import json
import os
import secrets
import subprocess
import sys
import tempfile
import threading

import gradio as gr
import pypandoc
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_COLOR_INDEX
from docx.oxml.ns import qn
from docx.shared import Inches, Pt, RGBColor
from docx.table import _Cell
from flask import Flask, jsonify, request, send_file
from pdf2docx import Converter
# System dependency: a LaTeX toolchain is needed for pandoc's PDF output.
# Fix: os.system ignored the exit status and `apt-get install` without -y
# aborts in a non-interactive container; run non-interactively via subprocess.
subprocess.run(['sudo', 'apt-get', 'install', '-y', 'texlive'], check=False)


def ensure_pandoc_installed():
    """Make sure a pandoc binary is available, downloading one if necessary."""
    try:
        # Check whether pandoc is already present and on PATH.
        pypandoc.get_pandoc_version()
        print("Pandoc is already installed and accessible.")
    except OSError:
        # Download pandoc if it is not found.
        print("Pandoc not found, downloading...")
        pypandoc.download_pandoc()
        print("Pandoc downloaded successfully.")


# Make sure pandoc is installed before any conversion is attempted.
ensure_pandoc_installed()
# Daftar format yang didukung
input_supported_formats = [data.upper() for data in sorted(list(pypandoc.get_pandoc_formats()[0]).append('PDF') or [
'BIBLATEX', 'BIBTEX', 'BITS', 'COMMONMARK', 'COMMONMARK_X', 'CREOLE', 'CSLJSON', 'CSV',
'DJOT', 'DOCBOOK', 'DOCX', 'DOKUWIKI', 'ENDNOTEXML', 'EPUB', 'FB2', 'GFM', 'HADDOCK',
'HTML', 'IPYNB', 'JATS', 'JIRA', 'JSON', 'LATEX', 'MAN', 'MARKDOWN', 'MARKDOWN_GITHUB',
'MARKDOWN_MMD', 'MARKDOWN_PHPEXTRA', 'MARKDOWN_STRICT', 'MDOC', 'MEDIAWIKI', 'MUSE',
'NATIVE', 'ODT', 'OPML', 'ORG', 'PDF', 'POD', 'RIS', 'RST', 'RTF', 'T2T', 'TEXTILE',
'TIKIWIKI', 'TSV', 'TWIKI', 'TYPST', 'VIMWIKI'
]) if data not in ['PDF']]
# Formats offered in the "Download As..." dropdown. PDF is excluded because
# writing PDF requires a full LaTeX toolchain and is not always available.
_OUTPUT_FORMAT_NAMES = (
    "ANSI", "ASCIIDOC", "ASCIIDOC_LEGACY", "ASCIIDOCTOR", "BEAMER", "BIBLATEX", "BIBTEX", "CHUNKEDHTML",
    "COMMONMARK", "COMMONMARK_X", "CONTEXT", "CSLJSON", "DJOT", "DOCBOOK", "DOCBOOK4", "DOCBOOK5",
    "DOCX", "DOKUWIKI", "DZSLIDES", "EPUB", "EPUB2", "EPUB3", "FB2", "GFM", "HADDOCK", "HTML",
    "HTML4", "HTML5", "ICML", "IPYNB", "JATS", "JATS_ARCHIVING", "JATS_ARTICLEAUTHORING",
    "JATS_PUBLISHING", "JIRA", "JSON", "LATEX", "MAN", "MARKDOWN", "MARKDOWN_GITHUB",
    "MARKDOWN_MMD", "MARKDOWN_PHPEXTRA", "MARKDOWN_STRICT", "MARKUA", "MEDIAWIKI", "MS",
    "MUSE", "NATIVE", "ODT", "OPENDOCUMENT", "OPML", "ORG", "PDF", "PLAIN", "PPTX", "REVEALJS",
    "RST", "RTF", "S5", "SLIDEOUS", "SLIDY", "TEI", "TEXINFO", "TEXTILE", "TYPST", "XWIKI", "ZIMWIKI"
)
output_supported_formats = sorted(
    name.upper() for name in _OUTPUT_FORMAT_NAMES if name != "PDF"
)
def convert_pdf_to_docx(pdf_file):
    """Convert a PDF to a DOCX file written next to the source file.

    Args:
        pdf_file: Path to the source PDF.

    Returns:
        Path of the generated ``.docx`` file.
    """
    output_docx = f"{os.path.splitext(pdf_file)[0]}.docx"
    cv = Converter(pdf_file)
    try:
        # start=0 / end=None converts every page.
        cv.convert(output_docx, start=0, end=None)
    finally:
        # Fix: the converter was never closed, leaking the open PDF handle.
        cv.close()
    return output_docx
def get_preview(file_path):
    """Build a small HTML preview fragment for *file_path* by extension.

    Never raises: errors and unsupported types are reported inside the
    returned HTML fragment.
    """
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext in ['.txt', '.md', '.csv', '.json']:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read(2000)  # Preview only the first 2000 chars.
            # Fix: escape the content so markup inside the file cannot break
            # (or inject into) the preview HTML.
            return f"<pre style='max-height:300px;overflow:auto'>{html.escape(content)}</pre>"
        elif ext == '.pdf':
            # Show the PDF inline via an HTML embed element.
            return f"<embed src='{file_path}' type='application/pdf' width='100%' height='400px' />"
        elif ext == '.docx':
            try:
                doc = Document(file_path)
                # Note: accumulator renamed from `html` to avoid shadowing the
                # stdlib html module used for escaping.
                parts = []
                # Collect header paragraph text and header-table rows per section.
                headers = []
                for section in doc.sections:
                    header_texts = []
                    for p in section.header.paragraphs:
                        if p.text.strip():
                            header_texts.append(html.escape(p.text.strip()))
                    for table in section.header.tables:
                        for row in table.rows:
                            row_text = " ".join(
                                html.escape(cell.text.strip())
                                for cell in row.cells if cell.text.strip()
                            )
                            if row_text:
                                header_texts.append(row_text)
                    if header_texts:
                        headers.append(" | ".join(header_texts))
                if headers:
                    parts.append(
                        f"<div style='font-weight:bold;font-size:1.2em;margin-bottom:8px;'>{' | '.join(headers)}</div>"
                    )
                # Body paragraphs, truncated after 30 to keep the preview small.
                para_count = 0
                for para in doc.paragraphs:
                    text = para.text.strip()
                    if text:
                        parts.append(f"<p>{html.escape(text)}</p>")
                        para_count += 1
                        if para_count > 30:
                            parts.append("<p><i>Preview truncated...</i></p>")
                            break
                return f"<div style='max-height:300px;overflow:auto'>{''.join(parts)}</div>"
            except Exception as e:
                return f"<b>Error reading DOCX:</b> {e}"
        elif ext == '.doc':
            return f"<b>DOC file:</b> {os.path.basename(file_path)} (Preview not supported)"
        else:
            return f"<b>File:</b> {os.path.basename(file_path)} (Preview not supported)"
    except Exception as e:
        return f"<b>Error generating preview:</b> {e}"
def extract_runs(paragraph):
    """Serialize a paragraph's runs into plain dicts (text plus formatting).

    Only formatting that is explicitly set on a run is recorded, so the dicts
    stay minimal for JSON output.
    """
    serialized = []
    for run in paragraph.runs:
        info = {"text": run.text}
        for flag in ("bold", "italic", "underline"):
            if getattr(run, flag):
                info[flag] = True
        font = run.font
        if font:
            if font.size:
                info["font_size"] = font.size.pt
            if font.name:
                info["font_name"] = font.name
            # Font color: an explicit RGB value wins over a theme color.
            if font.color:
                if font.color.rgb:
                    info["color"] = str(font.color.rgb)
                elif font.color.theme_color:
                    info["color_theme"] = str(font.color.theme_color)
            # Highlight color (attribute may be absent on some objects).
            if getattr(font, "highlight_color", None):
                info["highlight"] = str(font.highlight_color)
        serialized.append(info)
    return serialized
# Classify a paragraph (heading / list item / normal) and serialize it.
def extract_paragraph_block(paragraph):
    """Return a dict block describing *paragraph*, keyed off its style name."""
    style_name = paragraph.style.name if paragraph.style else "Normal"
    alignment = str(paragraph.alignment) if paragraph.alignment else "left"
    common = {
        "runs": extract_runs(paragraph),
        "alignment": alignment,
        "style": style_name,
    }
    if style_name.startswith("Heading"):
        # "Heading 2" -> level 2; anything unparseable falls back to level 1.
        try:
            level = int(style_name.split()[-1])
        except Exception:
            level = 1
        return {"type": "heading", "level": level, **common}
    if "List" in style_name:
        list_kind = "number" if "Number" in style_name else "bullet"
        return {"type": "list_item", "list_type": list_kind, **common}
    return {"type": "paragraph", **common}
# Extract paragraph/table blocks (with paragraph spacing) from any container
# exposing .paragraphs / .tables (document body, header, footer, cell).
def extract_blocks(element, output_dir, image_prefix):
    """Serialize *element*'s paragraphs and tables into a list of block dicts.

    output_dir / image_prefix are accepted for interface symmetry with the
    image extractor; they are not used here.
    """
    blocks = []
    for para in getattr(element, 'paragraphs', []):
        if not para.text.strip():
            continue  # Skip visually empty paragraphs.
        block = extract_paragraph_block(para)
        fmt = para.paragraph_format
        if fmt:
            # Record explicit spacing so it can be restored on round-trip.
            if fmt.space_before:
                block["space_before"] = fmt.space_before.pt
            if fmt.space_after:
                block["space_after"] = fmt.space_after.pt
            if fmt.line_spacing:
                block["line_spacing"] = fmt.line_spacing
        blocks.append(block)
    for table in getattr(element, 'tables', []):
        blocks.append(extract_table_block(table))
    return blocks
def extract_table_block(table):
    """Serialize a table into nested lists of paragraph blocks (rows -> cells)."""
    serialized_rows = []
    for row in table.rows:
        serialized_cells = []
        for cell in row.cells:
            # python-docx can yield the same paragraph object repeatedly for
            # merged cells; deduplicate by object identity.
            unique, seen_ids = [], set()
            for para in cell.paragraphs:
                if id(para) not in seen_ids:
                    seen_ids.add(id(para))
                    unique.append(para)
            serialized_cells.append(
                [extract_paragraph_block(p) for p in unique if p.text.strip()]
            )
        serialized_rows.append(serialized_cells)
    return {"type": "table", "rows": serialized_rows}
def extract_images_from_doc(doc, output_dir, image_prefix):
    """Save every embedded image of *doc* into *output_dir*.

    Filenames are content-addressed with a sha1 prefix so identical images
    collapse to the same file. Returns one image block dict per relationship.
    """
    image_rel_type = 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image'
    blocks = []
    for rel in doc.part.rels.values():
        if rel.reltype != image_rel_type:
            continue
        blob = rel.target_part.blob
        digest = hashlib.sha1(blob).hexdigest()[:8]
        extension = rel.target_part.content_type.split('/')[-1]
        image_id = f"{image_prefix}_{digest}"
        filename = f"{image_id}.{extension}"
        with open(os.path.join(output_dir, filename), 'wb') as out:
            out.write(blob)
        blocks.append({
            "type": "image",
            "image_id": image_id,
            "image_format": extension,
            "path": filename,
        })
    return blocks
def add_runs_to_paragraph(paragraph, runs):
    """Append serialized run dicts (as produced by extract_runs) to *paragraph*."""
    for info in runs:
        run = paragraph.add_run(info.get("text", ""))
        if info.get("bold"):
            run.bold = True
        if info.get("italic"):
            run.italic = True
        if info.get("underline"):
            run.underline = True
        if info.get("font_size"):
            run.font.size = Pt(info["font_size"])
        if info.get("font_name"):
            run.font.name = info["font_name"]
        # Restore the font color: an explicit RGB value wins over a theme index.
        if info.get("color"):
            try:
                run.font.color.rgb = RGBColor.from_string(info["color"].replace("#", ""))
            except Exception:
                pass  # Best effort: ignore malformed color strings.
        elif info.get("color_theme"):
            try:
                run.font.color.theme_color = int(info["color_theme"])
            except Exception:
                pass
        # Highlight may have been stored as a numeric index or an enum name.
        if info.get("highlight"):
            try:
                if info["highlight"].isdigit():
                    run.font.highlight_color = int(info["highlight"])
                else:
                    run.font.highlight_color = WD_COLOR_INDEX[info["highlight"]]
            except Exception:
                pass
# Add heading and list support
def add_block_to_doc(doc, block, image_dir):
    """Render one serialized block dict into *doc*.

    *doc* may be a Document, a table cell, or a header/footer part — anything
    exposing the add_heading/add_paragraph/add_table/add_picture methods the
    block type needs. *image_dir* is where image blocks resolve their files.
    """
    if block["type"] == "heading":
        level = block.get("level", 1)
        text = "".join([r.get("text", "") for r in block.get("runs", [])])
        # NOTE(review): add_heading already inserts *text*, and the same runs
        # are appended again below — headings may end up with duplicated text;
        # confirm against a round-trip sample.
        para = doc.add_heading(text, level=level)
        add_runs_to_paragraph(para, block.get("runs", []))
        align = block.get("alignment", "left")
        if align == "center": para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        elif align == "right": para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        else: para.alignment = WD_ALIGN_PARAGRAPH.LEFT
        # Spacing
        if "space_before" in block: para.paragraph_format.space_before = Pt(block["space_before"])
        if "space_after" in block: para.paragraph_format.space_after = Pt(block["space_after"])
        if "line_spacing" in block: para.paragraph_format.line_spacing = block["line_spacing"]
    elif block["type"] == "list_item":
        # Map the serialized list type back to a built-in Word list style.
        style = "List Number" if block.get("list_type") == "number" else "List Bullet"
        para = doc.add_paragraph(style=style)
        add_runs_to_paragraph(para, block.get("runs", []))
        align = block.get("alignment", "left")
        if align == "center": para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        elif align == "right": para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        else: para.alignment = WD_ALIGN_PARAGRAPH.LEFT
        if "space_before" in block: para.paragraph_format.space_before = Pt(block["space_before"])
        if "space_after" in block: para.paragraph_format.space_after = Pt(block["space_after"])
        if "line_spacing" in block: para.paragraph_format.line_spacing = block["line_spacing"]
    elif block["type"] == "paragraph":
        para = doc.add_paragraph()
        add_runs_to_paragraph(para, block.get("runs", []))
        align = block.get("alignment", "left")
        if align == "center": para.alignment = WD_ALIGN_PARAGRAPH.CENTER
        elif align == "right": para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
        else: para.alignment = WD_ALIGN_PARAGRAPH.LEFT
        if "space_before" in block: para.paragraph_format.space_before = Pt(block["space_before"])
        if "space_after" in block: para.paragraph_format.space_after = Pt(block["space_after"])
        if "line_spacing" in block: para.paragraph_format.line_spacing = block["line_spacing"]
    elif block["type"] == "table":
        rows = block.get("rows", [])
        if rows:
            # Size the table to the page width when available.
            try:
                section = doc.sections[0]
                table_width = section.page_width
            except Exception:
                table_width = Inches(6)
            # NOTE(review): python-docx Document.add_table() takes (rows, cols,
            # style) — only _Cell.add_table accepts a width argument, so this
            # call may raise TypeError for document-level tables; confirm.
            table = doc.add_table(rows=len(rows), cols=len(rows[0]), width=table_width)
            for i, row in enumerate(rows):
                for j, cell_blocks in enumerate(row):
                    cell = table.cell(i, j)
                    # Recurse: render each serialized paragraph into the cell.
                    for para_block in cell_blocks:
                        add_block_to_doc(cell, para_block, image_dir)
    elif block["type"] == "image":
        img_path = os.path.join(image_dir, block["path"])
        width = block.get("width")
        height = block.get("height")
        if os.path.exists(img_path):
            if width and height:
                # Presumably width/height are pixels at 96 DPI — TODO confirm.
                doc.add_picture(img_path, width=Inches(width/96), height=Inches(height/96))
            else:
                doc.add_picture(img_path)
def add_blocks_to_doc(doc, blocks, image_dir):
    """Render a list of serialized blocks into *doc* (document, cell, header or footer)."""
    for block in blocks:
        # If doc is a header/footer, use add_paragraph directly
        # NOTE(review): the hasattr chain ends with 'add_paragraph', which every
        # python-docx block container exposes, so the else branch below appears
        # to be dead code — confirm before removing.
        if hasattr(doc, 'is_header') or hasattr(doc, 'is_footer') or hasattr(doc, 'add_paragraph'):
            add_block_to_doc(doc, block, image_dir)
        else:
            # If doc is a SectionHeader or SectionFooter (python-docx), use .add_paragraph()
            # Best-effort: any rendering failure here is deliberately swallowed.
            try:
                add_block_to_doc(doc, block, image_dir)
            except Exception:
                pass
def extract_all_sections(doc, output_dir, image_prefix):
    """Serialize the headers and footers of every section in *doc*.

    Returns one dict per section mapping header/footer kind to its block list;
    kinds that the section does not define are simply omitted.
    """
    kinds = (
        "header", "first_page_header", "even_page_header",
        "footer", "first_page_footer", "even_page_footer",
    )
    serialized = []
    for idx, section in enumerate(doc.sections):
        entry = {}
        for kind in kinds:
            part = getattr(section, kind, None)
            if part:
                entry[kind] = extract_blocks(part, output_dir, f"{image_prefix}_sec{idx}_{kind}")
        serialized.append(entry)
    return serialized
def convert_document(doc_file, target_format):
    """Convert *doc_file* to *target_format* and build HTML previews.

    Returns a tuple ``(input_preview_html, output_preview_html, output_path)``.
    NOTE(review): on failure the error string's position is inconsistent —
    the unsupported-input case puts it in the second slot, the generic
    exception handler in the first; callers unpacking positionally should be
    checked.
    """
    import json
    from docx import Document as DocxDocument
    try:
        target_format = target_format.lower()
        orig_file_path = None
        # Handle Gradio NamedString or file-like object
        if hasattr(doc_file, 'name'):
            orig_file_path = doc_file.name
        elif isinstance(doc_file, str):
            orig_file_path = doc_file
        else:
            return None, "Error: Unsupported file type.", None
        # If the file is a PDF, convert it to DOCX first
        if orig_file_path.lower().endswith('.pdf'):
            print("Converting PDF to DOCX...")
            doc_file = convert_pdf_to_docx(orig_file_path)
            print("PDF converted to DOCX.")
            orig_file_path = doc_file
        base_name = os.path.splitext(os.path.basename(orig_file_path))[0]
        # The output file lands in the current working directory.
        output_file = f"docgen_{base_name}.{target_format.lower()}"
        # Custom DOCX to JSON extraction
        if orig_file_path.lower().endswith('.docx') and target_format == 'json':
            doc = Document(orig_file_path)
            output_dir = os.path.dirname(output_file)  # '' -> current directory
            image_prefix = base_name
            image_blocks = extract_images_from_doc(doc, output_dir, image_prefix)
            sections = extract_all_sections(doc, output_dir, image_prefix)
            body_blocks = extract_blocks(doc, output_dir, image_prefix)
            # Images are appended after the text blocks in the serialized body.
            doc_json = {
                "sections": sections,
                "body": body_blocks + image_blocks,
                "metadata": {
                    "title": getattr(doc.core_properties, 'title', ''),
                    "author": getattr(doc.core_properties, 'author', ''),
                }
            }
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(doc_json, f, ensure_ascii=False, indent=2)
        elif orig_file_path.lower().endswith('.json') and target_format == 'docx':
            # JSON to DOCX
            with open(orig_file_path, 'r', encoding='utf-8') as f:
                doc_json = json.load(f)
            doc = DocxDocument()
            image_dir = os.path.dirname(orig_file_path)
            # Sections (headers/footers)
            if "sections" in doc_json:
                # Ensure doc has enough sections
                while len(doc.sections) < len(doc_json["sections"]):
                    doc.add_section()
                for idx, sec in enumerate(doc_json["sections"]):
                    section = doc.sections[idx]
                    for htype, attr in [("header", "header"), ("first_page_header", "first_page_header"), ("even_page_header", "even_page_header"),
                                        ("footer", "footer"), ("first_page_footer", "first_page_footer"), ("even_page_footer", "even_page_footer")]:
                        if htype in sec:
                            part = getattr(section, attr, None)
                            if part:
                                # Remove all default paragraphs
                                for p in list(part.paragraphs):
                                    p._element.getparent().remove(p._element)
                                add_blocks_to_doc(part, sec[htype], image_dir)
            # Body
            if "body" in doc_json:
                add_blocks_to_doc(doc, doc_json["body"], image_dir)
            # Metadata
            if "metadata" in doc_json:
                meta = doc_json["metadata"]
                if "title" in meta:
                    doc.core_properties.title = meta["title"]
                if "author" in meta:
                    doc.core_properties.author = meta["author"]
            doc.save(output_file)
        else:
            # Use Pandoc for other conversions
            pypandoc.convert_file(
                orig_file_path,
                target_format.lower(),
                outputfile=output_file,
            )
        input_preview = get_preview(orig_file_path)
        output_preview = get_preview(output_file)
        return input_preview, output_preview, output_file
    except Exception as e:
        return f"Error: {e}", None, None
def parity_check(docx_path):
    """Round-trip *docx_path* through DOCX -> JSON -> DOCX and compare blocks.

    Prints a PASS/FAIL report (with unified diffs per mismatching section and
    for the body) and returns True only when the serialized blocks match.
    """
    import tempfile  # NOTE(review): unused here — candidate for removal.
    print(f"[Parity Check] Testing round-trip for: {docx_path}")
    class FileLike:  # Fake file-like for CLI
        def __init__(self, name): self.name = name
    _, _, json_out = convert_document(FileLike(docx_path), 'json')
    if not json_out or not os.path.exists(json_out):
        print("Failed to produce JSON from DOCX.")
        return False
    _, _, docx_out = convert_document(FileLike(json_out), 'docx')
    if not docx_out or not os.path.exists(docx_out):
        print("Failed to produce DOCX from JSON.")
        return False
    def extract_all_sections_for_parity(docx_path):
        # Re-serialize sections and body for comparison (mirrors
        # extract_all_sections + extract_blocks on the document body).
        doc = Document(docx_path)
        sections = []
        for idx, section in enumerate(doc.sections):
            sec = {}
            for htype, attr in [("header", "header"), ("first_page_header", "first_page_header"), ("even_page_header", "even_page_header"),
                                ("footer", "footer"), ("first_page_footer", "first_page_footer"), ("even_page_footer", "even_page_footer")]:
                part = getattr(section, attr, None)
                if part:
                    sec[htype] = extract_blocks(part, os.path.dirname(docx_path), f"sec{idx}_{htype}")
            sections.append(sec)
        body = extract_blocks(doc, os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0])
        return {"sections": sections, "body": body}
    orig = extract_all_sections_for_parity(docx_path)
    roundtrip = extract_all_sections_for_parity(docx_out)
    import difflib, pprint
    def blocks_to_str(blocks):
        # Stable pretty-printed form used for line-based diffing.
        return pprint.pformat(blocks, width=120)
    if orig == roundtrip:
        print("[Parity Check] PASS: Round-trip blocks are identical!")
        return True
    else:
        print("[Parity Check] FAIL: Differences found.")
        # Compare per section
        for idx, (orig_sec, round_sec) in enumerate(zip(orig["sections"], roundtrip["sections"])):
            if orig_sec != round_sec:
                print(f"Section {idx} header/footer mismatch:")
                diff = difflib.unified_diff(blocks_to_str(orig_sec).splitlines(), blocks_to_str(round_sec).splitlines(), fromfile='original', tofile='roundtrip', lineterm='')
                print('\n'.join(diff))
        if orig["body"] != roundtrip["body"]:
            print("Body mismatch:")
            diff = difflib.unified_diff(blocks_to_str(orig["body"]).splitlines(), blocks_to_str(roundtrip["body"]).splitlines(), fromfile='original', tofile='roundtrip', lineterm='')
            print('\n'.join(diff))
        return False
# Gradio UI: upload -> JSON preview on the left, converted output on the right.
with gr.Blocks(css="footer {visibility: hidden}") as demo:
    gr.Markdown("# Document Format Converter\nUpload a document and preview as JSON. Select a format to download in another format.")
    with gr.Row():
        with gr.Column():
            input_file = gr.File(label="Upload Document", file_types=[f'.{ext.lower()}' for ext in input_supported_formats])
            input_preview = gr.HTML(label="JSON Preview")
        with gr.Column():
            output_format = gr.Dropdown(label="Download As...", choices=output_supported_formats, value="DOCX")
            format_label = gr.Markdown("Previewing as: DOCX")
            output_preview = gr.HTML(label="Output Preview")
            output_file = gr.File(label="Download Converted Document", visible=True)
    # State carried between the upload and the format-change handlers.
    json_state = gr.State()
    orig_file_state = gr.State()

    def upload_and_preview(doc_file):
        """Serialize the uploaded file to JSON and show it in the left preview.

        Returns (preview_html, json_content, original_path) for the outputs
        wired below. NOTE(review): doc_file.name assumes Gradio always passes
        an object with a .name attribute — a plain string would raise here.
        """
        _, _, json_path = convert_document(doc_file, "json")
        # Handle conversion failure
        if not json_path or not os.path.exists(json_path):
            error_msg = "Error converting document to JSON."
            return f"<pre style='max-height:300px;overflow:auto'>{error_msg}</pre>", "", doc_file.name
        # Read and preview JSON content
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                json_content = f.read()
        except Exception as e:
            error_msg = f"Error reading JSON: {e}"
            return f"<pre style='max-height:300px;overflow:auto'>{error_msg}</pre>", "", doc_file.name
        # Only the first 4000 characters are shown in the preview pane.
        preview_html = f"<pre style='max-height:300px;overflow:auto'>{json_content[:4000]}</pre>"
        return preview_html, json_content, doc_file.name

    def convert_and_preview(orig_file_path, output_format):
        """Convert the stored original file to the newly selected format."""
        class F:
            # Minimal stand-in for a Gradio upload object (only .name is read).
            name = orig_file_path
        _, _, out_path = convert_document(F(), output_format.lower())
        preview = get_preview(out_path)
        return f"Previewing as: {output_format}", preview, out_path

    input_file.upload(upload_and_preview, inputs=input_file, outputs=[input_preview, json_state, orig_file_state])
    output_format.change(convert_and_preview, inputs=[orig_file_state, output_format], outputs=[format_label, output_preview, output_file])
if __name__ == "__main__":
    # CLI mode: `python app.py --parity-check file.docx` runs the round-trip
    # test and exits without starting any server.
    if len(sys.argv) == 3 and sys.argv[1] == "--parity-check":
        parity_check(sys.argv[2])
        sys.exit(0)

    # Generate a random API key if one doesn't exist in environment variables.
    API_KEY = os.environ.get('API_KEY', secrets.token_urlsafe(32))
    print(f"API Key: {API_KEY}")  # Printed once at startup so the operator can copy it.

    # Flask app serving the JSON <-> DOCX REST endpoints alongside the Gradio UI.
    app = Flask(__name__)

    def check_api_key():
        """Check if the API key is valid."""
        provided_key = request.headers.get('X-API-Key')
        return bool(provided_key) and provided_key == API_KEY

    @app.route('/api/docx-to-json', methods=['POST'])
    def api_docx_to_json():
        """Accept a DOCX upload and return its JSON serialization."""
        if not check_api_key():
            return jsonify({"error": "Invalid or missing API key"}), 401
        if 'file' not in request.files:
            return jsonify({"error": "No file part"}), 400
        file = request.files['file']
        if file.filename == '':
            return jsonify({"error": "No selected file"}), 400
        if not file.filename.lower().endswith('.docx'):
            return jsonify({"error": "File must be a DOCX document"}), 400
        # Save the upload under a fresh temp dir. Fix: keep only the basename
        # of the client-supplied filename to prevent path traversal (e.g.
        # "../../evil.docx" escaping the temp directory).
        temp_dir = tempfile.mkdtemp()
        file_path = os.path.join(temp_dir, os.path.basename(file.filename))
        file.save(file_path)
        try:
            # Convert to JSON via the shared conversion pipeline.
            _, _, json_path = convert_document(type('obj', (object,), {'name': file_path}), "json")
            if not json_path or not os.path.exists(json_path):
                return jsonify({"error": "Error converting document to JSON"}), 500
            with open(json_path, "r", encoding="utf-8") as f:
                json_content = json.load(f)
            return jsonify(json_content)
        except Exception as e:
            return jsonify({"error": str(e)}), 500

    @app.route('/api/json-to-docx', methods=['POST'])
    def api_json_to_docx():
        """Accept a JSON document body and return the rendered DOCX file."""
        if not check_api_key():
            return jsonify({"error": "Invalid or missing API key"}), 401
        if not request.is_json:
            return jsonify({"error": "Request must be JSON"}), 400
        try:
            # Persist the JSON payload so convert_document can read it from disk.
            temp_dir = tempfile.mkdtemp()
            json_path = os.path.join(temp_dir, "document.json")
            with open(json_path, "w", encoding="utf-8") as f:
                json.dump(request.json, f)
            _, _, docx_path = convert_document(type('obj', (object,), {'name': json_path}), "docx")
            if not docx_path or not os.path.exists(docx_path):
                return jsonify({"error": "Error converting JSON to DOCX"}), 500
            return send_file(docx_path, as_attachment=True, download_name="converted.docx")
        except Exception as e:
            return jsonify({"error": str(e)}), 500

    # Run Flask in a daemon thread so Gradio can own the main thread.
    def run_flask():
        app.run(host='0.0.0.0', port=5000)

    flask_thread = threading.Thread(target=run_flask)
    flask_thread.daemon = True
    flask_thread.start()

    # Start the Gradio UI (blocking call).
    demo.launch(share=True)