import gradio as gr
import pypandoc
import os
from pdf2docx import Converter
from docx import Document
from docx.table import _Cell
from docx.shared import Inches, Pt, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_COLOR_INDEX
from docx.oxml.ns import qn
import json
import base64
import hashlib
import sys
import tempfile
from flask import Flask, request, jsonify, send_file
import threading
import secrets

# TeX Live is needed for Pandoc's PDF output support.
# BUG FIX: '-y' added — without it apt-get waits for interactive confirmation
# and hangs forever in a non-interactive environment (e.g. a hosted Space).
os.system('sudo apt-get install -y texlive')


def ensure_pandoc_installed():
    """Ensure a Pandoc binary is available, downloading one if it is missing."""
    try:
        # Raises OSError when no pandoc binary can be located.
        pypandoc.get_pandoc_version()
        print("Pandoc is already installed and accessible.")
    except OSError:
        print("Pandoc not found, downloading...")
        pypandoc.download_pandoc()
        print("Pandoc downloaded successfully.")


# Make sure Pandoc is installed before anything queries it.
ensure_pandoc_installed()

# Static fallback list of input formats, used only if the live Pandoc query fails.
_FALLBACK_INPUT_FORMATS = [
    'BIBLATEX', 'BIBTEX', 'BITS', 'COMMONMARK', 'COMMONMARK_X', 'CREOLE',
    'CSLJSON', 'CSV', 'DJOT', 'DOCBOOK', 'DOCX', 'DOKUWIKI', 'ENDNOTEXML',
    'EPUB', 'FB2', 'GFM', 'HADDOCK', 'HTML', 'IPYNB', 'JATS', 'JIRA', 'JSON',
    'LATEX', 'MAN', 'MARKDOWN', 'MARKDOWN_GITHUB', 'MARKDOWN_MMD',
    'MARKDOWN_PHPEXTRA', 'MARKDOWN_STRICT', 'MDOC', 'MEDIAWIKI', 'MUSE',
    'NATIVE', 'ODT', 'OPML', 'ORG', 'PDF', 'POD', 'RIS', 'RST', 'RTF', 'T2T',
    'TEXTILE', 'TIKIWIKI', 'TSV', 'TWIKI', 'TYPST', 'VIMWIKI'
]


def _build_input_formats():
    """Return the sorted, upper-cased list of accepted upload formats.

    BUG FIX: the original expression was
    ``sorted(list(pypandoc.get_pandoc_formats()[0]).append('PDF') or [...])``.
    ``list.append`` returns ``None``, so the ``or`` branch ALWAYS took the
    static fallback list and the live Pandoc format list was silently
    discarded.  'PDF' is kept in the list because PDF uploads are explicitly
    supported below via pdf2docx (the original filtered it back out, which
    made the upload widget reject .pdf files despite the PDF handling code).
    """
    try:
        formats = [fmt.upper() for fmt in pypandoc.get_pandoc_formats()[0]]
    except Exception:
        formats = list(_FALLBACK_INPUT_FORMATS)
    if 'PDF' not in formats:
        formats.append('PDF')
    return sorted(set(formats))


input_supported_formats = _build_input_formats()

# Output formats: everything Pandoc can write, except PDF (which would need a
# LaTeX engine invocation and is not offered as a download target).
output_supported_formats = [
    fmt for fmt in sorted([
        "ANSI", "ASCIIDOC", "ASCIIDOC_LEGACY", "ASCIIDOCTOR", "BEAMER",
        "BIBLATEX", "BIBTEX", "CHUNKEDHTML", "COMMONMARK", "COMMONMARK_X",
        "CONTEXT", "CSLJSON", "DJOT", "DOCBOOK", "DOCBOOK4", "DOCBOOK5",
        "DOCX", "DOKUWIKI", "DZSLIDES", "EPUB", "EPUB2", "EPUB3", "FB2",
        "GFM", "HADDOCK", "HTML", "HTML4", "HTML5", "ICML", "IPYNB", "JATS",
        "JATS_ARCHIVING", "JATS_ARTICLEAUTHORING", "JATS_PUBLISHING", "JIRA",
        "JSON", "LATEX", "MAN", "MARKDOWN", "MARKDOWN_GITHUB", "MARKDOWN_MMD",
        "MARKDOWN_PHPEXTRA", "MARKDOWN_STRICT", "MARKUA", "MEDIAWIKI", "MS",
        "MUSE", "NATIVE", "ODT", "OPENDOCUMENT", "OPML", "ORG", "PDF",
        "PLAIN", "PPTX", "REVEALJS", "RST", "RTF", "S5", "SLIDEOUS", "SLIDY",
        "TEI", "TEXINFO", "TEXTILE", "TYPST", "XWIKI", "ZIMWIKI"
    ]) if fmt != 'PDF'
]

# Header/footer parts inspected on every section.  Hoisted to one constant so
# extraction, reconstruction and the parity check all agree (the original
# repeated this list three times).
_HEADER_FOOTER_ATTRS = [
    ("header", "header"),
    ("first_page_header", "first_page_header"),
    ("even_page_header", "even_page_header"),
    ("footer", "footer"),
    ("first_page_footer", "first_page_footer"),
    ("even_page_footer", "even_page_footer"),
]


def convert_pdf_to_docx(pdf_file):
    """Convert *pdf_file* to a DOCX file next to it and return the new path."""
    output_docx = f"{os.path.splitext(pdf_file)[0]}.docx"
    cv = Converter(pdf_file)
    try:
        cv.convert(output_docx, start=0, end=None)
    finally:
        # BUG FIX: the Converter was never closed (leaked the open PDF handle).
        cv.close()
    return output_docx


def get_preview(file_path):
    """Return an HTML fragment previewing *file_path* (best effort).

    NOTE(review): the original HTML markup inside these f-strings was lost
    when the file was mangled (tags stripped); the markup below is a
    reconstruction of the evident intent — confirm against the live UI.
    """
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext in ['.txt', '.md', '.csv', '.json']:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read(2000)  # Preview first 2000 chars
            return f"<pre style='white-space: pre-wrap;'>{content}</pre>"
        elif ext == '.pdf':
            # Show PDF inline using an HTML embed (base64 data URI).
            with open(file_path, 'rb') as f:
                pdf_b64 = base64.b64encode(f.read()).decode('ascii')
            return (
                f"<embed src='data:application/pdf;base64,{pdf_b64}' "
                "type='application/pdf' width='100%' height='600px'/>"
            )
        elif ext == '.docx':
            try:
                doc = Document(file_path)
                html = ""
                # Extract header(s) with paragraphs and tables.
                headers = []
                for section in doc.sections:
                    header_texts = []
                    # Paragraphs
                    for p in section.header.paragraphs:
                        if p.text.strip():
                            header_texts.append(p.text.strip())
                    # Tables
                    for table in section.header.tables:
                        for row in table.rows:
                            row_text = " ".join(
                                cell.text.strip()
                                for cell in row.cells if cell.text.strip()
                            )
                            if row_text:
                                header_texts.append(row_text)
                    if header_texts:
                        headers.append(" | ".join(header_texts))
                if headers:
                    html += f"<div style='color: #888;'>{' | '.join(headers)}</div>"
                # Body paragraphs, truncated after 30 to keep the preview small.
                para_count = 0
                for para in doc.paragraphs:
                    text = para.text.strip()
                    if text:
                        html += f"<p>{text}</p>"
                        para_count += 1
                        if para_count > 30:
                            html += "<p><i>Preview truncated...</i></p>"
                            break
                return f"<div>{html}</div>"
            except Exception as e:
                return f"Error reading DOCX: {e}"
        elif ext == '.doc':
            return f"DOC file: {os.path.basename(file_path)} (Preview not supported)"
        else:
            return f"File: {os.path.basename(file_path)} (Preview not supported)"
    except Exception as e:
        return f"Error generating preview: {e}"


def extract_runs(paragraph):
    """Serialize a paragraph's runs (text + character formatting) to dicts."""
    runs = []
    for run in paragraph.runs:
        run_data = {"text": run.text}
        if run.bold:
            run_data["bold"] = True
        if run.italic:
            run_data["italic"] = True
        if run.underline:
            run_data["underline"] = True
        if run.font and run.font.size:
            run_data["font_size"] = run.font.size.pt
        if run.font and run.font.name:
            run_data["font_name"] = run.font.name
        # Extract color (RGB or theme).
        if run.font and run.font.color:
            if run.font.color.rgb:
                run_data["color"] = str(run.font.color.rgb)
            elif run.font.color.theme_color:
                run_data["color_theme"] = str(run.font.color.theme_color)
        # Highlight color.
        if run.font and hasattr(run.font, "highlight_color") and run.font.highlight_color:
            run_data["highlight"] = str(run.font.highlight_color)
        runs.append(run_data)
    return runs


def extract_paragraph_block(paragraph):
    """Classify a paragraph as heading / list_item / paragraph and serialize it."""
    style_name = paragraph.style.name if paragraph.style else "Normal"
    alignment = str(paragraph.alignment) if paragraph.alignment else "left"
    # Heading styles are named "Heading N".
    if style_name.startswith("Heading"):
        try:
            level = int(style_name.split()[-1])
        except Exception:
            level = 1
        return {
            "type": "heading",
            "level": level,
            "runs": extract_runs(paragraph),
            "alignment": alignment,
            "style": style_name,
        }
    # List styles contain "List" (e.g. "List Bullet", "List Number").
    elif "List" in style_name:
        return {
            "type": "list_item",
            "list_type": "number" if "Number" in style_name else "bullet",
            "runs": extract_runs(paragraph),
            "alignment": alignment,
            "style": style_name,
        }
    else:
        return {
            "type": "paragraph",
            "runs": extract_runs(paragraph),
            "alignment": alignment,
            "style": style_name,
        }


def extract_blocks(element, output_dir, image_prefix):
    """Serialize the paragraphs and tables of *element* to a list of blocks.

    *element* is anything with ``.paragraphs`` / ``.tables`` (document,
    header, footer, cell).  ``output_dir`` and ``image_prefix`` are accepted
    for interface symmetry with the image extractor but are not used here.
    """
    blocks = []
    if hasattr(element, 'paragraphs'):
        for para in element.paragraphs:
            if para.text.strip():
                para_block = extract_paragraph_block(para)
                # Attach paragraph-level spacing info when present.
                pf = para.paragraph_format
                if pf:
                    if pf.space_before:
                        para_block["space_before"] = pf.space_before.pt
                    if pf.space_after:
                        para_block["space_after"] = pf.space_after.pt
                    if pf.line_spacing:
                        para_block["line_spacing"] = pf.line_spacing
                blocks.append(para_block)
    if hasattr(element, 'tables'):
        for table in element.tables:
            blocks.append(extract_table_block(table))
    return blocks


def extract_table_block(table):
    """Serialize a table to nested lists of paragraph blocks."""
    rows = []
    for row in table.rows:
        row_cells = []
        for cell in row.cells:
            # Only take unique paragraphs (python-docx can repeat cell objects
            # for merged cells).
            unique_paras = []
            seen = set()
            for para in cell.paragraphs:
                para_id = id(para)
                if para_id not in seen:
                    unique_paras.append(para)
                    seen.add(para_id)
            row_cells.append([
                extract_paragraph_block(para)
                for para in unique_paras if para.text.strip()
            ])
        rows.append(row_cells)
    return {"type": "table", "rows": rows}


def extract_images_from_doc(doc, output_dir, image_prefix):
    """Dump every embedded image to *output_dir*; return image block dicts.

    File names are content-addressed (sha1 prefix) so re-runs are idempotent.
    """
    image_blocks = []
    rels = doc.part.rels
    for rel in rels.values():
        if rel.reltype == 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image':
            img_blob = rel.target_part.blob
            img_hash = hashlib.sha1(img_blob).hexdigest()[:8]
            img_ext = rel.target_part.content_type.split('/')[-1]
            img_id = f"{image_prefix}_{img_hash}"
            img_filename = f"{img_id}.{img_ext}"
            img_path = os.path.join(output_dir, img_filename)
            with open(img_path, 'wb') as f:
                f.write(img_blob)
            image_blocks.append({
                "type": "image",
                "image_id": img_id,
                "image_format": img_ext,
                "path": img_filename,
            })
    return image_blocks


def add_runs_to_paragraph(paragraph, runs):
    """Re-create serialized runs (text + character formatting) on *paragraph*."""
    for run_info in runs:
        run = paragraph.add_run(run_info.get("text", ""))
        if run_info.get("bold"):
            run.bold = True
        if run_info.get("italic"):
            run.italic = True
        if run_info.get("underline"):
            run.underline = True
        if run_info.get("font_size"):
            run.font.size = Pt(run_info["font_size"])
        if run_info.get("font_name"):
            run.font.name = run_info["font_name"]
        # Set color (RGB or theme); best-effort, ignore malformed values.
        if run_info.get("color"):
            try:
                run.font.color.rgb = RGBColor.from_string(run_info["color"].replace("#", ""))
            except Exception:
                pass
        elif run_info.get("color_theme"):
            try:
                run.font.color.theme_color = int(run_info["color_theme"])
            except Exception:
                pass
        if run_info.get("highlight"):
            # NOTE(review): the extractor stores str(enum) like "YELLOW (7)",
            # which this lookup may not resolve; failures are ignored as before.
            try:
                if run_info["highlight"].isdigit():
                    run.font.highlight_color = int(run_info["highlight"])
                else:
                    run.font.highlight_color = WD_COLOR_INDEX[run_info["highlight"]]
            except Exception:
                pass


def _apply_alignment(para, block):
    """Set paragraph alignment from a serialized block (default: left)."""
    align = block.get("alignment", "left")
    if align == "center":
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    elif align == "right":
        para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
    else:
        para.alignment = WD_ALIGN_PARAGRAPH.LEFT


def _apply_spacing(para, block):
    """Set before/after spacing and line spacing from a serialized block."""
    if "space_before" in block:
        para.paragraph_format.space_before = Pt(block["space_before"])
    if "space_after" in block:
        para.paragraph_format.space_after = Pt(block["space_after"])
    if "line_spacing" in block:
        para.paragraph_format.line_spacing = block["line_spacing"]


def add_block_to_doc(doc, block, image_dir):
    """Append one serialized block (heading/list/paragraph/table/image) to *doc*.

    *doc* may be a Document, a header/footer part, or a table cell — anything
    with ``add_paragraph``.  Headings and tables require Document-level methods
    and will raise on other containers (callers swallow these).
    """
    if block["type"] == "heading":
        level = block.get("level", 1)
        # BUG FIX: the original called add_heading(text, ...) AND then added
        # the runs again, so every heading's text appeared twice.  Create an
        # empty heading paragraph and let the styled runs supply the text.
        para = doc.add_heading("", level=level)
        add_runs_to_paragraph(para, block.get("runs", []))
        _apply_alignment(para, block)
        _apply_spacing(para, block)
    elif block["type"] == "list_item":
        style = "List Number" if block.get("list_type") == "number" else "List Bullet"
        para = doc.add_paragraph(style=style)
        add_runs_to_paragraph(para, block.get("runs", []))
        _apply_alignment(para, block)
        _apply_spacing(para, block)
    elif block["type"] == "paragraph":
        para = doc.add_paragraph()
        add_runs_to_paragraph(para, block.get("runs", []))
        _apply_alignment(para, block)
        _apply_spacing(para, block)
    elif block["type"] == "table":
        rows = block.get("rows", [])
        if rows:
            # BUG FIX: python-docx add_table(rows, cols, style=None) has no
            # 'width' parameter; passing one raised TypeError for every table.
            table = doc.add_table(rows=len(rows), cols=len(rows[0]))
            for i, row in enumerate(rows):
                for j, cell_blocks in enumerate(row):
                    cell = table.cell(i, j)
                    for para_block in cell_blocks:
                        add_block_to_doc(cell, para_block, image_dir)
    elif block["type"] == "image":
        img_path = os.path.join(image_dir, block["path"])
        width = block.get("width")
        height = block.get("height")
        if os.path.exists(img_path):
            if width and height:
                # Pixel dimensions assumed at 96 DPI — TODO confirm source DPI.
                doc.add_picture(img_path, width=Inches(width / 96), height=Inches(height / 96))
            else:
                doc.add_picture(img_path)


def add_blocks_to_doc(doc, blocks, image_dir):
    """Append a list of serialized blocks to *doc*, best-effort.

    Containers that do not support a given block type (e.g. headings inside a
    header part) raise; those failures are deliberately swallowed so the rest
    of the document is still reconstructed.
    """
    for block in blocks:
        try:
            add_block_to_doc(doc, block, image_dir)
        except Exception:
            pass


def extract_all_sections(doc, output_dir, image_prefix):
    """Serialize every section's header/footer variants to block lists."""
    sections = []
    for idx, section in enumerate(doc.sections):
        sec = {}
        for htype, attr in _HEADER_FOOTER_ATTRS:
            part = getattr(section, attr, None)
            if part:
                sec[htype] = extract_blocks(part, output_dir, f"{image_prefix}_sec{idx}_{htype}")
        sections.append(sec)
    return sections


def convert_document(doc_file, target_format):
    """Convert *doc_file* to *target_format*.

    Returns ``(input_preview_html, output_preview_html, output_path)`` on
    success, or ``(error_message, None, None)`` on failure.  DOCX<->JSON uses
    the custom block serialization above; everything else goes through Pandoc.
    PDF input is first converted to DOCX via pdf2docx.
    """
    try:
        target_format = target_format.lower()
        orig_file_path = None
        # Handle Gradio NamedString or file-like object.
        if hasattr(doc_file, 'name'):
            orig_file_path = doc_file.name
        elif isinstance(doc_file, str):
            orig_file_path = doc_file
        else:
            return None, "Error: Unsupported file type.", None

        # If the file is a PDF, convert it to DOCX first.
        if orig_file_path.lower().endswith('.pdf'):
            print("Converting PDF to DOCX...")
            orig_file_path = convert_pdf_to_docx(orig_file_path)
            print("PDF converted to DOCX.")

        base_name = os.path.splitext(os.path.basename(orig_file_path))[0]
        output_file = f"docgen_{base_name}.{target_format}"

        if orig_file_path.lower().endswith('.docx') and target_format == 'json':
            # Custom DOCX -> JSON extraction.
            doc = Document(orig_file_path)
            output_dir = os.path.dirname(output_file)
            image_prefix = base_name
            image_blocks = extract_images_from_doc(doc, output_dir, image_prefix)
            sections = extract_all_sections(doc, output_dir, image_prefix)
            body_blocks = extract_blocks(doc, output_dir, image_prefix)
            doc_json = {
                "sections": sections,
                "body": body_blocks + image_blocks,
                "metadata": {
                    "title": getattr(doc.core_properties, 'title', ''),
                    "author": getattr(doc.core_properties, 'author', ''),
                },
            }
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(doc_json, f, ensure_ascii=False, indent=2)
        elif orig_file_path.lower().endswith('.json') and target_format == 'docx':
            # Custom JSON -> DOCX reconstruction.
            with open(orig_file_path, 'r', encoding='utf-8') as f:
                doc_json = json.load(f)
            doc = Document()
            image_dir = os.path.dirname(orig_file_path)
            # Sections (headers/footers).
            if "sections" in doc_json:
                # Ensure the document has enough sections.
                while len(doc.sections) < len(doc_json["sections"]):
                    doc.add_section()
                for idx, sec in enumerate(doc_json["sections"]):
                    section = doc.sections[idx]
                    for htype, attr in _HEADER_FOOTER_ATTRS:
                        if htype in sec:
                            part = getattr(section, attr, None)
                            if part:
                                # Remove all default paragraphs first.
                                for p in list(part.paragraphs):
                                    p._element.getparent().remove(p._element)
                                add_blocks_to_doc(part, sec[htype], image_dir)
            # Body.
            if "body" in doc_json:
                add_blocks_to_doc(doc, doc_json["body"], image_dir)
            # Metadata.
            if "metadata" in doc_json:
                meta = doc_json["metadata"]
                if "title" in meta:
                    doc.core_properties.title = meta["title"]
                if "author" in meta:
                    doc.core_properties.author = meta["author"]
            doc.save(output_file)
        else:
            # Use Pandoc for all other conversions.
            pypandoc.convert_file(orig_file_path, target_format, outputfile=output_file)

        input_preview = get_preview(orig_file_path)
        output_preview = get_preview(output_file)
        return input_preview, output_preview, output_file
    except Exception as e:
        return f"Error: {e}", None, None


def parity_check(docx_path):
    """Round-trip *docx_path* through JSON and back; report block-level diffs.

    Returns True when the extracted blocks of the original and the round-trip
    DOCX are identical, False otherwise (with a unified diff printed).
    """
    import difflib
    import pprint

    print(f"[Parity Check] Testing round-trip for: {docx_path}")

    class FileLike:
        # Fake file-like object so the CLI can reuse convert_document().
        def __init__(self, name):
            self.name = name

    _, _, json_out = convert_document(FileLike(docx_path), 'json')
    if not json_out or not os.path.exists(json_out):
        print("Failed to produce JSON from DOCX.")
        return False
    _, _, docx_out = convert_document(FileLike(json_out), 'docx')
    if not docx_out or not os.path.exists(docx_out):
        print("Failed to produce DOCX from JSON.")
        return False

    def extract_for_parity(path):
        # Serialize sections and body the same way convert_document does.
        doc = Document(path)
        sections = []
        for idx, section in enumerate(doc.sections):
            sec = {}
            for htype, attr in _HEADER_FOOTER_ATTRS:
                part = getattr(section, attr, None)
                if part:
                    sec[htype] = extract_blocks(part, os.path.dirname(path), f"sec{idx}_{htype}")
            sections.append(sec)
        body = extract_blocks(doc, os.path.dirname(path),
                              os.path.splitext(os.path.basename(path))[0])
        return {"sections": sections, "body": body}

    def blocks_to_str(blocks):
        return pprint.pformat(blocks, width=120)

    orig = extract_for_parity(docx_path)
    roundtrip = extract_for_parity(docx_out)

    if orig == roundtrip:
        print("[Parity Check] PASS: Round-trip blocks are identical!")
        return True

    print("[Parity Check] FAIL: Differences found.")
    # Compare per section.
    for idx, (orig_sec, round_sec) in enumerate(zip(orig["sections"], roundtrip["sections"])):
        if orig_sec != round_sec:
            print(f"Section {idx} header/footer mismatch:")
            diff = difflib.unified_diff(
                blocks_to_str(orig_sec).splitlines(),
                blocks_to_str(round_sec).splitlines(),
                fromfile='original', tofile='roundtrip', lineterm='')
            print('\n'.join(diff))
    if orig["body"] != roundtrip["body"]:
        print("Body mismatch:")
        diff = difflib.unified_diff(
            blocks_to_str(orig["body"]).splitlines(),
            blocks_to_str(roundtrip["body"]).splitlines(),
            fromfile='original', tofile='roundtrip', lineterm='')
        print('\n'.join(diff))
    return False


with gr.Blocks(css="footer {visibility: hidden}") as demo:
    gr.Markdown("# Document Format Converter\nUpload a document and preview as JSON. "
                "Select a format to download in another format.")
    with gr.Row():
        with gr.Column():
            input_file = gr.File(
                label="Upload Document",
                file_types=[f'.{ext.lower()}' for ext in input_supported_formats])
            input_preview = gr.HTML(label="JSON Preview")
        with gr.Column():
            output_format = gr.Dropdown(
                label="Download As...", choices=output_supported_formats, value="DOCX")
            format_label = gr.Markdown("Previewing as: DOCX")
            output_preview = gr.HTML(label="Output Preview")
            output_file = gr.File(label="Download Converted Document", visible=True)
    json_state = gr.State()
    orig_file_state = gr.State()

    def upload_and_preview(doc_file):
        """Convert the upload to JSON and show it; keep state for later converts."""
        _, _, json_path = convert_document(doc_file, "json")
        # Handle conversion failure.
        if not json_path or not os.path.exists(json_path):
            error_msg = "Error converting document to JSON."
            return f"<div style='color: red;'>{error_msg}</div>", "", doc_file.name
        # Read and preview the JSON content.
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                json_content = f.read()
        except Exception as e:
            error_msg = f"Error reading JSON: {e}"
            return f"<div style='color: red;'>{error_msg}</div>", "", doc_file.name
        preview_html = f"<pre style='white-space: pre-wrap;'>{json_content[:4000]}</pre>"
        return preview_html, json_content, doc_file.name

    def convert_and_preview(orig_file_path, output_format):
        """Convert the stored upload to the chosen format and preview it."""
        class F:
            name = orig_file_path
        _, _, out_path = convert_document(F(), output_format.lower())
        preview = get_preview(out_path)
        return f"Previewing as: {output_format}", preview, out_path

    input_file.upload(upload_and_preview, inputs=input_file,
                      outputs=[input_preview, json_state, orig_file_state])
    output_format.change(convert_and_preview, inputs=[orig_file_state, output_format],
                         outputs=[format_label, output_preview, output_file])


if __name__ == "__main__":
    # CLI mode: `script.py --parity-check file.docx` runs the round-trip test.
    if len(sys.argv) == 3 and sys.argv[1] == "--parity-check":
        parity_check(sys.argv[2])
        sys.exit(0)

    # Generate a random API key if one doesn't exist in environment variables.
    API_KEY = os.environ.get('API_KEY', secrets.token_urlsafe(32))
    print(f"API Key: {API_KEY}")  # Print the API key when the app starts.

    # Flask app exposing the DOCX<->JSON conversions as API endpoints.
    app = Flask(__name__)

    def check_api_key():
        """Return True when the request carries the correct X-API-Key header."""
        provided_key = request.headers.get('X-API-Key')
        # secrets.compare_digest avoids timing side channels on the comparison.
        return bool(provided_key) and secrets.compare_digest(provided_key, API_KEY)

    @app.route('/api/docx-to-json', methods=['POST'])
    def api_docx_to_json():
        """POST a DOCX file ('file' form field); returns its JSON serialization."""
        if not check_api_key():
            return jsonify({"error": "Invalid or missing API key"}), 401
        if 'file' not in request.files:
            return jsonify({"error": "No file part"}), 400
        file = request.files['file']
        if file.filename == '':
            return jsonify({"error": "No selected file"}), 400
        if not file.filename.lower().endswith('.docx'):
            return jsonify({"error": "File must be a DOCX document"}), 400
        # Save the uploaded file to a private temp dir.
        temp_dir = tempfile.mkdtemp()
        file_path = os.path.join(temp_dir, file.filename)
        file.save(file_path)
        try:
            _, _, json_path = convert_document(
                type('obj', (object,), {'name': file_path}), "json")
            if not json_path or not os.path.exists(json_path):
                return jsonify({"error": "Error converting document to JSON"}), 500
            with open(json_path, "r", encoding="utf-8") as f:
                json_content = json.load(f)
            return jsonify(json_content)
        except Exception as e:
            return jsonify({"error": str(e)}), 500

    @app.route('/api/json-to-docx', methods=['POST'])
    def api_json_to_docx():
        """POST a JSON document body; returns the reconstructed DOCX file."""
        if not check_api_key():
            return jsonify({"error": "Invalid or missing API key"}), 401
        if not request.is_json:
            return jsonify({"error": "Request must be JSON"}), 400
        try:
            # Save the JSON to a temporary file.
            temp_dir = tempfile.mkdtemp()
            json_path = os.path.join(temp_dir, "document.json")
            with open(json_path, "w", encoding="utf-8") as f:
                json.dump(request.json, f)
            _, _, docx_path = convert_document(
                type('obj', (object,), {'name': json_path}), "docx")
            if not docx_path or not os.path.exists(docx_path):
                return jsonify({"error": "Error converting JSON to DOCX"}), 500
            return send_file(docx_path, as_attachment=True, download_name="converted.docx")
        except Exception as e:
            return jsonify({"error": str(e)}), 500

    def run_flask():
        """Serve the Flask API (blocking); run in a daemon thread."""
        app.run(host='0.0.0.0', port=5000)

    # Start Flask in a separate thread, then block on the Gradio UI.
    flask_thread = threading.Thread(target=run_flask)
    flask_thread.daemon = True
    flask_thread.start()
    demo.launch(share=True)