# Hugging Face Space: document format converter (Gradio UI + Flask REST API).
| import gradio as gr | |
| import pypandoc | |
| import os | |
| from pdf2docx import Converter | |
| from docx import Document | |
| from docx.table import _Cell | |
| from docx.shared import Inches, Pt, RGBColor | |
| from docx.enum.text import WD_ALIGN_PARAGRAPH, WD_COLOR_INDEX | |
| from docx.oxml.ns import qn | |
| import json | |
| import base64 | |
| import hashlib | |
| import sys | |
| import tempfile | |
| from flask import Flask, request, jsonify, send_file | |
| import threading | |
| import secrets | |
# texlive supplies the LaTeX engine pandoc needs for PDF output.
# -y keeps apt-get from blocking on (or aborting at) the interactive
# confirmation prompt when this runs in a non-interactive container.
os.system('sudo apt-get install -y texlive')


def ensure_pandoc_installed():
    """Make sure a pandoc binary is available, downloading one if needed.

    pypandoc raises OSError when no pandoc executable can be located, in
    which case we fetch a private copy via pypandoc's downloader.
    """
    try:
        # Check whether pandoc is already reachable on PATH
        pypandoc.get_pandoc_version()
        print("Pandoc is already installed and accessible.")
    except OSError:
        # Download pandoc if it is not present yet
        print("Pandoc not found, downloading...")
        pypandoc.download_pandoc()
        print("Pandoc downloaded successfully.")


# Ensure pandoc is installed before any conversion is attempted
ensure_pandoc_installed()
| # Daftar format yang didukung | |
| input_supported_formats = [data.upper() for data in sorted(list(pypandoc.get_pandoc_formats()[0]).append('PDF') or [ | |
| 'BIBLATEX', 'BIBTEX', 'BITS', 'COMMONMARK', 'COMMONMARK_X', 'CREOLE', 'CSLJSON', 'CSV', | |
| 'DJOT', 'DOCBOOK', 'DOCX', 'DOKUWIKI', 'ENDNOTEXML', 'EPUB', 'FB2', 'GFM', 'HADDOCK', | |
| 'HTML', 'IPYNB', 'JATS', 'JIRA', 'JSON', 'LATEX', 'MAN', 'MARKDOWN', 'MARKDOWN_GITHUB', | |
| 'MARKDOWN_MMD', 'MARKDOWN_PHPEXTRA', 'MARKDOWN_STRICT', 'MDOC', 'MEDIAWIKI', 'MUSE', | |
| 'NATIVE', 'ODT', 'OPML', 'ORG', 'PDF', 'POD', 'RIS', 'RST', 'RTF', 'T2T', 'TEXTILE', | |
| 'TIKIWIKI', 'TSV', 'TWIKI', 'TYPST', 'VIMWIKI' | |
| ]) if data not in ['PDF']] | |
# Output formats pandoc can write.  PDF is excluded because writing it
# requires a working LaTeX toolchain, which this deployment cannot guarantee.
_RAW_OUTPUT_FORMATS = [
    "ANSI", "ASCIIDOC", "ASCIIDOC_LEGACY", "ASCIIDOCTOR", "BEAMER", "BIBLATEX", "BIBTEX", "CHUNKEDHTML",
    "COMMONMARK", "COMMONMARK_X", "CONTEXT", "CSLJSON", "DJOT", "DOCBOOK", "DOCBOOK4", "DOCBOOK5",
    "DOCX", "DOKUWIKI", "DZSLIDES", "EPUB", "EPUB2", "EPUB3", "FB2", "GFM", "HADDOCK", "HTML",
    "HTML4", "HTML5", "ICML", "IPYNB", "JATS", "JATS_ARCHIVING", "JATS_ARTICLEAUTHORING",
    "JATS_PUBLISHING", "JIRA", "JSON", "LATEX", "MAN", "MARKDOWN", "MARKDOWN_GITHUB",
    "MARKDOWN_MMD", "MARKDOWN_PHPEXTRA", "MARKDOWN_STRICT", "MARKUA", "MEDIAWIKI", "MS",
    "MUSE", "NATIVE", "ODT", "OPENDOCUMENT", "OPML", "ORG", "PDF", "PLAIN", "PPTX", "REVEALJS",
    "RST", "RTF", "S5", "SLIDEOUS", "SLIDY", "TEI", "TEXINFO", "TEXTILE", "TYPST", "XWIKI", "ZIMWIKI"
]
output_supported_formats = sorted(
    fmt.upper() for fmt in _RAW_OUTPUT_FORMATS if fmt != 'PDF'
)
def convert_pdf_to_docx(pdf_file):
    """Convert *pdf_file* to a DOCX file next to it and return the new path.

    Uses pdf2docx; the whole page range is converted.  The Converter is
    closed even on failure — the original code leaked the open PDF handle.
    """
    output_docx = f"{os.path.splitext(pdf_file)[0]}.docx"
    cv = Converter(pdf_file)
    try:
        cv.convert(output_docx, start=0, end=None)
    finally:
        cv.close()  # pdf2docx requires an explicit close to release the PDF
    return output_docx
def get_preview(file_path):
    """Return a small self-contained HTML snippet previewing *file_path*.

    Text-like files (.txt/.md/.csv/.json) are shown escaped inside a <pre>;
    PDFs are embedded; DOCX files get their section headers and the first ~30
    paragraphs rendered.  Any error is reported inside the returned HTML
    rather than raised.
    """
    import html  # stdlib; neutralises markup contained in previewed content
    ext = os.path.splitext(file_path)[1].lower()
    try:
        if ext in ['.txt', '.md', '.csv', '.json']:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read(2000)  # preview only the first 2000 chars
            # Escape so file content cannot inject markup into the preview pane
            return f"<pre style='max-height:300px;overflow:auto'>{html.escape(content)}</pre>"
        elif ext == '.pdf':
            # Show the PDF inline using an HTML embed
            return f"<embed src='{file_path}' type='application/pdf' width='100%' height='400px' />"
        elif ext == '.docx':
            try:
                doc = Document(file_path)
                rendered = ""
                # Collect header paragraphs and header-table rows per section
                headers = []
                for section in doc.sections:
                    header_texts = []
                    for p in section.header.paragraphs:
                        if p.text.strip():
                            header_texts.append(p.text.strip())
                    for table in section.header.tables:
                        for row in table.rows:
                            row_text = " ".join(cell.text.strip() for cell in row.cells if cell.text.strip())
                            if row_text:
                                header_texts.append(row_text)
                    if header_texts:
                        headers.append(" | ".join(header_texts))
                if headers:
                    rendered += f"<div style='font-weight:bold;font-size:1.2em;margin-bottom:8px;'>{html.escape(' | '.join(headers))}</div>"
                para_count = 0
                for para in doc.paragraphs:
                    text = para.text.strip()
                    if text:
                        rendered += f"<p>{html.escape(text)}</p>"
                        para_count += 1
                        if para_count > 30:
                            rendered += "<p><i>Preview truncated...</i></p>"
                            break
                return f"<div style='max-height:300px;overflow:auto'>{rendered}</div>"
            except Exception as e:
                return f"<b>Error reading DOCX:</b> {e}"
        elif ext == '.doc':
            return f"<b>DOC file:</b> {os.path.basename(file_path)} (Preview not supported)"
        else:
            return f"<b>File:</b> {os.path.basename(file_path)} (Preview not supported)"
    except Exception as e:
        return f"<b>Error generating preview:</b> {e}"
def extract_runs(paragraph):
    """Serialise every run of *paragraph* into a list of plain dicts.

    Only attributes that are actually set are recorded (text is always
    present).  Enum-valued attributes (color theme, highlight) are stored
    via str() for JSON round-tripping.
    """
    serialised = []
    for run in paragraph.runs:
        info = {"text": run.text}
        for flag in ("bold", "italic", "underline"):
            if getattr(run, flag):
                info[flag] = True
        font = run.font
        if font:
            if font.size:
                info["font_size"] = font.size.pt
            if font.name:
                info["font_name"] = font.name
            # RGB colour wins over a theme colour when both are present
            if font.color:
                if font.color.rgb:
                    info["color"] = str(font.color.rgb)
                elif font.color.theme_color:
                    info["color_theme"] = str(font.color.theme_color)
            if hasattr(font, "highlight_color") and font.highlight_color:
                info["highlight"] = str(font.highlight_color)
        serialised.append(info)
    return serialised
# Classify paragraphs into heading / list / plain blocks by style name
def extract_paragraph_block(paragraph):
    """Serialise one paragraph into a typed block dict.

    Styles named "Heading N" become heading blocks, styles containing
    "List" become list items, everything else is a plain paragraph.
    """
    style_name = paragraph.style.name if paragraph.style else "Normal"
    alignment = str(paragraph.alignment) if paragraph.alignment else "left"
    if style_name.startswith("Heading"):
        # The heading level is the trailing number in the style name
        try:
            level = int(style_name.split()[-1])
        except Exception:
            level = 1
        return {
            "type": "heading",
            "level": level,
            "runs": extract_runs(paragraph),
            "alignment": alignment,
            "style": style_name,
        }
    if "List" in style_name:
        return {
            "type": "list_item",
            "list_type": "number" if "Number" in style_name else "bullet",
            "runs": extract_runs(paragraph),
            "alignment": alignment,
            "style": style_name,
        }
    return {
        "type": "paragraph",
        "runs": extract_runs(paragraph),
        "alignment": alignment,
        "style": style_name,
    }
# Paragraph + table extraction with spacing metadata
def extract_blocks(element, output_dir, image_prefix):
    """Serialise the paragraphs and tables of *element* into block dicts.

    output_dir / image_prefix are kept for interface compatibility with the
    other extractors; they are not used here.
    """
    collected = []
    for para in getattr(element, 'paragraphs', []):
        if not para.text.strip():
            continue  # skip empty paragraphs entirely
        block = extract_paragraph_block(para)
        fmt = para.paragraph_format
        if fmt:
            # Record explicit spacing so it can be restored on rebuild
            if fmt.space_before:
                block["space_before"] = fmt.space_before.pt
            if fmt.space_after:
                block["space_after"] = fmt.space_after.pt
            if fmt.line_spacing:
                block["line_spacing"] = fmt.line_spacing
        collected.append(block)
    for table in getattr(element, 'tables', []):
        collected.append(extract_table_block(table))
    return collected
def extract_table_block(table):
    """Serialise a table into nested lists: rows → cells → paragraph blocks."""
    serialised_rows = []
    for row in table.rows:
        cells = []
        for cell in row.cells:
            # Merged cells make python-docx repeat paragraph objects;
            # keep only the first occurrence of each
            seen_ids = set()
            unique = []
            for para in cell.paragraphs:
                key = id(para)
                if key in seen_ids:
                    continue
                seen_ids.add(key)
                unique.append(para)
            cells.append([extract_paragraph_block(p) for p in unique if p.text.strip()])
        serialised_rows.append(cells)
    return {"type": "table", "rows": serialised_rows}
def extract_images_from_doc(doc, output_dir, image_prefix):
    """Save every embedded image of *doc* into *output_dir*.

    Returns one image block per relationship, naming files by a short
    SHA-1 digest of their bytes so duplicates collapse to one name.
    """
    blocks = []
    for rel in doc.part.rels.values():
        if rel.reltype != 'http://schemas.openxmlformats.org/officeDocument/2006/relationships/image':
            continue
        blob = rel.target_part.blob
        digest = hashlib.sha1(blob).hexdigest()[:8]
        extension = rel.target_part.content_type.split('/')[-1]
        identifier = f"{image_prefix}_{digest}"
        filename = f"{identifier}.{extension}"
        with open(os.path.join(output_dir, filename), 'wb') as out:
            out.write(blob)
        blocks.append({
            "type": "image",
            "image_id": identifier,
            "image_format": extension,
            "path": filename,
        })
    return blocks
| def add_runs_to_paragraph(paragraph, runs): | |
| for run_info in runs: | |
| run = paragraph.add_run(run_info.get("text", "")) | |
| if run_info.get("bold"): run.bold = True | |
| if run_info.get("italic"): run.italic = True | |
| if run_info.get("underline"): run.underline = True | |
| if run_info.get("font_size"): run.font.size = Pt(run_info["font_size"]) | |
| if run_info.get("font_name"): run.font.name = run_info["font_name"] | |
| # Set color (RGB or theme) | |
| if run_info.get("color"): | |
| try: | |
| run.font.color.rgb = RGBColor.from_string(run_info["color"].replace("#", "")) | |
| except Exception: | |
| pass | |
| elif run_info.get("color_theme"): | |
| try: | |
| run.font.color.theme_color = int(run_info["color_theme"]) | |
| except Exception: | |
| pass | |
| if run_info.get("highlight"): | |
| try: | |
| if run_info["highlight"].isdigit(): | |
| run.font.highlight_color = int(run_info["highlight"]) | |
| else: | |
| run.font.highlight_color = WD_COLOR_INDEX[run_info["highlight"]] | |
| except Exception: | |
| pass | |
# Rebuild one serialised block (heading/list/paragraph/table/image) onto a container
def _apply_alignment(para, block):
    """Map the serialised alignment string onto *para* (defaults to left)."""
    align = block.get("alignment", "left")
    if align == "center":
        para.alignment = WD_ALIGN_PARAGRAPH.CENTER
    elif align == "right":
        para.alignment = WD_ALIGN_PARAGRAPH.RIGHT
    else:
        para.alignment = WD_ALIGN_PARAGRAPH.LEFT


def _apply_spacing(para, block):
    """Restore space-before/after (points) and line spacing when recorded."""
    if "space_before" in block:
        para.paragraph_format.space_before = Pt(block["space_before"])
    if "space_after" in block:
        para.paragraph_format.space_after = Pt(block["space_after"])
    if "line_spacing" in block:
        para.paragraph_format.line_spacing = block["line_spacing"]


def add_block_to_doc(doc, block, image_dir):
    """Append one serialised block to *doc* (a Document, _Cell, or header/footer).

    Images are resolved relative to *image_dir*; table cells recurse through
    this function for their nested paragraph blocks.
    """
    kind = block["type"]
    if kind == "heading":
        # Add an EMPTY heading and rebuild the text from runs: passing the
        # joined text to add_heading() and then re-adding the runs (as the
        # original code did) duplicated the heading text.
        para = doc.add_heading("", level=block.get("level", 1))
        add_runs_to_paragraph(para, block.get("runs", []))
        _apply_alignment(para, block)
        _apply_spacing(para, block)
    elif kind == "list_item":
        style = "List Number" if block.get("list_type") == "number" else "List Bullet"
        para = doc.add_paragraph(style=style)
        add_runs_to_paragraph(para, block.get("runs", []))
        _apply_alignment(para, block)
        _apply_spacing(para, block)
    elif kind == "paragraph":
        para = doc.add_paragraph()
        add_runs_to_paragraph(para, block.get("runs", []))
        _apply_alignment(para, block)
        _apply_spacing(para, block)
    elif kind == "table":
        rows = block.get("rows", [])
        if rows:
            try:
                table_width = doc.sections[0].page_width
            except Exception:
                table_width = Inches(6)
            # Document.add_table() accepts no width kwarg (only cell/header
            # containers do) — the original call raised TypeError for every
            # top-level table.  Retry without width instead of crashing.
            try:
                table = doc.add_table(rows=len(rows), cols=len(rows[0]), width=table_width)
            except TypeError:
                table = doc.add_table(rows=len(rows), cols=len(rows[0]))
            for i, row in enumerate(rows):
                for j, cell_blocks in enumerate(row):
                    cell = table.cell(i, j)
                    for para_block in cell_blocks:
                        add_block_to_doc(cell, para_block, image_dir)
    elif kind == "image":
        img_path = os.path.join(image_dir, block["path"])
        if os.path.exists(img_path):
            width = block.get("width")
            height = block.get("height")
            if width and height:
                # Stored dimensions appear to be pixels at 96 DPI — TODO confirm
                doc.add_picture(img_path, width=Inches(width / 96), height=Inches(height / 96))
            else:
                doc.add_picture(img_path)
def add_blocks_to_doc(doc, blocks, image_dir):
    """Append a list of serialised blocks to *doc*.

    Containers that expose add_paragraph (Document, _Cell, header/footer)
    — or that carry is_header/is_footer markers — get blocks added
    directly; anything else is attempted best-effort with errors ignored.
    """
    direct = (
        hasattr(doc, 'is_header')
        or hasattr(doc, 'is_footer')
        or hasattr(doc, 'add_paragraph')
    )
    for block in blocks:
        if direct:
            add_block_to_doc(doc, block, image_dir)
        else:
            # Unknown container type — try anyway, swallowing failures
            try:
                add_block_to_doc(doc, block, image_dir)
            except Exception:
                pass
def extract_all_sections(doc, output_dir, image_prefix):
    """Extract header/footer blocks for every section of *doc*.

    Returns one dict per section mapping part name (header, footer, and
    their first/even-page variants) to its serialised blocks; parts that
    are absent or empty are omitted.
    """
    part_names = (
        "header", "first_page_header", "even_page_header",
        "footer", "first_page_footer", "even_page_footer",
    )
    extracted = []
    for idx, section in enumerate(doc.sections):
        entry = {}
        for name in part_names:
            part = getattr(section, name, None)
            if part:
                entry[name] = extract_blocks(part, output_dir, f"{image_prefix}_sec{idx}_{name}")
        extracted.append(entry)
    return extracted
def _docx_to_json(orig_file_path, output_file, base_name):
    """Serialise a DOCX file into the custom JSON block format at *output_file*."""
    doc = Document(orig_file_path)
    output_dir = os.path.dirname(output_file)
    image_blocks = extract_images_from_doc(doc, output_dir, base_name)
    sections = extract_all_sections(doc, output_dir, base_name)
    body_blocks = extract_blocks(doc, output_dir, base_name)
    doc_json = {
        "sections": sections,
        "body": body_blocks + image_blocks,
        "metadata": {
            "title": getattr(doc.core_properties, 'title', ''),
            "author": getattr(doc.core_properties, 'author', ''),
        },
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(doc_json, f, ensure_ascii=False, indent=2)


def _json_to_docx(orig_file_path, output_file):
    """Rebuild a DOCX document at *output_file* from the custom JSON format."""
    with open(orig_file_path, 'r', encoding='utf-8') as f:
        doc_json = json.load(f)
    doc = Document()
    image_dir = os.path.dirname(orig_file_path)
    if "sections" in doc_json:
        # Make sure the new document has as many sections as the JSON describes
        while len(doc.sections) < len(doc_json["sections"]):
            doc.add_section()
        part_names = (
            "header", "first_page_header", "even_page_header",
            "footer", "first_page_footer", "even_page_footer",
        )
        for idx, sec in enumerate(doc_json["sections"]):
            section = doc.sections[idx]
            for name in part_names:
                if name not in sec:
                    continue
                part = getattr(section, name, None)
                if part:
                    # Drop the default empty paragraph(s) before rebuilding
                    for p in list(part.paragraphs):
                        p._element.getparent().remove(p._element)
                    add_blocks_to_doc(part, sec[name], image_dir)
    if "body" in doc_json:
        add_blocks_to_doc(doc, doc_json["body"], image_dir)
    meta = doc_json.get("metadata", {})
    if "title" in meta:
        doc.core_properties.title = meta["title"]
    if "author" in meta:
        doc.core_properties.author = meta["author"]
    doc.save(output_file)


def convert_document(doc_file, target_format):
    """Convert *doc_file* (path, or object with a .name path) to *target_format*.

    PDFs are pre-converted to DOCX via pdf2docx.  DOCX↔JSON uses the custom
    block serialisation above; every other pair goes through pandoc.

    Returns (input_preview_html, output_preview_html, output_path) on
    success and (error_message, None, None) on failure — the original code
    put the unsupported-type error in the second slot, inconsistent with
    the except branch.
    """
    try:
        target_format = target_format.lower()
        # Accept either a Gradio file object (has .name) or a plain path
        if hasattr(doc_file, 'name'):
            orig_file_path = doc_file.name
        elif isinstance(doc_file, str):
            orig_file_path = doc_file
        else:
            return "Error: Unsupported file type.", None, None
        # PDFs are not readable by pandoc or python-docx: convert first
        if orig_file_path.lower().endswith('.pdf'):
            print("Converting PDF to DOCX...")
            orig_file_path = convert_pdf_to_docx(orig_file_path)
            print("PDF converted to DOCX.")
        base_name = os.path.splitext(os.path.basename(orig_file_path))[0]
        output_file = f"docgen_{base_name}.{target_format}"
        if orig_file_path.lower().endswith('.docx') and target_format == 'json':
            _docx_to_json(orig_file_path, output_file, base_name)
        elif orig_file_path.lower().endswith('.json') and target_format == 'docx':
            _json_to_docx(orig_file_path, output_file)
        else:
            # Everything else goes through pandoc
            pypandoc.convert_file(
                orig_file_path,
                target_format,
                outputfile=output_file,
            )
        return get_preview(orig_file_path), get_preview(output_file), output_file
    except Exception as e:
        return f"Error: {e}", None, None
def parity_check(docx_path):
    """Round-trip *docx_path* through JSON and back, then diff the blocks.

    Converts DOCX -> JSON -> DOCX using convert_document, re-extracts both
    documents with the same block serialisation, and compares.  Returns True
    when the structures are identical; prints a unified diff and returns
    False otherwise.
    """
    import tempfile
    print(f"[Parity Check] Testing round-trip for: {docx_path}")
    class FileLike:  # Fake file-like for CLI
        def __init__(self, name): self.name = name
    # DOCX -> JSON
    _, _, json_out = convert_document(FileLike(docx_path), 'json')
    if not json_out or not os.path.exists(json_out):
        print("Failed to produce JSON from DOCX.")
        return False
    # JSON -> DOCX
    _, _, docx_out = convert_document(FileLike(json_out), 'docx')
    if not docx_out or not os.path.exists(docx_out):
        print("Failed to produce DOCX from JSON.")
        return False
    def extract_all_sections_for_parity(docx_path):
        # Re-extract headers/footers and body the same way convert_document does,
        # so both documents are compared on identical serialisations.
        doc = Document(docx_path)
        sections = []
        for idx, section in enumerate(doc.sections):
            sec = {}
            for htype, attr in [("header", "header"), ("first_page_header", "first_page_header"), ("even_page_header", "even_page_header"),
                                ("footer", "footer"), ("first_page_footer", "first_page_footer"), ("even_page_footer", "even_page_footer")]:
                part = getattr(section, attr, None)
                if part:
                    sec[htype] = extract_blocks(part, os.path.dirname(docx_path), f"sec{idx}_{htype}")
            sections.append(sec)
        body = extract_blocks(doc, os.path.dirname(docx_path), os.path.splitext(os.path.basename(docx_path))[0])
        return {"sections": sections, "body": body}
    orig = extract_all_sections_for_parity(docx_path)
    roundtrip = extract_all_sections_for_parity(docx_out)
    import difflib, pprint
    def blocks_to_str(blocks):
        # Stable pretty-printed form used for line-by-line diffing
        return pprint.pformat(blocks, width=120)
    if orig == roundtrip:
        print("[Parity Check] PASS: Round-trip blocks are identical!")
        return True
    else:
        print("[Parity Check] FAIL: Differences found.")
        # Compare per section
        for idx, (orig_sec, round_sec) in enumerate(zip(orig["sections"], roundtrip["sections"])):
            if orig_sec != round_sec:
                print(f"Section {idx} header/footer mismatch:")
                diff = difflib.unified_diff(blocks_to_str(orig_sec).splitlines(), blocks_to_str(round_sec).splitlines(), fromfile='original', tofile='roundtrip', lineterm='')
                print('\n'.join(diff))
        if orig["body"] != roundtrip["body"]:
            print("Body mismatch:")
            diff = difflib.unified_diff(blocks_to_str(orig["body"]).splitlines(), blocks_to_str(roundtrip["body"]).splitlines(), fromfile='original', tofile='roundtrip', lineterm='')
            print('\n'.join(diff))
        return False
# Gradio UI: upload -> JSON preview; dropdown -> converted download
with gr.Blocks(css="footer {visibility: hidden}") as demo:
    gr.Markdown("# Document Format Converter\nUpload a document and preview as JSON. Select a format to download in another format.")
    with gr.Row():
        with gr.Column():
            input_file = gr.File(label="Upload Document", file_types=[f'.{ext.lower()}' for ext in input_supported_formats])
            input_preview = gr.HTML(label="JSON Preview")
        with gr.Column():
            output_format = gr.Dropdown(label="Download As...", choices=output_supported_formats, value="DOCX")
            format_label = gr.Markdown("Previewing as: DOCX")
            output_preview = gr.HTML(label="Output Preview")
            output_file = gr.File(label="Download Converted Document", visible=True)
    json_state = gr.State()        # raw JSON text of the last upload
    orig_file_state = gr.State()   # path of the last uploaded file

    def upload_and_preview(doc_file):
        """Convert an upload to JSON and return (preview_html, json_text, path)."""
        _, _, json_path = convert_document(doc_file, "json")
        # Handle conversion failure
        if not json_path or not os.path.exists(json_path):
            error_msg = "Error converting document to JSON."
            return f"<pre style='max-height:300px;overflow:auto'>{error_msg}</pre>", "", doc_file.name
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                json_content = f.read()
        except Exception as e:
            error_msg = f"Error reading JSON: {e}"
            return f"<pre style='max-height:300px;overflow:auto'>{error_msg}</pre>", "", doc_file.name
        preview_html = f"<pre style='max-height:300px;overflow:auto'>{json_content[:4000]}</pre>"
        return preview_html, json_content, doc_file.name

    def convert_and_preview(orig_file_path, output_format):
        """Convert the stored upload to the chosen format; return label, preview, path."""
        class F:
            name = orig_file_path
        _, _, out_path = convert_document(F(), output_format.lower())
        # convert_document returns None for the path on failure; the original
        # code crashed here inside get_preview(None).
        if not out_path:
            return f"Previewing as: {output_format}", "<b>Error converting document.</b>", None
        preview = get_preview(out_path)
        return f"Previewing as: {output_format}", preview, out_path

    input_file.upload(upload_and_preview, inputs=input_file, outputs=[input_preview, json_state, orig_file_state])
    output_format.change(convert_and_preview, inputs=[orig_file_state, output_format], outputs=[format_label, output_preview, output_file])
if __name__ == "__main__":
    # CLI mode: `python app.py --parity-check file.docx` runs the round-trip test
    if len(sys.argv) == 3 and sys.argv[1] == "--parity-check":
        parity_check(sys.argv[2])
        sys.exit(0)
    # Use a caller-supplied key when present, otherwise generate a random one
    API_KEY = os.environ.get('API_KEY', secrets.token_urlsafe(32))
    print(f"API Key: {API_KEY}")  # Print the API key when the app starts
    # Flask app serving the REST API alongside the Gradio UI
    app = Flask(__name__)

    def check_api_key():
        """Return True when the request carries the correct X-API-Key header."""
        provided_key = request.headers.get('X-API-Key')
        if not provided_key or provided_key != API_KEY:
            return False
        return True

    # NOTE(review): these two handlers were defined but never registered with
    # Flask, so the API was unreachable.  Route paths chosen here — confirm
    # they match whatever clients expect.
    @app.route('/api/docx-to-json', methods=['POST'])
    def api_docx_to_json():
        """POST a DOCX (multipart field 'file'); respond with its JSON form."""
        if not check_api_key():
            return jsonify({"error": "Invalid or missing API key"}), 401
        if 'file' not in request.files:
            return jsonify({"error": "No file part"}), 400
        file = request.files['file']
        if file.filename == '':
            return jsonify({"error": "No selected file"}), 400
        if not file.filename.lower().endswith('.docx'):
            return jsonify({"error": "File must be a DOCX document"}), 400
        # Save the uploaded file to a fresh temp dir before converting
        temp_dir = tempfile.mkdtemp()
        file_path = os.path.join(temp_dir, file.filename)
        file.save(file_path)
        try:
            _, _, json_path = convert_document(type('obj', (object,), {'name': file_path}), "json")
            if not json_path or not os.path.exists(json_path):
                return jsonify({"error": "Error converting document to JSON"}), 500
            with open(json_path, "r", encoding="utf-8") as f:
                json_content = json.load(f)
            return jsonify(json_content)
        except Exception as e:
            return jsonify({"error": str(e)}), 500

    @app.route('/api/json-to-docx', methods=['POST'])
    def api_json_to_docx():
        """POST a JSON document body; respond with the rebuilt DOCX download."""
        if not check_api_key():
            return jsonify({"error": "Invalid or missing API key"}), 401
        if not request.is_json:
            return jsonify({"error": "Request must be JSON"}), 400
        try:
            # Persist the JSON body so convert_document can read it by path
            temp_dir = tempfile.mkdtemp()
            json_path = os.path.join(temp_dir, "document.json")
            with open(json_path, "w", encoding="utf-8") as f:
                json.dump(request.json, f)
            _, _, docx_path = convert_document(type('obj', (object,), {'name': json_path}), "docx")
            if not docx_path or not os.path.exists(docx_path):
                return jsonify({"error": "Error converting JSON to DOCX"}), 500
            return send_file(docx_path, as_attachment=True, download_name="converted.docx")
        except Exception as e:
            return jsonify({"error": str(e)}), 500

    def run_flask():
        """Serve the REST API on port 5000 (runs on a daemon thread)."""
        app.run(host='0.0.0.0', port=5000)

    # Start Flask in a background thread so Gradio can own the main thread
    flask_thread = threading.Thread(target=run_flask)
    flask_thread.daemon = True
    flask_thread.start()
    # Start the Gradio UI (blocks)
    demo.launch(share=True)