|
|
import gradio as gr |
|
|
import os |
|
|
import tempfile |
|
|
from datetime import datetime |
|
|
import pandas as pd |
|
|
import json |
|
|
import unicodedata |
|
|
import pytesseract |
|
|
from PIL import Image |
|
|
|
|
|
|
|
|
from docling.document_converter import DocumentConverter, PdfFormatOption |
|
|
from docling.datamodel.pipeline_options import PdfPipelineOptions |
|
|
from docling.datamodel.base_models import InputFormat |
|
|
|
|
|
|
|
|
try: |
|
|
from ftfy import fix_text |
|
|
def _fix_text(s: str) -> str: |
|
|
return fix_text(s) |
|
|
except ImportError: |
|
|
def _fix_text(s: str) -> str: |
|
|
return s |
|
|
|
|
|
try: |
|
|
from langdetect import detect, DetectorFactory |
|
|
DetectorFactory.seed = 0 |
|
|
def _detect_lang(text: str) -> str | None: |
|
|
try: |
|
|
return detect(text) |
|
|
except Exception: |
|
|
return None |
|
|
except ImportError: |
|
|
def _detect_lang(text: str) -> str | None: |
|
|
return None |
|
|
|
|
|
def normalize_text(s: str) -> str:
    """Return *s* passed through ``_fix_text`` and normalized to Unicode NFC."""
    return unicodedata.normalize("NFC", _fix_text(s))
|
|
|
|
|
|
|
|
# Maps the two-letter code returned by _detect_lang to the three-letter code
# that guess_lang_code hands on as the OCR language.
LANG_MAP = {
    "pt": "por",
    "es": "spa",
    "en": "eng",
    "fr": "fra",
    "de": "deu",
    "it": "ita",
    "nl": "nld",
    "pl": "pol",
    "tr": "tur",
    "cs": "ces",
    "ru": "rus",
    "uk": "ukr",
    "el": "ell",
    "ro": "ron",
    "hu": "hun",
    "sv": "swe",
    "da": "dan",
    "fi": "fin",
    "no": "nor",
    "ca": "cat",
    "gl": "glg",
}
|
|
|
|
|
def guess_lang_code(text: str) -> str | None: |
|
|
lang = _detect_lang(text) if text and text.strip() else None |
|
|
return LANG_MAP.get(lang) if lang else None |
|
|
|
|
|
def process_image_with_ocr(image_path: str) -> str:
    """Extract text from an image file using Tesseract OCR.

    OCR runs twice: a first pass with ``por+eng`` produces a text sample, the
    sample is fed to ``guess_lang_code``, and a second pass is run with the
    guessed language (falling back to ``"por"`` when nothing is guessed).

    Args:
        image_path: Path of the image to read.

    Returns:
        The text of the second OCR pass after ``normalize_text``.

    Raises:
        Exception: Any failure while opening the image or running OCR, with
            the message prefixed by ``OCR processing failed:`` and the
            original exception attached as ``__cause__``.
    """
    try:
        # The context manager releases the image's file handle even when one
        # of the OCR passes raises.
        with Image.open(image_path) as img:
            sample = pytesseract.image_to_string(img, lang='por+eng')
            detected_lang = guess_lang_code(sample) or "por"
            text = pytesseract.image_to_string(img, lang=detected_lang)
        return normalize_text(text)
    except Exception as e:
        # Same exception type and message as before; `from e` keeps the
        # original traceback available for debugging.
        raise Exception(f"OCR processing failed: {str(e)}") from e
|
|
|
|
|
# Mojibake detection tables. Written with escapes so that a re-encoding
# accident in this source file cannot silently change them.
_MOJIBAKE_LEADS = "\u00c3\u00c2"  # "Ã" and "Â": UTF-8 lead bytes 0xC3 / 0xC2 mis-decoded
_UTF8_CONTINUATION_BYTES = bytes(range(0x80, 0xC0))
# What a UTF-8 continuation byte turns into when decoded as Latin-1 or
# Windows-1252 (bytes that cp1252 leaves undefined are skipped).
_MOJIBAKE_TRAILS = frozenset(
    _UTF8_CONTINUATION_BYTES.decode("latin-1")
    + _UTF8_CONTINUATION_BYTES.decode("cp1252", errors="ignore")
)
_GARBLE_THRESHOLD = 5


def looks_garbled(text: str) -> bool:
    """Heuristically decide whether extracted *text* is unusable.

    Text is treated as garbled when it is empty, when it has fewer than 100
    characters after stripping, or when it contains more than five encoding
    artifacts. An artifact is either a U+FFFD replacement character or a
    mojibake pair: "Ã" or "Â" immediately followed by a character that a
    mis-decoded UTF-8 continuation byte produces (for example "Ã§", "Ã£").

    A bare "Ã", "Â" or "ª" is deliberately not counted: those are ordinary
    letters in Portuguese and Spanish text ("NÃO", "1ª"), and counting them
    marked valid documents as garbled.

    Args:
        text: Text to inspect; ``None`` or empty counts as garbled.

    Returns:
        ``True`` when the text looks unusable, ``False`` otherwise.
    """
    if not text or len(text.strip()) < 100:
        return True

    hits = text.count("\ufffd")
    for lead in _MOJIBAKE_LEADS:
        pos = text.find(lead)
        # Stop scanning as soon as the threshold is exceeded.
        while pos != -1 and hits <= _GARBLE_THRESHOLD:
            # Slicing (not indexing) so a lead at the very end yields "".
            if text[pos + 1:pos + 2] in _MOJIBAKE_TRAILS:
                hits += 1
            pos = text.find(lead, pos + 1)

    return hits > _GARBLE_THRESHOLD
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Module-level Docling converter for PDFs with OCR switched off.
# The former ``ocr_model="tesseract"`` argument was dropped: it is not a
# PdfPipelineOptions field (the OCR engine is selected through
# ``ocr_options``), and no engine is used while ``do_ocr`` is False.
pdf_options = PdfPipelineOptions(do_ocr=False)
format_options = {InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)}
docling_converter = DocumentConverter(format_options=format_options)
|
|
|
|
|
|
|
|
def convert_with_strategy(path: str):
    """Convert *path* with Docling, retrying PDFs with Tesseract OCR if needed.

    The file is first converted without OCR. If the exported text passes
    ``looks_garbled`` as usable, that result is returned. Otherwise, for PDF
    inputs only, the file is converted again with OCR enabled, using the
    language guessed from the first pass (``"por"`` when nothing is guessed).

    Args:
        path: Path of the document to convert.

    Returns:
        The Docling conversion result of whichever pass was kept.
    """
    # Reuse the module-level no-OCR converter instead of building an
    # identically configured one on every call.
    res = docling_converter.convert(path)
    text_sample = normalize_text(res.document.export_to_text())

    if not looks_garbled(text_sample):
        return res

    # The OCR settings below are registered for InputFormat.PDF only, so a
    # second pass over any other format would just repeat the first one.
    if os.path.splitext(path)[1].lower() != ".pdf":
        return res

    # Imported here so that only the OCR fallback depends on it.
    from docling.datamodel.pipeline_options import TesseractCliOcrOptions

    detected = guess_lang_code(text_sample) or "por"
    # The OCR engine and its languages are configured through `ocr_options`;
    # PdfPipelineOptions has no `ocr_model` / `ocr_languages` fields, so the
    # detected language previously never reached Tesseract.
    # NOTE(review): a PDF whose embedded text layer is itself garbled may
    # additionally need `force_full_page_ocr=True` - confirm on real samples.
    ocr_opts = PdfPipelineOptions(
        do_ocr=True,
        ocr_options=TesseractCliOcrOptions(lang=[detected]),
    )
    ocr_converter = DocumentConverter(
        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=ocr_opts)}
    )
    return ocr_converter.convert(path)
|
|
|
|
|
def process_file(file): |
|
|
""" |
|
|
Process an uploaded file and return 4 files: |
|
|
1. Docling document (JSON) |
|
|
2. Text file |
|
|
3. Markdown file |
|
|
4. HTML file |
|
|
Supports: PDF, DOCX, XLSX, XLS, CSV, PPTX, TXT, and IMAGE formats (PNG, JPG, JPEG, BMP, TIFF) |
|
|
""" |
|
|
if file is None: |
|
|
return None, None, None, None, "❌ Error: Please upload a file." |
|
|
|
|
|
|
|
|
path = file.name if hasattr(file, "name") else str(file) |
|
|
ext = os.path.splitext(path)[1].lower() |
|
|
|
|
|
docling_direct = {".pdf", ".docx", ".xlsx", ".pptx"} |
|
|
to_xlsx_first = {".csv", ".xls"} |
|
|
image_formats = {".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif"} |
|
|
|
|
|
try: |
|
|
|
|
|
if ext in image_formats: |
|
|
text_content = process_image_with_ocr(path) |
|
|
|
|
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
|
|
base_filename = f"document_{timestamp}" |
|
|
|
|
|
|
|
|
docling_json_path = f"{base_filename}_docling.json" |
|
|
docling_dict = { |
|
|
"type": "image_document_ocr", |
|
|
"content": text_content, |
|
|
"metadata": { |
|
|
"source": os.path.basename(path), |
|
|
"timestamp": timestamp, |
|
|
"format": ext |
|
|
} |
|
|
} |
|
|
with open(docling_json_path, "w", encoding="utf-8") as f: |
|
|
json.dump(docling_dict, f, indent=2, ensure_ascii=False) |
|
|
|
|
|
|
|
|
txt_path = f"{base_filename}.txt" |
|
|
with open(txt_path, "w", encoding="utf-8") as f: |
|
|
f.write(text_content) |
|
|
|
|
|
|
|
|
md_path = f"{base_filename}.md" |
|
|
with open(md_path, "w", encoding="utf-8") as f: |
|
|
f.write(f"# Document (OCR from Image)\n\n{text_content}") |
|
|
|
|
|
|
|
|
html_path = f"{base_filename}.html" |
|
|
html_content = f"""<!DOCTYPE html> |
|
|
<html lang="en"> |
|
|
<head> |
|
|
<meta charset="UTF-8"> |
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0"> |
|
|
<title>Document</title> |
|
|
</head> |
|
|
<body> |
|
|
<h1>Document (OCR from Image)</h1> |
|
|
<pre>{text_content}</pre> |
|
|
</body> |
|
|
</html>""" |
|
|
with open(html_path, "w", encoding="utf-8") as f: |
|
|
f.write(html_content) |
|
|
|
|
|
success_message = "✅ Successfully processed image with OCR! 4 files generated." |
|
|
return docling_json_path, txt_path, md_path, html_path, success_message |
|
|
|
|
|
|
|
|
elif ext in to_xlsx_first: |
|
|
if ext == ".csv": |
|
|
df = pd.read_csv(path) |
|
|
else: |
|
|
df = pd.read_excel(path) |
|
|
|
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix=".xlsx") as tmp: |
|
|
df.to_excel(tmp.name, index=False) |
|
|
path = tmp.name |
|
|
|
|
|
|
|
|
if ext in docling_direct or ext in to_xlsx_first: |
|
|
result = convert_with_strategy(path) |
|
|
|
|
|
|
|
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
|
|
base_filename = f"document_{timestamp}" |
|
|
|
|
|
|
|
|
docling_json_path = f"{base_filename}_docling.json" |
|
|
with open(docling_json_path, "w", encoding="utf-8") as f: |
|
|
json.dump(result.document.export_to_dict(), f, indent=2, ensure_ascii=False) |
|
|
|
|
|
|
|
|
text_out = normalize_text(result.document.export_to_text()) |
|
|
md_out = normalize_text(result.document.export_to_markdown()) |
|
|
html_out = normalize_text(result.document.export_to_html()) |
|
|
|
|
|
|
|
|
txt_path = f"{base_filename}.txt" |
|
|
with open(txt_path, "w", encoding="utf-8") as f: |
|
|
f.write(text_out) |
|
|
|
|
|
|
|
|
md_path = f"{base_filename}.md" |
|
|
with open(md_path, "w", encoding="utf-8") as f: |
|
|
f.write(md_out) |
|
|
|
|
|
|
|
|
html_path = f"{base_filename}.html" |
|
|
with open(html_path, "w", encoding="utf-8") as f: |
|
|
f.write(html_out) |
|
|
|
|
|
success_message = "✅ Successfully processed file! 4 files generated." |
|
|
return docling_json_path, txt_path, md_path, html_path, success_message |
|
|
|
|
|
elif ext == ".txt": |
|
|
|
|
|
with open(path, "r", encoding="utf-8") as f: |
|
|
text_content = f.read() |
|
|
|
|
|
|
|
|
text_content = normalize_text(text_content) |
|
|
|
|
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
|
|
base_filename = f"document_{timestamp}" |
|
|
|
|
|
|
|
|
docling_json_path = f"{base_filename}_docling.json" |
|
|
docling_dict = { |
|
|
"type": "text_document", |
|
|
"content": text_content, |
|
|
"metadata": { |
|
|
"source": os.path.basename(path), |
|
|
"timestamp": timestamp |
|
|
} |
|
|
} |
|
|
with open(docling_json_path, "w", encoding="utf-8") as f: |
|
|
json.dump(docling_dict, f, indent=2, ensure_ascii=False) |
|
|
|
|
|
|
|
|
txt_path = f"{base_filename}.txt" |
|
|
with open(txt_path, "w", encoding="utf-8") as f: |
|
|
f.write(text_content) |
|
|
|
|
|
|
|
|
md_path = f"{base_filename}.md" |
|
|
with open(md_path, "w", encoding="utf-8") as f: |
|
|
f.write(f"# Document\n\n{text_content}") |
|
|
|
|
|
|
|
|
html_path = f"{base_filename}.html" |
|
|
html_content = f"""<!DOCTYPE html> |
|
|
<html lang="en"> |
|
|
<head> |
|
|
<meta charset="UTF-8"> |
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0"> |
|
|
<title>Document</title> |
|
|
</head> |
|
|
<body> |
|
|
<pre>{text_content}</pre> |
|
|
</body> |
|
|
</html>""" |
|
|
with open(html_path, "w", encoding="utf-8") as f: |
|
|
f.write(html_content) |
|
|
|
|
|
success_message = "✅ Successfully processed text file! 4 files generated." |
|
|
return docling_json_path, txt_path, md_path, html_path, success_message |
|
|
|
|
|
else: |
|
|
error_message = f"❌ Unsupported file format: {ext}" |
|
|
return None, None, None, None, error_message |
|
|
|
|
|
except Exception as e: |
|
|
error_message = f"❌ Error processing file: {str(e)}" |
|
|
return None, None, None, None, error_message |
|
|
|
|
|
def reset_form():
    """Return cleared values for the reset button's six output components.

    Five ``None`` values followed by an empty string.
    """
    cleared_components = (None,) * 5
    return cleared_components + ("",)
|
|
|
|
|
|
|
|
# Gradio UI: upload widget and buttons on top, the four generated files below.
# NOTE(review): the source had lost its indentation; the nesting of rows and
# columns here is reconstructed - confirm it matches the intended layout.
with gr.Blocks(title="LLM-Ready Document Converter") as app:
    gr.Markdown("# 📄 LLM-Ready Document Converter")
    gr.Markdown("**HOWTO** : Upload a document or image and get 4 output files: Docling JSON, TXT, Markdown, and HTML")
    gr.Markdown("**EXPLANATION** : This app transforms various document formats (like TXT, standard and scanned PDFs, DOCX, PPT, CSV, XLS, XLSX) and **images (PNG, JPG, JPEG, BMP, TIFF)** into structured, machine-readable outputs optimized for Large Language Models (LLMs). For images, it uses OCR (Optical Character Recognition) to extract text. For all input documents, it extracts and converts content into clean formats such as DocLing JSON (for document structure), plain text, Markdown, and HTML making it easier for AI models to process, analyze, or generate responses from complex documents without losing key details like layout or formatting. Essentially, it's a bridge between raw files and AI-ready data.")

    # Extensions offered by the upload widget.
    accepted_types = [
        ".pdf", ".txt", ".docx", ".xlsx", ".xls", ".csv", ".pptx",
        ".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".tif",
    ]

    with gr.Row(), gr.Column():
        file_input = gr.File(label="Upload Document", file_types=accepted_types)

        with gr.Row():
            submit_btn = gr.Button("Convert Document", variant="primary")
            reset_btn = gr.Button("Reset")

        status_output = gr.Markdown(label="Status")

    with gr.Row():
        with gr.Column():
            docling_output = gr.File(label="Docling Document (JSON)")
        with gr.Column():
            txt_output = gr.File(label="Text File")

    with gr.Row():
        with gr.Column():
            md_output = gr.File(label="Markdown File")
        with gr.Column():
            html_output = gr.File(label="HTML File")

    # The four download slots, in the order process_file returns their paths.
    result_outputs = [docling_output, txt_output, md_output, html_output]

    submit_btn.click(
        fn=process_file,
        inputs=[file_input],
        outputs=[*result_outputs, status_output],
    )

    # reset_form returns one value per component listed here.
    reset_btn.click(
        fn=reset_form,
        outputs=[file_input, *result_outputs, status_output],
    )


if __name__ == "__main__":
    app.launch(share=True)