"""
Doctra - Document Parser for Hugging Face Spaces
This is a Hugging Face Spaces deployment of the Doctra document parsing library.
It provides a comprehensive web interface for PDF parsing, table/chart extraction,
image restoration, and enhanced document processing.
"""
import os
import shutil
import tempfile
import re
import html as _html
import base64
import json
from pathlib import Path
from typing import Optional, Tuple, List, Dict, Any
import gradio as gr
import pandas as pd
# Mock google.genai to avoid import errors: the Doctra imports below can pull
# in the Gemini SDK, which may not be installed in this Space.
import sys
from unittest.mock import MagicMock
# Register the mocks in sys.modules BEFORE importing Doctra so that any
# `import google.genai` / `import google.genai.types` inside Doctra resolves
# to these MagicMock stand-ins instead of raising ImportError.
mock_google_genai = MagicMock()
sys.modules['google.genai'] = mock_google_genai
sys.modules['google.genai.types'] = MagicMock()
# Now import Doctra components; if the library (or one of its submodules) is
# unavailable, fall back to None placeholders so the app can still start.
try:
    from doctra.parsers.structured_pdf_parser import StructuredPDFParser
    from doctra.parsers.table_chart_extractor import ChartTablePDFParser
    from doctra.parsers.enhanced_pdf_parser import EnhancedPDFParser
    from doctra.ui.docres_wrapper import DocResUIWrapper
    from doctra.utils.pdf_io import render_pdf_to_images
except ImportError as e:
    print(f"Warning: Some Doctra components may not be available: {e}")
    # Create mock classes if imports fail — presumably checked for None by
    # downstream handlers before use; verify at each call site.
    StructuredPDFParser = None
    ChartTablePDFParser = None
    EnhancedPDFParser = None
    DocResUIWrapper = None
    render_pdf_to_images = None
# UI Theme and Styling Constants
# Gradio "Soft" theme with indigo primary / slate neutral hues.
THEME = gr.themes.Soft(primary_hue="indigo", neutral_hue="slate")
# Raw CSS injected into the Gradio app: full-width layout, scrollable image
# galleries, rendered page content (inline images + markdown tables), and
# clickable image buttons. The string is passed to Gradio verbatim.
CUSTOM_CSS = """
/* Full-width layout */
.gradio-container {max-width: 100% !important; padding-left: 24px; padding-right: 24px}
.container {max-width: 100% !important}
.app {max-width: 100% !important}
/* Header and helpers */
.header {margin-bottom: 8px}
.subtitle {color: var(--body-text-color-subdued)}
.card {border:1px solid var(--border-color); border-radius:12px; padding:8px}
.status-ok {color: var(--color-success)}
/* Scrollable gallery styling */
.scrollable-gallery {
max-height: 600px !important;
overflow-y: auto !important;
border: 1px solid var(--border-color) !important;
border-radius: 8px !important;
padding: 8px !important;
}
/* Page content styling */
.page-content img {
max-width: 100% !important;
height: auto !important;
display: block !important;
margin: 10px auto !important;
border: 1px solid #ddd !important;
border-radius: 8px !important;
box-shadow: 0 2px 4px rgba(0,0,0,0.1) !important;
}
.page-content {
max-height: none !important;
overflow: visible !important;
}
/* Table styling */
.page-content table.doc-table {
width: 100% !important;
border-collapse: collapse !important;
margin: 12px 0 !important;
}
.page-content table.doc-table th,
.page-content table.doc-table td {
border: 1px solid #e5e7eb !important;
padding: 8px 10px !important;
text-align: left !important;
}
.page-content table.doc-table thead th {
background: #f9fafb !important;
font-weight: 600 !important;
}
.page-content table.doc-table tbody tr:nth-child(even) td {
background: #fafafa !important;
}
/* Clickable image buttons */
.image-button {
background: #0066cc !important;
color: white !important;
border: none !important;
padding: 5px 10px !important;
border-radius: 4px !important;
cursor: pointer !important;
margin: 2px !important;
font-size: 14px !important;
}
.image-button:hover {
background: #0052a3 !important;
}
"""
def gather_outputs(
    out_dir: Path,
    allowed_kinds: Optional[List[str]] = None,
    zip_filename: Optional[str] = None,
    is_structured_parsing: bool = False
) -> Tuple[List[tuple[str, str]], List[str], str]:
    """
    Gather output files and create a ZIP archive for download.

    Args:
        out_dir: Directory containing the parser's outputs.
        allowed_kinds: Image kinds to collect ("tables", "charts",
            "figures"); when falsy, all kinds are collected.
        zip_filename: Optional base name for the ZIP archive; characters
            that are invalid in filenames are replaced with '_'.
            Defaults to "doctra_outputs".
        is_structured_parsing: When True, include every file under
            out_dir instead of only the known main output files.

    Returns:
        Tuple of (gallery items as (path, caption) pairs, flat list of
        output file paths, path to the created ZIP archive).
    """
    gallery_items: List[tuple[str, str]] = []
    file_paths: List[str] = []

    def _kind_images(kind: str):
        """Yield image paths for one kind: PNG crops first, then page JPGs."""
        kind_dir = out_dir / kind
        if kind_dir.exists():
            yield from sorted(kind_dir.glob("*.png"))
        images_dir = out_dir / "images" / kind
        if images_dir.exists():
            yield from sorted(images_dir.glob("*.jpg"))

    if out_dir.exists():
        if is_structured_parsing:
            # Structured parsing: ship everything that was produced.
            file_paths.extend(
                str(p) for p in sorted(out_dir.rglob("*")) if p.is_file()
            )
        else:
            # Full parsing: only the known top-level result files...
            for main_file in ("result.html", "result.md", "tables.html", "tables.xlsx"):
                candidate = out_dir / main_file
                if candidate.exists():
                    file_paths.append(str(candidate))
            # ...plus the extracted images, filtered by kind when requested.
            if allowed_kinds:
                for kind in allowed_kinds:
                    file_paths.extend(str(img) for img in _kind_images(kind))
            else:
                for kind in ("charts", "tables"):
                    file_paths.extend(
                        str(p) for p in sorted((out_dir / kind).glob("*.png"))
                    )
                file_paths.extend(
                    str(p) for p in sorted((out_dir / "images").rglob("*.jpg"))
                )
            # Excel exports: the filename depends on which kinds were extracted.
            if allowed_kinds:
                if "charts" in allowed_kinds and "tables" in allowed_kinds:
                    excel_files = ["parsed_tables_charts.xlsx"]
                elif "charts" in allowed_kinds:
                    excel_files = ["parsed_charts.xlsx"]
                elif "tables" in allowed_kinds:
                    excel_files = ["parsed_tables.xlsx"]
                else:
                    excel_files = []
                for excel_file in excel_files:
                    excel_path = out_dir / excel_file
                    if excel_path.exists():
                        file_paths.append(str(excel_path))

    # Build gallery items for image display. The exists() guards inside
    # _kind_images make this safe even when out_dir is missing entirely.
    kinds = allowed_kinds if allowed_kinds else ["tables", "charts", "figures"]
    for kind in kinds:
        gallery_items.extend(
            (str(img), f"{kind}: {img.name}") for img in _kind_images(kind)
        )

    # Create the ZIP archive in a fresh temp dir so concurrent runs never collide.
    tmp_zip_dir = Path(tempfile.mkdtemp(prefix="doctra_zip_"))
    if zip_filename:
        safe_filename = re.sub(r'[<>:"/\\|?*]', '_', zip_filename)
    else:
        safe_filename = "doctra_outputs"
    zip_base = tmp_zip_dir / safe_filename
    filtered_dir = tmp_zip_dir / "filtered_outputs"
    if out_dir.exists():
        # Exclude Office lock files (~$*) and temp files from the archive.
        shutil.copytree(
            out_dir,
            filtered_dir,
            ignore=shutil.ignore_patterns('~$*', '*.tmp', '*.temp'),
        )
    else:
        # Bug fix: shutil.copytree raised FileNotFoundError when out_dir was
        # missing; produce a valid (empty) archive instead.
        filtered_dir.mkdir()
    zip_path = shutil.make_archive(str(zip_base), 'zip', root_dir=str(filtered_dir))
    return gallery_items, file_paths, zip_path
def validate_vlm_config(use_vlm: bool, vlm_api_key: str, vlm_provider: str = "gemini") -> Optional[str]:
    """
    Validate VLM configuration parameters.

    Returns an error message string when the configuration is invalid,
    or None when it is acceptable. Ollama needs no API key; every other
    provider requires one, with light sanity checks on its length.
    """
    # Nothing to validate when VLM is off or the provider is key-less.
    if not use_vlm or vlm_provider in ("ollama",):
        return None
    if not vlm_api_key:
        return "❌ Error: VLM API key is required when using VLM (except for Ollama)"
    key = vlm_api_key.strip()
    # Heuristic length checks — catch obviously truncated/placeholder keys.
    if len(key) < 10:
        return "❌ Error: VLM API key appears to be too short or invalid"
    if key.startswith('sk-') and len(key) < 20:
        return "❌ Error: OpenAI API key appears to be invalid (too short)"
    return None
def create_page_html_content(page_content: List[str], base_dir: Optional[Path] = None) -> str:
"""
Convert page content lines to HTML with inline images and proper formatting.
"""
processed_content = []
paragraph_buffer = []
def flush_paragraph():
"""Flush accumulated paragraph content to HTML"""
nonlocal paragraph_buffer
if paragraph_buffer:
joined = ' '.join(_html.escape(l) for l in paragraph_buffer)
processed_content.append(f'
{joined}
')
paragraph_buffer = []
def is_markdown_table_header(s: str) -> bool:
return '|' in s and ('---' in s or '—' in s)
def render_markdown_table(lines: List[str]) -> str:
rows = [l.strip().strip('|').split('|') for l in lines]
rows = [[_html.escape(c.strip()) for c in r] for r in rows]
if len(rows) < 2:
return ""
header = rows[0]
body = rows[2:] if len(rows) > 2 else []
thead = '
' + ''.join(f'
{c}
' for c in header) + '
'
tbody = '' + ''.join('
' + ''.join(f'
{c}
' for c in r) + '
' for r in body) + ''
return f'
{thead}{tbody}
'
i = 0
n = len(page_content)
while i < n:
raw_line = page_content[i]
line = raw_line.rstrip('\r\n')
stripped = line.strip()
# Handle image references
if stripped.startswith(':
flush_paragraph()
match = re.match(r'!\[([^\]]+)\]\(([^)]+)\)', stripped)
if match and base_dir is not None:
caption = match.group(1)
rel_path = match.group(2).replace('\\\\', '/').replace('\\', '/').lstrip('/')
abs_path = (base_dir / rel_path).resolve()
try:
with open(abs_path, 'rb') as f:
b64 = base64.b64encode(f.read()).decode('ascii')
processed_content.append(f'{_html.escape(caption)}')
except Exception as e:
print(f"❌ Failed to embed image {rel_path}: {e}")
processed_content.append(f'
{_html.escape(caption)} (image not found)
')
else:
processed_content.append(f'
{_html.escape(stripped)}
')
i += 1
continue
# Handle markdown tables
if (stripped.startswith('|') or stripped.count('|') >= 2) and i + 1 < n and is_markdown_table_header(page_content[i + 1]):
flush_paragraph()
table_block = [stripped]
i += 1
table_block.append(page_content[i].strip())
i += 1
while i < n:
nxt = page_content[i].rstrip('\r\n')
if nxt.strip() == '' or (not nxt.strip().startswith('|') and nxt.count('|') < 2):
break
table_block.append(nxt.strip())
i += 1
html_table = render_markdown_table(table_block)
if html_table:
processed_content.append(html_table)
else:
for tl in table_block:
paragraph_buffer.append(tl)
continue
# Handle headers and content
if stripped.startswith('## '):
flush_paragraph()
processed_content.append(f'