# DrishtiTable / app.py
# Nalan-data's picture
# Add KaTeX LaTeX math rendering in table preview
# b7b5399 verified
"""
DrishtiTable: Table Structure Recognition Demo
Upload a table image -> get HTML structure back.
Runs on HuggingFace Spaces with ZeroGPU.
"""
import gradio as gr
import torch
import spaces
import os
from PIL import Image
# System-role instruction for the model: describes the task and the HTML
# output rules. Built from one literal per line so each rule is easy to diff.
SYSTEM_PROMPT = "\n".join(
    (
        "You are a table structure recognition expert. Given an image of a table, output the HTML representation of the table structure and content.",
        "Rules:",
        "- Use <table>, <thead>, <tbody>, <tr>, <th>, <td> tags",
        "- Use colspan and rowspan attributes for merged cells",
        "- Use <b> for bold text and <sub> for subscripts",
        "- Output ONLY the HTML table, nothing else",
        "- Do NOT include any attributes like style, class, or id",
    )
)
def _fit_within(image: Image.Image, max_dim: int = 1024) -> Image.Image:
    """Downscale *image* so that its longest side is at most *max_dim* pixels.

    The aspect ratio is preserved. Images already within the limit are
    returned unchanged. Each target dimension is clamped to at least one
    pixel so very elongated images (e.g. 5000x1) do not produce a zero-sized
    target, which ``Image.resize`` rejects.
    """
    width, height = image.size
    longest = max(width, height)
    if longest <= max_dim:
        return image
    scale = max_dim / longest
    target = (max(1, int(width * scale)), max(1, int(height * scale)))
    return image.resize(target, Image.LANCZOS)


def _strip_code_fences(text: str) -> str:
    """Remove a leading ```html / ``` fence and a trailing ``` fence, then trim."""
    return (
        text.removeprefix("```html")
        .removeprefix("```")
        .removesuffix("```")
        .strip()
    )


def _render_preview(table_html: str) -> str:
    """Wrap *table_html* in the styled preview markup with KaTeX auto-render.

    Only ``$$``, ``$``, ``\\(``/``\\)`` and ``\\[``/``\\]`` are registered as
    math delimiters. Bare ``(``/``)`` and ``[``/``]`` are deliberately not
    registered: they would turn every ordinary parenthesised or bracketed
    cell text (units, citations, ...) into typeset math.
    """
    # NOTE(review): table_html is model output and is inserted unescaped so the
    # table can render; anything the model emits (including <script> or inline
    # event handlers) reaches the page. Consider sanitising to an allow-list of
    # table tags before display.
    # NOTE(review): confirm that the installed Gradio version actually executes
    # <script> tags placed inside a gr.HTML value; if it does not, the KaTeX
    # assets need to be loaded at page level instead.
    return f"""
<!-- KaTeX CSS + JS for LaTeX rendering -->
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.css">
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.js"></script>
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/contrib/auto-render.min.js"
onload="renderMathInElement(document.querySelector('.dt-preview'), {{
delimiters: [
{{left: '$$', right: '$$', display: true}},
{{left: '$', right: '$', display: false}},
{{left: '\\\\(', right: '\\\\)', display: false}},
{{left: '\\\\[', right: '\\\\]', display: true}}
],
throwOnError: false
}});"></script>
<div style="
background-color: #ffffff;
border-radius: 12px;
padding: 20px;
margin: 10px 0;
box-shadow: 0 2px 12px rgba(0,0,0,0.15);
overflow-x: auto;
">
<style>
.dt-preview table {{
border-collapse: collapse;
width: 100%;
font-family: 'Segoe UI', -apple-system, system-ui, sans-serif;
font-size: 14px;
line-height: 1.5;
background-color: #ffffff;
}}
.dt-preview th, .dt-preview td {{
border: 1px solid #d0d0d0;
padding: 10px 14px;
text-align: left;
color: #1a1a1a;
vertical-align: top;
}}
.dt-preview thead th {{
background-color: #f0a500;
color: #ffffff;
font-weight: 700;
font-size: 13px;
letter-spacing: 0.3px;
}}
.dt-preview tbody tr:nth-child(even) {{
background-color: #fafafa;
}}
.dt-preview tbody tr:nth-child(odd) {{
background-color: #ffffff;
}}
.dt-preview tbody tr:hover {{
background-color: #fff8e1;
}}
.dt-preview td {{
color: #333333;
}}
.dt-preview b {{
color: #1a1a1a;
font-weight: 700;
}}
.dt-preview sub {{
font-size: 0.75em;
vertical-align: sub;
}}
.dt-preview .katex {{
font-size: 1.1em;
}}
</style>
<div class="dt-preview">{table_html}</div>
</div>
"""


@spaces.GPU(duration=300)
def predict(image: Image.Image) -> tuple[str, str]:
    """Run DrishtiTable on an uploaded table image.

    The base model, LoRA adapter and processor are loaded inside this call
    (i.e. on every invocation) and released again before returning.

    Args:
        image: The uploaded table image, or ``None`` when nothing was uploaded.

    Returns:
        A ``(html, preview_html)`` pair: the predicted table HTML with any
        markdown code fences removed, and the same HTML wrapped in styled
        preview markup. When *image* is ``None`` a pair of placeholder
        messages is returned instead and no model is loaded.
    """
    if image is None:
        return "Please upload a table image.", "<p>No image uploaded.</p>"

    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from peft import PeftModel
    from qwen_vl_utils import process_vision_info

    # Resize large images to reduce memory
    image = _fit_within(image.convert("RGB"), max_dim=1024)

    # Pre-bind every name released in ``finally`` so the cleanup itself cannot
    # fail when loading or generation raises part-way through.
    base_model = model = processor = inputs = output_ids = None
    try:
        # Load base model + LoRA adapter inside GPU context
        print("Loading base model...")
        base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2.5-VL-7B-Instruct",
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True,
        )
        print("Loading LoRA adapter...")
        model = PeftModel.from_pretrained(
            base_model,
            "Nalandadata/DrishtiTable-Qwen2.5-VL-7B",
        )
        model.eval()
        processor = AutoProcessor.from_pretrained(
            "Qwen/Qwen2.5-VL-7B-Instruct",
        )
        print("Model loaded! Running inference...")

        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": "Convert this table image to HTML. Output only the HTML table structure with cell content."},
                ],
            },
        ]
        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(model.device)

        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=2048,
                do_sample=False,
            )

        # Keep only the newly generated tokens by dropping the prompt prefix.
        generated_ids = [
            out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)
        ]
        html = processor.batch_decode(
            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0].strip()
    finally:
        # Cleanup (best effort): drop our references and release cached GPU
        # memory on failure as well as on success.
        del model, base_model, processor, inputs, output_ids
        torch.cuda.empty_cache()

    # Strip markdown code fences
    html = _strip_code_fences(html)

    # Styled preview with KaTeX for LaTeX math rendering
    return html, _render_preview(html)
# =====================
# Gradio Interface
# =====================
# Centered HTML header shown at the top of the page. The empty first and
# last items reproduce the leading and trailing newline of the markup.
TITLE = "\n".join(
    (
        "",
        '<div style="text-align: center; margin-bottom: 10px;">',
        "<h1>DrishtiTable</h1>",
        "<h3>Table Structure Recognition</h3>",
        "<p><em>Upload a table image, get HTML structure back. Powered by the fine-tuned DrishtiTable model.</em></p>",
        "</div>",
        "",
    )
)
# Markdown intro rendered under the title: model summary, TEDS comparison
# table and a usage hint. One item per Markdown line; the empty first and
# last items reproduce the leading and trailing newline.
DESCRIPTION = "\n".join(
    (
        "",
        "**DrishtiTable** is a fine-tuned [Qwen2.5-VL-7B](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) model",
        "that converts table images into structured HTML. Trained on 1,141 Indian academic textbook tables,",
        "it achieves **83.2% TEDS** — outperforming GPT-4o (71.1%) by +12.1 points.",
        "| Model | TEDS Score | Improvement |",
        "|---|---|---|",
        "| o4-mini (OpenAI) | 61.4% | — |",
        "| GPT-4.1 (OpenAI) | 68.0% | — |",
        "| GPT-4o (OpenAI) | 71.1% | — |",
        "| **DrishtiTable (This Demo)** | **83.2%** | **+12.1 over GPT-4o** |",
        "Upload any table image below to try it. First run takes ~60s to load the model, subsequent runs are faster.",
        "",
    )
)
# Markdown footer rendered at the bottom of the page: a local-usage snippet
# and a table of resource links. One item per Markdown line; the empty first
# and last items reproduce the leading and trailing newline.
ARTICLE = "\n".join(
    (
        "",
        "---",
        "### Run Locally (Fastest)",
        "```python",
        "from unsloth import FastVisionModel",
        "model, tokenizer = FastVisionModel.from_pretrained(",
        '"Nalandadata/DrishtiTable-Qwen2.5-VL-7B",',
        "max_seq_length=4096, load_in_4bit=True,",
        ")",
        "FastVisionModel.for_inference(model)",
        "```",
        "### Resources",
        "| Resource | Link |",
        "|---|---|",
        "| Fine-tuned Model | [Nalandadata/DrishtiTable-Qwen2.5-VL-7B](https://huggingface.co/Nalandadata/DrishtiTable-Qwen2.5-VL-7B) |",
        "| Dataset (sample) | [Nalandadata/DrishtiTable](https://huggingface.co/datasets/Nalandadata/DrishtiTable) |",
        "| Base Model | [Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) |",
        "*Built by [Nalanda Data](https://huggingface.co/Nalandadata). DrishtiTable (Sanskrit: drishti = vision).*",
        "",
    )
)
# Build the page: title and description, an image input with a submit button,
# a code view for the predicted HTML, a rendered preview, and the footer
# article. The Blocks object is bound to `demo`, which the entry point below
# launches.
with gr.Blocks(
title="DrishtiTable - Table Structure Recognition",
theme=gr.themes.Soft(primary_hue="yellow", secondary_hue="gray"),
# Custom CSS: cap the container width at 1200px and hide the page footer.
css="""
.gradio-container { max-width: 1200px !important; }
footer { display: none !important; }
""",
) as demo:
gr.HTML(TITLE)
gr.Markdown(DESCRIPTION)
with gr.Row(equal_height=True):
with gr.Column(scale=1):
# type="pil" so the value passed to `predict` is a PIL image; input is
# accepted from file upload or the clipboard only.
input_image = gr.Image(
type="pil",
label="Upload Table Image",
height=400,
sources=["upload", "clipboard"],
)
submit_btn = gr.Button(
"Recognize Table Structure",
variant="primary",
size="lg",
)
with gr.Column(scale=1):
# Shows the first element of predict's return value (the raw table HTML).
html_output = gr.Code(
label="Predicted HTML",
language="html",
lines=18,
)
gr.Markdown("### Rendered Table Preview")
# Shows the second element of predict's return value (the styled preview markup).
rendered_output = gr.HTML(label="Rendered Table Preview")
# Clicking the button calls `predict` with the uploaded image; its two return
# values fill `html_output` and `rendered_output`, in that order.
submit_btn.click(
fn=predict,
inputs=[input_image],
outputs=[html_output, rendered_output],
)
gr.Markdown(ARTICLE)
# Start the Gradio server only when this file is run as a script.
if __name__ == "__main__":
demo.launch()