Spaces:

Nalan-data
/

DrishtiTable

Running on Zero

App Files Files Community

DrishtiTable / app.py

Nalan-data

Add KaTeX LaTeX math rendering in table preview

b7b5399 verified 27 days ago

raw

history blame contribute delete

8.92 kB

	"""
	DrishtiTable: Table Structure Recognition Demo
	Upload a table image -> get HTML structure back.
	Runs on HuggingFace Spaces with ZeroGPU.
	"""
	import gradio as gr
	import torch
	import spaces
	import os
	from PIL import Image

	# System message sent with every request in predict(). It pins the output
	# format: a bare HTML table built from the listed tags, with colspan/rowspan
	# for merged cells and no style/class/id attributes.
	SYSTEM_PROMPT = """You are a table structure recognition expert. Given an image of a table, output the HTML representation of the table structure and content.

	Rules:
	- Use <table>, <thead>, <tbody>, <tr>, <th>, <td> tags
	- Use colspan and rowspan attributes for merged cells
	- Use <b> for bold text and <sub> for subscripts
	- Output ONLY the HTML table, nothing else
	- Do NOT include any attributes like style, class, or id"""


	def _fit_within(width: int, height: int, max_dim: int) -> tuple[int, int]:
	    """Scale (width, height) down so the longer side is at most ``max_dim``.

	    Sizes already within the limit are returned unchanged. Each scaled side is
	    clamped to at least 1 pixel: truncating the short side of a very thin
	    image would otherwise give 0, which is not a usable resize target.
	    """
	    longest = max(width, height)
	    if longest <= max_dim:
	        return width, height
	    scale = max_dim / longest
	    return max(1, int(width * scale)), max(1, int(height * scale))


	def _strip_code_fences(text: str) -> str:
	    """Drop a leading ```html / ``` fence and a trailing ``` fence, then trim."""
	    return (
	        text.removeprefix("```html")
	        .removeprefix("```")
	        .removesuffix("```")
	        .strip()
	    )


	def _build_preview(table_html: str) -> str:
	    """Wrap ``table_html`` in a styled card with KaTeX auto-render markup.

	    Math delimiters registered with KaTeX are ``$$…$$``, ``$…$``, ``\\(…\\)``
	    and ``\\[…\\]``. Bare parentheses and square brackets are deliberately NOT
	    delimiters: registering them makes KaTeX typeset ordinary text such as
	    "(kg)" or "[1]" as math.
	    """
	    # Escaping chain for the backslash delimiters below: Python '\\\\(' emits
	    # \\( into the HTML attribute, which JavaScript reads as the string \(.
	    #
	    # NOTE(review): table_html is model output and is inserted unsanitised;
	    # consider sanitising it if this preview is shown outside a sandbox.
	    # NOTE(review): browsers generally do not execute <script> tags inserted
	    # via innerHTML — confirm that gr.HTML actually runs the KaTeX scripts.
	    return f"""
	<!-- KaTeX CSS + JS for LaTeX rendering -->
	<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.css">
	<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.js"></script>
	<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/contrib/auto-render.min.js"
	onload="renderMathInElement(document.querySelector('.dt-preview'), {{
	delimiters: [
	{{left: '$$', right: '$$', display: true}},
	{{left: '$', right: '$', display: false}},
	{{left: '\\\\(', right: '\\\\)', display: false}},
	{{left: '\\\\[', right: '\\\\]', display: true}}
	],
	throwOnError: false
	}});"></script>

	<div style="
	background-color: #ffffff;
	border-radius: 12px;
	padding: 20px;
	margin: 10px 0;
	box-shadow: 0 2px 12px rgba(0,0,0,0.15);
	overflow-x: auto;
	">
	<style>
	.dt-preview table {{
	border-collapse: collapse;
	width: 100%;
	font-family: 'Segoe UI', -apple-system, system-ui, sans-serif;
	font-size: 14px;
	line-height: 1.5;
	background-color: #ffffff;
	}}
	.dt-preview th, .dt-preview td {{
	border: 1px solid #d0d0d0;
	padding: 10px 14px;
	text-align: left;
	color: #1a1a1a;
	vertical-align: top;
	}}
	.dt-preview thead th {{
	background-color: #f0a500;
	color: #ffffff;
	font-weight: 700;
	font-size: 13px;
	letter-spacing: 0.3px;
	}}
	.dt-preview tbody tr:nth-child(even) {{
	background-color: #fafafa;
	}}
	.dt-preview tbody tr:nth-child(odd) {{
	background-color: #ffffff;
	}}
	.dt-preview tbody tr:hover {{
	background-color: #fff8e1;
	}}
	.dt-preview td {{
	color: #333333;
	}}
	.dt-preview b {{
	color: #1a1a1a;
	font-weight: 700;
	}}
	.dt-preview sub {{
	font-size: 0.75em;
	vertical-align: sub;
	}}
	.dt-preview .katex {{
	font-size: 1.1em;
	}}
	</style>
	<div class="dt-preview">{table_html}</div>
	</div>
	"""


	@spaces.GPU(duration=300)
	def predict(image: Image.Image) -> tuple[str, str]:
	    """Run DrishtiTable on an uploaded table image.

	    The base model, LoRA adapter and processor are loaded inside this call
	    (i.e. on every invocation), the image is converted to RGB and downscaled
	    so its longer side is at most 1024 px, and generation is greedy with up
	    to 2048 new tokens.

	    Args:
	        image: The uploaded table image, or None when nothing was uploaded.

	    Returns:
	        A ``(html, preview_html)`` pair: the model's HTML with any Markdown
	        code fences removed, and that HTML wrapped in a styled preview card.
	        When ``image`` is None, a pair of short placeholder messages instead.
	    """
	    if image is None:
	        return "Please upload a table image.", "<p>No image uploaded.</p>"

	    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
	    from peft import PeftModel
	    from qwen_vl_utils import process_vision_info

	    # Load base model + LoRA adapter inside GPU context
	    print("Loading base model...")
	    base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
	        "Qwen/Qwen2.5-VL-7B-Instruct",
	        torch_dtype=torch.float16,
	        device_map="auto",
	        low_cpu_mem_usage=True,
	    )

	    print("Loading LoRA adapter...")
	    model = PeftModel.from_pretrained(
	        base_model,
	        "Nalandadata/DrishtiTable-Qwen2.5-VL-7B",
	    )
	    model.eval()

	    processor = AutoProcessor.from_pretrained(
	        "Qwen/Qwen2.5-VL-7B-Instruct",
	    )

	    print("Model loaded! Running inference...")

	    image = image.convert("RGB")

	    # Resize large images to reduce memory
	    target_size = _fit_within(*image.size, max_dim=1024)
	    if target_size != image.size:
	        image = image.resize(target_size, Image.LANCZOS)

	    messages = [
	        {"role": "system", "content": SYSTEM_PROMPT},
	        {
	            "role": "user",
	            "content": [
	                {"type": "image", "image": image},
	                {"type": "text", "text": "Convert this table image to HTML. Output only the HTML table structure with cell content."},
	            ],
	        },
	    ]

	    text = processor.apply_chat_template(
	        messages, tokenize=False, add_generation_prompt=True
	    )
	    image_inputs, video_inputs = process_vision_info(messages)
	    inputs = processor(
	        text=[text],
	        images=image_inputs,
	        videos=video_inputs,
	        padding=True,
	        return_tensors="pt",
	    ).to(model.device)

	    with torch.no_grad():
	        output_ids = model.generate(
	            **inputs,
	            max_new_tokens=2048,
	            do_sample=False,
	        )

	    # Drop the prompt-length prefix from each output sequence so that only
	    # the newly generated tokens are decoded.
	    generated_ids = [
	        out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)
	    ]
	    html = processor.batch_decode(
	        generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
	    )[0].strip()

	    # Cleanup
	    del model, base_model, processor, inputs, output_ids
	    torch.cuda.empty_cache()

	    # Strip markdown code fences
	    html = _strip_code_fences(html)

	    # Styled preview with KaTeX for LaTeX math rendering
	    preview_html = _build_preview(html)

	    return html, preview_html


	# =====================
	# Gradio Interface
	# =====================

	# Centered HTML header (name, subtitle, tagline) rendered with gr.HTML at the
	# top of the page.
	TITLE = """
	<div style="text-align: center; margin-bottom: 10px;">
	<h1>DrishtiTable</h1>
	<h3>Table Structure Recognition</h3>
	<p><em>Upload a table image, get HTML structure back. Powered by the fine-tuned DrishtiTable model.</em></p>
	</div>
	"""

	# Markdown introduction rendered with gr.Markdown below the title.
	# Table pipes are written unescaped: "\|" is an invalid escape sequence in a
	# regular Python string, and a backslash-escaped pipe is a literal character
	# in Markdown, so the benchmark rows would not be parsed as a table.
	DESCRIPTION = """
	DrishtiTable is a fine-tuned [Qwen2.5-VL-7B](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) model
	that converts table images into structured HTML. Trained on 1,141 Indian academic textbook tables,
	it achieves 83.2% TEDS — outperforming GPT-4o (71.1%) by +12.1 points.

	| Model | TEDS Score | Improvement |
	|---|---|---|
	| o4-mini (OpenAI) | 61.4% | — |
	| GPT-4.1 (OpenAI) | 68.0% | — |
	| GPT-4o (OpenAI) | 71.1% | — |
	| DrishtiTable (This Demo) | 83.2% | +12.1 over GPT-4o |

	Upload any table image below to try it. First run takes ~60s to load the model, subsequent runs are faster.
	"""

	# Markdown footer rendered with gr.Markdown at the bottom of the page: a
	# local-inference snippet and a table of resource links.
	# Table pipes are written unescaped: "\|" is an invalid escape sequence in a
	# regular Python string, and a backslash-escaped pipe is a literal character
	# in Markdown, so the resource rows would not be parsed as a table.
	ARTICLE = """
	---

	### Run Locally (Fastest)

	```python
	from unsloth import FastVisionModel

	model, tokenizer = FastVisionModel.from_pretrained(
	"Nalandadata/DrishtiTable-Qwen2.5-VL-7B",
	max_seq_length=4096, load_in_4bit=True,
	)
	FastVisionModel.for_inference(model)
	```

	### Resources

	| Resource | Link |
	|---|---|
	| Fine-tuned Model | [Nalandadata/DrishtiTable-Qwen2.5-VL-7B](https://huggingface.co/Nalandadata/DrishtiTable-Qwen2.5-VL-7B) |
	| Dataset (sample) | [Nalandadata/DrishtiTable](https://huggingface.co/datasets/Nalandadata/DrishtiTable) |
	| Base Model | [Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) |

	Built by [Nalanda Data](https://huggingface.co/Nalandadata). DrishtiTable (Sanskrit: drishti = vision).
	"""

	# Page shell: browser title, Soft theme with yellow/gray hues, and custom CSS
	# that caps the container width at 1200px and hides the footer.
	with gr.Blocks(
	title="DrishtiTable - Table Structure Recognition",
	theme=gr.themes.Soft(primary_hue="yellow", secondary_hue="gray"),
	css="""
	.gradio-container { max-width: 1200px !important; }
	footer { display: none !important; }
	""",
	) as demo:
	# Header and Markdown introduction.
	gr.HTML(TITLE)
	gr.Markdown(DESCRIPTION)

	with gr.Row(equal_height=True):
	with gr.Column(scale=1):
	# Image input handed to predict() as a PIL image; accepts a file upload
	# or a clipboard paste.
	input_image = gr.Image(
	type="pil",
	label="Upload Table Image",
	height=400,
	sources=["upload", "clipboard"],
	)
	submit_btn = gr.Button(
	"Recognize Table Structure",
	variant="primary",
	size="lg",
	)

	with gr.Column(scale=1):
	# Code view for the first value returned by predict().
	html_output = gr.Code(
	label="Predicted HTML",
	language="html",
	lines=18,
	)

	gr.Markdown("### Rendered Table Preview")
	# HTML view for the second value returned by predict().
	rendered_output = gr.HTML(label="Rendered Table Preview")

	# Clicking the button calls predict(image) and routes its two return values
	# to the code view and the rendered preview, in that order.
	submit_btn.click(
	fn=predict,
	inputs=[input_image],
	outputs=[html_output, rendered_output],
	)

	gr.Markdown(ARTICLE)

	# Launch the app only when this file is executed as a script.
	if __name__ == "__main__":
	demo.launch()