""" DrishtiTable: Table Structure Recognition Demo Upload a table image -> get HTML structure back. Runs on HuggingFace Spaces with ZeroGPU. """ import gradio as gr import torch import spaces import os from PIL import Image SYSTEM_PROMPT = """You are a table structure recognition expert. Given an image of a table, output the HTML representation of the table structure and content. Rules: - Use , , , ,

,	tags - Use colspan and rowspan attributes for merged cells - Use for bold text and _{for subscripts - Output ONLY the HTML table, nothing else - Do NOT include any attributes like style, class, or id""" @spaces.GPU(duration=300) def predict(image: Image.Image) -> tuple[str, str]: """Run DrishtiTable on an uploaded table image.""" if image is None: return "Please upload a table image.", "No image uploaded." from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor from peft import PeftModel from qwen_vl_utils import process_vision_info # Load base model + LoRA adapter inside GPU context print("Loading base model...") base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained( "Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype=torch.float16, device_map="auto", low_cpu_mem_usage=True, ) print("Loading LoRA adapter...") model = PeftModel.from_pretrained( base_model, "Nalandadata/DrishtiTable-Qwen2.5-VL-7B", ) model.eval() processor = AutoProcessor.from_pretrained( "Qwen/Qwen2.5-VL-7B-Instruct", ) print("Model loaded! Running inference...") image = image.convert("RGB") # Resize large images to reduce memory max_dim = 1024 w, h = image.size if max(w, h) > max_dim: scale = max_dim / max(w, h) image = image.resize((int(w * scale), int(h * scale)), Image.LANCZOS) messages = [ {"role": "system", "content": SYSTEM_PROMPT}, { "role": "user", "content": [ {"type": "image", "image": image}, {"type": "text", "text": "Convert this table image to HTML. 
Output only the HTML table structure with cell content."}, ], }, ] text = processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) image_inputs, video_inputs = process_vision_info(messages) inputs = processor( text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt", ).to(model.device) with torch.no_grad(): output_ids = model.generate( inputs, max_new_tokens=2048, do_sample=False, ) generated_ids = [ out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids) ] html = processor.batch_decode( generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False )[0].strip() # Cleanup del model, base_model, processor, inputs, output_ids torch.cuda.empty_cache() # Strip markdown code fences if html.startswith("```html"): html = html[7:] if html.startswith("```"): html = html[3:] if html.endswith("```"): html = html[:-3] html = html.strip() # Styled preview with KaTeX for LaTeX math rendering preview_html = f""" {html} """ return html, preview_html # ===================== # Gradio Interface # ===================== TITLE = """ DrishtiTable Table Structure Recognition Upload a table image, get HTML structure back. Powered by the fine-tuned DrishtiTable model. """ DESCRIPTION = """ DrishtiTable is a fine-tuned [Qwen2.5-VL-7B](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) model that converts table images into structured HTML. Trained on 1,141 Indian academic textbook tables, it achieves 83.2% TEDS — outperforming GPT-4o (71.1%) by +12.1 points. \| Model \| TEDS Score \| Improvement \| \|---\|---\|---\| \| o4-mini (OpenAI) \| 61.4% \| — \| \| GPT-4.1 (OpenAI) \| 68.0% \| — \| \| GPT-4o (OpenAI) \| 71.1% \| — \| \| DrishtiTable (This Demo) \| 83.2% \| +12.1 over GPT-4o** \| Upload any table image below to try it. First run takes ~60s to load the model, subsequent runs are faster. 
""" ARTICLE = """ --- ### Run Locally (Fastest) ```python from unsloth import FastVisionModel model, tokenizer = FastVisionModel.from_pretrained( "Nalandadata/DrishtiTable-Qwen2.5-VL-7B", max_seq_length=4096, load_in_4bit=True, ) FastVisionModel.for_inference(model) ``` ### Resources \| Resource \| Link \| \|---\|---\| \| Fine-tuned Model \| [Nalandadata/DrishtiTable-Qwen2.5-VL-7B](https://huggingface.co/Nalandadata/DrishtiTable-Qwen2.5-VL-7B) \| \| Dataset (sample) \| [Nalandadata/DrishtiTable](https://huggingface.co/datasets/Nalandadata/DrishtiTable) \| \| Base Model \| [Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) \| Built by [Nalanda Data](https://huggingface.co/Nalandadata). DrishtiTable (Sanskrit: drishti = vision). """ with gr.Blocks( title="DrishtiTable - Table Structure Recognition", theme=gr.themes.Soft(primary_hue="yellow", secondary_hue="gray"), css=""" .gradio-container { max-width: 1200px !important; } footer { display: none !important; } """, ) as demo: gr.HTML(TITLE) gr.Markdown(DESCRIPTION) with gr.Row(equal_height=True): with gr.Column(scale=1): input_image = gr.Image( type="pil", label="Upload Table Image", height=400, sources=["upload", "clipboard"], ) submit_btn = gr.Button( "Recognize Table Structure", variant="primary", size="lg", ) with gr.Column(scale=1): html_output = gr.Code( label="Predicted HTML", language="html", lines=18, ) gr.Markdown("### Rendered Table Preview") rendered_output = gr.HTML(label="Rendered Table Preview") submit_btn.click( fn=predict, inputs=[input_image], outputs=[html_output, rendered_output], ) gr.Markdown(ARTICLE) if __name__ == "__main__": demo.launch()}

tags - Use colspan and rowspan attributes for merged cells - Use for bold text and _{for subscripts
- Output ONLY the HTML table, nothing else
- Do NOT include any attributes like style, class, or id"""

def _shrink_to_fit(image: Image.Image, max_dim: int = 1024) -> Image.Image:
    """Return *image* scaled down so its longer side is at most *max_dim*.

    Images already within the limit are returned unchanged. Each target
    dimension is clamped to at least one pixel so that an extreme aspect
    ratio cannot produce a zero-sized resize request.
    """
    width, height = image.size
    longest = max(width, height)
    if longest <= max_dim:
        return image
    scale = max_dim / longest
    target = (max(1, int(width * scale)), max(1, int(height * scale)))
    return image.resize(target, Image.LANCZOS)


def _strip_code_fences(text: str) -> str:
    """Remove a leading ```html / ``` fence and a trailing ``` fence, then trim."""
    text = text.removeprefix("```html").removeprefix("```")
    return text.removesuffix("```").strip()


@spaces.GPU(duration=300)
def predict(image: Image.Image) -> tuple[str, str]:
    """Run DrishtiTable on an uploaded table image.

    The base model, the LoRA adapter and the processor are loaded on every
    call, inside the ``spaces.GPU`` context, and released again before
    returning.

    Args:
        image: The uploaded table image, or ``None`` when nothing was
            uploaded.

    Returns:
        A ``(html, preview_html)`` pair: the decoded model output with any
        surrounding Markdown code fence removed, and that same text wrapped
        for the rendered-preview component. When ``image`` is ``None`` a pair
        of user-facing messages is returned instead.
    """
    if image is None:
        return "Please upload a table image.", "No image uploaded."

    # Imported lazily so the heavy libraries are only pulled in when a
    # request actually runs.
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from peft import PeftModel
    from qwen_vl_utils import process_vision_info

    # Pre-bind every large object so the ``finally`` clause can always delete
    # them, no matter where a failure happens.
    model = base_model = processor = inputs = output_ids = generated_ids = None
    try:
        # Load base model + LoRA adapter inside GPU context
        print("Loading base model...")
        base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            "Qwen/Qwen2.5-VL-7B-Instruct",
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True,
        )

        print("Loading LoRA adapter...")
        model = PeftModel.from_pretrained(
            base_model,
            "Nalandadata/DrishtiTable-Qwen2.5-VL-7B",
        )
        model.eval()

        processor = AutoProcessor.from_pretrained(
            "Qwen/Qwen2.5-VL-7B-Instruct",
        )

        print("Model loaded! Running inference...")

        # Resize large images to reduce memory
        image = _shrink_to_fit(image.convert("RGB"), max_dim=1024)

        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": image},
                    {"type": "text", "text": "Convert this table image to HTML. Output only the HTML table structure with cell content."},
                ],
            },
        ]

        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(model.device)

        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=2048,
                do_sample=False,
            )

        # generate() returns prompt + completion; keep only the new tokens.
        generated_ids = [
            out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)
        ]
        html = processor.batch_decode(
            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0].strip()
    finally:
        # Cleanup runs on success and on failure, so a crashed request does
        # not leave the model resident on the GPU.
        del model, base_model, processor, inputs, output_ids, generated_ids
        torch.cuda.empty_cache()

    # Strip markdown code fences
    html = _strip_code_fences(html)

    # Preview payload for the rendered-table component.
    # NOTE(review): the original comment here read "Styled preview with KaTeX
    # for LaTeX math rendering", but this literal contains no styling or
    # script markup -- it looks like the wrapper HTML was lost. Confirm and
    # restore it if so.
    preview_html = f"""

{html}

"""

    return html, preview_html

# =====================
# Gradio Interface
# =====================

# Page header; rendered with gr.HTML at the top of the demo.
# NOTE(review): this is passed to an HTML component yet contains no tags, so
# the three lines will flow together as plain text. The heading markup may
# have been lost -- confirm against the deployed Space.
TITLE = """

DrishtiTable
Table Structure Recognition
Upload a table image, get HTML structure back. Powered by the fine-tuned DrishtiTable model.

"""

# Markdown intro shown under the header (rendered with gr.Markdown): a short
# model summary, a TEDS comparison table, and usage guidance.
DESCRIPTION = """
**DrishtiTable** is a fine-tuned [Qwen2.5-VL-7B](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) model
that converts table images into structured HTML. Trained on 1,141 Indian academic textbook tables,
it achieves **83.2% TEDS** — outperforming GPT-4o (71.1%) by +12.1 points.

| Model | TEDS Score | Improvement |
|---|---|---|
| o4-mini (OpenAI) | 61.4% | — |
| GPT-4.1 (OpenAI) | 68.0% | — |
| GPT-4o (OpenAI) | 71.1% | — |
| **DrishtiTable (This Demo)** | **83.2%** | **+12.1 over GPT-4o** |

Upload any table image below to try it. First run takes ~60s to load the model, subsequent runs are faster.
"""

# Markdown footer shown at the bottom of the page (rendered with
# gr.Markdown): a local-usage snippet, resource links, and attribution.
ARTICLE = """
---

### Run Locally (Fastest)

```python
from unsloth import FastVisionModel

model, tokenizer = FastVisionModel.from_pretrained(
"Nalandadata/DrishtiTable-Qwen2.5-VL-7B",
max_seq_length=4096, load_in_4bit=True,
)
FastVisionModel.for_inference(model)
```

### Resources

| Resource | Link |
|---|---|
| Fine-tuned Model | [Nalandadata/DrishtiTable-Qwen2.5-VL-7B](https://huggingface.co/Nalandadata/DrishtiTable-Qwen2.5-VL-7B) |
| Dataset (sample) | [Nalandadata/DrishtiTable](https://huggingface.co/datasets/Nalandadata/DrishtiTable) |
| Base Model | [Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) |

*Built by [Nalanda Data](https://huggingface.co/Nalandadata). DrishtiTable (Sanskrit: drishti = vision).*
"""

# Build the demo page: header, description, an upload column beside a code
# output column, a rendered preview, and the footer article.
# NOTE(review): the nesting below was reconstructed from the blank-line
# grouping of the statements (the indentation had been lost). In particular,
# placing the preview heading and HTML output below the row rather than inside
# the second column is an assumption -- confirm against the intended layout.
with gr.Blocks(
    title="DrishtiTable - Table Structure Recognition",
    theme=gr.themes.Soft(primary_hue="yellow", secondary_hue="gray"),
    css="""
.gradio-container { max-width: 1200px !important; }
footer { display: none !important; }
""",
) as demo:
    gr.HTML(TITLE)
    gr.Markdown(DESCRIPTION)

    with gr.Row(equal_height=True):
        # Left column: image upload plus the trigger button.
        with gr.Column(scale=1):
            input_image = gr.Image(
                type="pil",
                label="Upload Table Image",
                height=400,
                sources=["upload", "clipboard"],
            )
            submit_btn = gr.Button(
                "Recognize Table Structure",
                variant="primary",
                size="lg",
            )

        # Right column: the predicted HTML source.
        with gr.Column(scale=1):
            html_output = gr.Code(
                label="Predicted HTML",
                language="html",
                lines=18,
            )

    gr.Markdown("### Rendered Table Preview")
    rendered_output = gr.HTML(label="Rendered Table Preview")

    # predict returns (html, preview_html), matching the two outputs in order.
    submit_btn.click(
        fn=predict,
        inputs=[input_image],
        outputs=[html_output, rendered_output],
    )

    gr.Markdown(ARTICLE)

# Start the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()