"""
DrishtiTable: Table Structure Recognition Demo

Upload a table image -> get HTML structure back.
Runs on HuggingFace Spaces with ZeroGPU.
"""

import os
from typing import Optional

import gradio as gr
import torch
import spaces
from PIL import Image

# Hugging Face repo ids of the base model and the LoRA adapter applied to it.
BASE_MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
ADAPTER_ID = "Nalandadata/DrishtiTable-Qwen2.5-VL-7B"

# Longest allowed image side in pixels; larger uploads are downscaled to
# reduce memory use during inference.
MAX_IMAGE_DIM = 1024

# Upper bound on the number of tokens generated per request.
MAX_NEW_TOKENS = 2048

# NOTE(review): the tag names in the two "Use ..." rules below were missing
# from the reviewed copy of this file (the markup had been stripped, leaving
# "Use , , , , , tags"). They are reconstructed here, as is the line layout —
# confirm both against the prompt the adapter was actually trained with.
SYSTEM_PROMPT = """You are a table structure recognition expert. Given an image of a table, output the HTML representation of the table structure and content.

Rules:
- Use <table>, <thead>, <tbody>, <tr>, <th>, <td> tags
- Use colspan and rowspan attributes for merged cells
- Use <b> for bold text and <sub> for subscripts
- Output ONLY the HTML table, nothing else
- Do NOT include any attributes like style, class, or id"""

USER_PROMPT = (
    "Convert this table image to HTML. "
    "Output only the HTML table structure with cell content."
)


def _resize_to_max_dim(image: Image.Image, max_dim: int) -> Image.Image:
    """Return *image* downscaled so its longest side is at most *max_dim*.

    Images already within the limit are returned unchanged. The aspect ratio
    is preserved, and each side is clamped to at least one pixel so that a
    very elongated image cannot round down to a zero-sized dimension.
    """
    width, height = image.size
    longest = max(width, height)
    if longest <= max_dim:
        return image
    scale = max_dim / longest
    new_size = (max(1, int(width * scale)), max(1, int(height * scale)))
    return image.resize(new_size, Image.LANCZOS)


def _strip_code_fences(text: str) -> str:
    """Remove a leading ```html / ``` fence and a trailing ``` fence."""
    text = text.removeprefix("```html").removeprefix("```")
    return text.removesuffix("```").strip()


def _load_model_and_processor():
    """Load the base model, attach the LoRA adapter, and load the processor.

    The heavy libraries are imported here rather than at module import time,
    so they are only loaded when a prediction is actually requested.

    Returns:
        A ``(model, processor)`` pair with the model in eval mode.
    """
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from peft import PeftModel

    print("Loading base model...")
    base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=torch.float16,
        device_map="auto",
        low_cpu_mem_usage=True,
    )

    print("Loading LoRA adapter...")
    model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
    model.eval()

    processor = AutoProcessor.from_pretrained(BASE_MODEL_ID)
    return model, processor


def _generate_table_html(model, processor, image: Image.Image) -> str:
    """Run one greedy generation pass and return the decoded model output."""
    from qwen_vl_utils import process_vision_info

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": USER_PROMPT},
            ],
        },
    ]

    prompt_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[prompt_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
        )

    # Drop the prompt tokens from each output sequence so that only the newly
    # generated tokens are decoded.
    completion_ids = [
        out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)
    ]
    decoded = processor.batch_decode(
        completion_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return decoded[0].strip()


@spaces.GPU(duration=300)
def predict(image: Optional[Image.Image]) -> tuple[str, str]:
    """Run DrishtiTable on an uploaded table image.

    Args:
        image: The uploaded table image, or ``None`` when nothing was uploaded.

    Returns:
        A ``(html, preview_html)`` pair: the predicted table HTML for the code
        view, and the markup handed to the rendered preview. When *image* is
        ``None`` both elements are short messages asking for an upload.
    """
    if image is None:
        # NOTE(review): the <p> wrapper is reconstructed; the markup of this
        # message was stripped in the reviewed copy.
        return "Please upload a table image.", "<p>No image uploaded.</p>"

    model = processor = None
    try:
        # Model and adapter are loaded inside the GPU-decorated call.
        model, processor = _load_model_and_processor()
        print("Model loaded! Running inference...")

        prepared = _resize_to_max_dim(image.convert("RGB"), MAX_IMAGE_DIM)
        raw_output = _generate_table_html(model, processor, prepared)
    finally:
        # Best-effort cleanup that also runs when loading or generation
        # fails, so a failed request does not skip the cache release.
        del model, processor
        torch.cuda.empty_cache()

    html = _strip_code_fences(raw_output)

    # The model's HTML is embedded as-is so the preview renders the table.
    # It is model-generated and not sanitized here.
    # NOTE(review): the original comment described a styled preview with KaTeX
    # for LaTeX math rendering, but that wrapper markup was not recoverable
    # from the reviewed copy — restore it here if it is still wanted.
    preview_html = f"\n{html}\n"

    return html, preview_html


# =====================
# Gradio Interface
# =====================

# NOTE(review): the heading markup below is reconstructed (the tags were
# stripped in the reviewed copy); adjust to match the intended styling.
TITLE = """
<div>
  <h1>DrishtiTable</h1>
  <h3>Table Structure Recognition</h3>
  <p>Upload a table image, get HTML structure back. Powered by the fine-tuned DrishtiTable model.</p>
</div>
"""

DESCRIPTION = """
**DrishtiTable** is a fine-tuned [Qwen2.5-VL-7B](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) model that converts table images into structured HTML.

Trained on 1,141 Indian academic textbook tables, it achieves **83.2% TEDS** — outperforming GPT-4o (71.1%) by +12.1 points.

| Model | TEDS Score | Improvement |
|---|---|---|
| o4-mini (OpenAI) | 61.4% | — |
| GPT-4.1 (OpenAI) | 68.0% | — |
| GPT-4o (OpenAI) | 71.1% | — |
| **DrishtiTable (This Demo)** | **83.2%** | **+12.1 over GPT-4o** |

Upload any table image below to try it. First run takes ~60s to load the model, subsequent runs are faster.
"""

ARTICLE = """
---

### Run Locally (Fastest)

```python
from unsloth import FastVisionModel

model, tokenizer = FastVisionModel.from_pretrained(
    "Nalandadata/DrishtiTable-Qwen2.5-VL-7B",
    max_seq_length=4096,
    load_in_4bit=True,
)
FastVisionModel.for_inference(model)
```

### Resources

| Resource | Link |
|---|---|
| Fine-tuned Model | [Nalandadata/DrishtiTable-Qwen2.5-VL-7B](https://huggingface.co/Nalandadata/DrishtiTable-Qwen2.5-VL-7B) |
| Dataset (sample) | [Nalandadata/DrishtiTable](https://huggingface.co/datasets/Nalandadata/DrishtiTable) |
| Base Model | [Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) |

*Built by [Nalanda Data](https://huggingface.co/Nalandadata). DrishtiTable (Sanskrit: drishti = vision).*
"""

with gr.Blocks(
    title="DrishtiTable - Table Structure Recognition",
    theme=gr.themes.Soft(primary_hue="yellow", secondary_hue="gray"),
    css="""
    .gradio-container { max-width: 1200px !important; }
    footer { display: none !important; }
    """,
) as demo:
    gr.HTML(TITLE)
    gr.Markdown(DESCRIPTION)

    with gr.Row(equal_height=True):
        # Left column: image upload and the trigger button.
        with gr.Column(scale=1):
            input_image = gr.Image(
                type="pil",
                label="Upload Table Image",
                height=400,
                sources=["upload", "clipboard"],
            )
            submit_btn = gr.Button(
                "Recognize Table Structure",
                variant="primary",
                size="lg",
            )

        # Right column: the predicted HTML source.
        with gr.Column(scale=1):
            html_output = gr.Code(
                label="Predicted HTML",
                language="html",
                lines=18,
            )

    gr.Markdown("### Rendered Table Preview")
    rendered_output = gr.HTML(label="Rendered Table Preview")

    submit_btn.click(
        fn=predict,
        inputs=[input_image],
        outputs=[html_output, rendered_output],
    )

    gr.Markdown(ARTICLE)

if __name__ == "__main__":
    demo.launch()