Spaces:
Running on Zero
Running on Zero
| """ | |
| DrishtiTable: Table Structure Recognition Demo | |
| Upload a table image -> get HTML structure back. | |
| Runs on HuggingFace Spaces with ZeroGPU. | |
| """ | |
| import gradio as gr | |
| import torch | |
| import spaces | |
| import os | |
| from PIL import Image | |
# System message sent to the model with every request (see the ``messages``
# list built for inference). The text itself is runtime data: the model reads
# it verbatim.
# NOTE(review): presumably this mirrors the prompt used during fine-tuning --
# confirm before changing any wording.
SYSTEM_PROMPT = """You are a table structure recognition expert. Given an image of a table, output the HTML representation of the table structure and content.
Rules:
- Use <table>, <thead>, <tbody>, <tr>, <th>, <td> tags
- Use colspan and rowspan attributes for merged cells
- Use <b> for bold text and <sub> for subscripts
- Output ONLY the HTML table, nothing else
- Do NOT include any attributes like style, class, or id"""
# Hub identifiers: the base checkpoint and the LoRA adapter applied on top of it.
_BASE_MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
_ADAPTER_ID = "Nalandadata/DrishtiTable-Qwen2.5-VL-7B"

# Longest image side (in pixels) passed to the model; larger uploads are
# downscaled to reduce memory use.
_MAX_IMAGE_DIM = 1024

_USER_INSTRUCTION = (
    "Convert this table image to HTML. "
    "Output only the HTML table structure with cell content."
)


def _shrink_to_fit(image: Image.Image, max_dim: int = _MAX_IMAGE_DIM) -> Image.Image:
    """Return *image* scaled down so its longest side is at most *max_dim*."""
    width, height = image.size
    longest = max(width, height)
    if longest <= max_dim:
        return image
    scale = max_dim / longest
    # int() truncates, so an extremely elongated image could end up with a
    # zero-sized side, which resize() rejects; clamp each side to 1 px.
    new_size = (max(1, int(width * scale)), max(1, int(height * scale)))
    return image.resize(new_size, Image.LANCZOS)


def _strip_code_fences(text: str) -> str:
    """Remove a leading ```html / ``` fence and a trailing ``` fence from *text*."""
    unfenced = (
        text.removeprefix("```html")
        .removeprefix("```")
        .removesuffix("```")
    )
    return unfenced.strip()


def _render_preview(table_html: str) -> str:
    """Wrap *table_html* in the styled preview card shown below the code output.

    SECURITY NOTE: *table_html* is model output derived from a user-supplied
    image and is embedded unescaped on purpose (it has to render as a table).
    Escaping it would defeat the preview; if stricter isolation is needed,
    sanitize against a tag allow-list before calling this function.

    NOTE(review): depending on how ``gr.HTML`` inserts markup into the page,
    the ``<script>`` tags below may not execute -- confirm that KaTeX actually
    loads in the deployed Gradio version.
    """
    # Only unambiguous math delimiters are registered. Bare "(" ... ")" and
    # "[" ... "]" are deliberately NOT delimiters: they would make KaTeX
    # typeset ordinary parenthesised or bracketed cell text as math.
    return f"""
<!-- KaTeX CSS + JS for LaTeX rendering -->
<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.css">
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/katex.min.js"></script>
<script defer src="https://cdn.jsdelivr.net/npm/katex@0.16.11/dist/contrib/auto-render.min.js"
onload="renderMathInElement(document.querySelector('.dt-preview'), {{
delimiters: [
{{left: '$$', right: '$$', display: true}},
{{left: '$', right: '$', display: false}},
{{left: '\\\\(', right: '\\\\)', display: false}},
{{left: '\\\\[', right: '\\\\]', display: true}}
],
throwOnError: false
}});"></script>
<div style="
background-color: #ffffff;
border-radius: 12px;
padding: 20px;
margin: 10px 0;
box-shadow: 0 2px 12px rgba(0,0,0,0.15);
overflow-x: auto;
">
<style>
.dt-preview table {{
border-collapse: collapse;
width: 100%;
font-family: 'Segoe UI', -apple-system, system-ui, sans-serif;
font-size: 14px;
line-height: 1.5;
background-color: #ffffff;
}}
.dt-preview th, .dt-preview td {{
border: 1px solid #d0d0d0;
padding: 10px 14px;
text-align: left;
color: #1a1a1a;
vertical-align: top;
}}
.dt-preview thead th {{
background-color: #f0a500;
color: #ffffff;
font-weight: 700;
font-size: 13px;
letter-spacing: 0.3px;
}}
.dt-preview tbody tr:nth-child(even) {{
background-color: #fafafa;
}}
.dt-preview tbody tr:nth-child(odd) {{
background-color: #ffffff;
}}
.dt-preview tbody tr:hover {{
background-color: #fff8e1;
}}
.dt-preview td {{
color: #333333;
}}
.dt-preview b {{
color: #1a1a1a;
font-weight: 700;
}}
.dt-preview sub {{
font-size: 0.75em;
vertical-align: sub;
}}
.dt-preview .katex {{
font-size: 1.1em;
}}
</style>
<div class="dt-preview">{table_html}</div>
</div>
"""


# ZeroGPU only attaches a GPU while a function decorated with ``spaces.GPU``
# is running, so all model work lives in this helper.
# NOTE(review): 120 s is an estimate covering checkpoint load plus generation
# of up to 2048 tokens -- tune against real runs.
@spaces.GPU(duration=120)
def _run_model(image: Image.Image) -> str:
    """Load the base model and LoRA adapter, and return the decoded output.

    The model, processor and tensors are created per call and their
    references are dropped in a ``finally`` block, so the CUDA cache is
    emptied even when loading or generation raises.

    Args:
        image: RGB image, already resized, to run recognition on.

    Returns:
        The generated text with special tokens removed and surrounding
        whitespace stripped.
    """
    # Deferred so these packages are imported only when a request needs them.
    from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
    from peft import PeftModel
    from qwen_vl_utils import process_vision_info

    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image},
                {"type": "text", "text": _USER_INSTRUCTION},
            ],
        },
    ]

    # Pre-bind every name released in ``finally`` so ``del`` is valid no
    # matter which step fails.
    base_model = model = processor = None
    inputs = output_ids = generated_ids = None
    try:
        print("Loading base model...")
        base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
            _BASE_MODEL_ID,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True,
        )
        print("Loading LoRA adapter...")
        model = PeftModel.from_pretrained(base_model, _ADAPTER_ID)
        model.eval()
        processor = AutoProcessor.from_pretrained(_BASE_MODEL_ID)
        print("Model loaded! Running inference...")

        text = processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        image_inputs, video_inputs = process_vision_info(messages)
        inputs = processor(
            text=[text],
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        ).to(model.device)

        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_new_tokens=2048,
                do_sample=False,
            )

        # generate() returns prompt + completion; keep only the completion.
        generated_ids = [
            out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids)
        ]
        decoded = processor.batch_decode(
            generated_ids,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )
        return decoded[0].strip()
    finally:
        # ``generated_ids`` holds slices of ``output_ids``, so it is released
        # together with the other tensor references.
        del model, base_model, processor, inputs, output_ids, generated_ids
        torch.cuda.empty_cache()


def predict(image: Image.Image) -> tuple[str, str]:
    """Run DrishtiTable on an uploaded table image.

    Args:
        image: The uploaded image, or ``None`` when nothing was uploaded.

    Returns:
        A ``(html, preview_html)`` pair: the predicted table HTML with any
        Markdown code fence removed, and that HTML wrapped in the styled
        preview markup. When *image* is ``None`` both elements are short
        placeholder messages and the model is not loaded.
    """
    if image is None:
        return "Please upload a table image.", "<p>No image uploaded.</p>"

    prepared = _shrink_to_fit(image.convert("RGB"))
    html = _strip_code_fences(_run_model(prepared))
    return html, _render_preview(html)
# =====================
# Gradio Interface
# =====================

# Raw HTML banner rendered at the top of the page via gr.HTML.
TITLE = """
<div style="text-align: center; margin-bottom: 10px;">
<h1>DrishtiTable</h1>
<h3>Table Structure Recognition</h3>
<p><em>Upload a table image, get HTML structure back. Powered by the fine-tuned DrishtiTable model.</em></p>
</div>
"""
# Markdown intro rendered under the title via gr.Markdown.
# The blank lines around the table are significant: without them the Markdown
# renderer treats the closing sentence as one more table row.
DESCRIPTION = """
**DrishtiTable** is a fine-tuned [Qwen2.5-VL-7B](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) model
that converts table images into structured HTML. Trained on 1,141 Indian academic textbook tables,
it achieves **83.2% TEDS** — outperforming GPT-4o (71.1%) by +12.1 points.

| Model | TEDS Score | Improvement |
|---|---|---|
| o4-mini (OpenAI) | 61.4% | — |
| GPT-4.1 (OpenAI) | 68.0% | — |
| GPT-4o (OpenAI) | 71.1% | — |
| **DrishtiTable (This Demo)** | **83.2%** | **+12.1 over GPT-4o** |

Upload any table image below to try it. First run takes ~60s to load the model, subsequent runs are faster.
"""
# Markdown footer rendered at the bottom of the page via gr.Markdown.
# Blank lines separate the rule, headings, code fence and table so each is
# parsed as its own block; in particular the closing credit line must not
# directly follow the table or it becomes a table row.
ARTICLE = """
---

### Run Locally (Fastest)

```python
from unsloth import FastVisionModel

model, tokenizer = FastVisionModel.from_pretrained(
    "Nalandadata/DrishtiTable-Qwen2.5-VL-7B",
    max_seq_length=4096, load_in_4bit=True,
)
FastVisionModel.for_inference(model)
```

### Resources

| Resource | Link |
|---|---|
| Fine-tuned Model | [Nalandadata/DrishtiTable-Qwen2.5-VL-7B](https://huggingface.co/Nalandadata/DrishtiTable-Qwen2.5-VL-7B) |
| Dataset (sample) | [Nalandadata/DrishtiTable](https://huggingface.co/datasets/Nalandadata/DrishtiTable) |
| Base Model | [Qwen/Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct) |

*Built by [Nalanda Data](https://huggingface.co/Nalandadata). DrishtiTable (Sanskrit: drishti = vision).*
"""
# Page layout: banner and intro, a two-column row (upload + button on the
# left, predicted HTML source on the right), the rendered preview, then the
# footer notes. Components and the click handler must be created inside the
# ``with gr.Blocks`` context to be attached to ``demo``.
with gr.Blocks(
    title="DrishtiTable - Table Structure Recognition",
    theme=gr.themes.Soft(primary_hue="yellow", secondary_hue="gray"),
    css="""
    .gradio-container { max-width: 1200px !important; }
    footer { display: none !important; }
    """,
) as demo:
    gr.HTML(TITLE)
    gr.Markdown(DESCRIPTION)

    with gr.Row(equal_height=True):
        with gr.Column(scale=1):
            input_image = gr.Image(
                type="pil",
                label="Upload Table Image",
                height=400,
                sources=["upload", "clipboard"],
            )
            submit_btn = gr.Button(
                "Recognize Table Structure",
                variant="primary",
                size="lg",
            )
        with gr.Column(scale=1):
            html_output = gr.Code(
                label="Predicted HTML",
                language="html",
                lines=18,
            )

    # NOTE(review): the original nesting of the preview widgets was not
    # recoverable; they are placed full-width below the row here. Move them
    # into the second column if that was the intended layout.
    gr.Markdown("### Rendered Table Preview")
    rendered_output = gr.HTML(label="Rendered Table Preview")

    # predict returns (html, preview_html), matching the two outputs in order.
    submit_btn.click(
        fn=predict,
        inputs=[input_image],
        outputs=[html_output, rendered_output],
    )

    gr.Markdown(ARTICLE)


if __name__ == "__main__":
    demo.launch()