""" DrishtiTable: Table Structure Recognition Demo Upload a table image -> get HTML structure back. Runs on HuggingFace Spaces with ZeroGPU. """ import gradio as gr import torch import spaces import os from PIL import Image SYSTEM_PROMPT = """You are a table structure recognition expert. Given an image of a table, output the HTML representation of the table structure and content. Rules: - Use
| , | tags
- Use colspan and rowspan attributes for merged cells
- Use for bold text and for subscripts
- Output ONLY the HTML table, nothing else
- Do NOT include any attributes like style, class, or id"""
@spaces.GPU(duration=300)
def predict(image: Image.Image) -> tuple[str, str]:
"""Run DrishtiTable on an uploaded table image."""
if image is None:
return "Please upload a table image.", " No image uploaded. " from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor from peft import PeftModel from qwen_vl_utils import process_vision_info # Load base model + LoRA adapter inside GPU context print("Loading base model...") base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained( "Qwen/Qwen2.5-VL-7B-Instruct", torch_dtype=torch.float16, device_map="auto", low_cpu_mem_usage=True, ) print("Loading LoRA adapter...") model = PeftModel.from_pretrained( base_model, "Nalandadata/DrishtiTable-Qwen2.5-VL-7B", ) model.eval() processor = AutoProcessor.from_pretrained( "Qwen/Qwen2.5-VL-7B-Instruct", ) print("Model loaded! Running inference...") image = image.convert("RGB") # Resize large images to reduce memory max_dim = 1024 w, h = image.size if max(w, h) > max_dim: scale = max_dim / max(w, h) image = image.resize((int(w * scale), int(h * scale)), Image.LANCZOS) messages = [ {"role": "system", "content": SYSTEM_PROMPT}, { "role": "user", "content": [ {"type": "image", "image": image}, {"type": "text", "text": "Convert this table image to HTML. Output only the HTML table structure with cell content."}, ], }, ] text = processor.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) image_inputs, video_inputs = process_vision_info(messages) inputs = processor( text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt", ).to(model.device) with torch.no_grad(): output_ids = model.generate( **inputs, max_new_tokens=2048, do_sample=False, ) generated_ids = [ out[len(inp):] for inp, out in zip(inputs.input_ids, output_ids) ] html = processor.batch_decode( generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False )[0].strip() # Cleanup del model, base_model, processor, inputs, output_ids torch.cuda.empty_cache() # Strip markdown code fences if html.startswith("```html"): html = html[7:] if html.startswith("```"): html = html[3:] if html.endswith("```"): html = html[:-3] html = html.strip() # Styled preview with KaTeX for LaTeX math rendering preview_html = f"""{html}
DrishtiTableTable Structure RecognitionUpload a table image, get HTML structure back. Powered by the fine-tuned DrishtiTable model. |
|---|