# NOTE: `spaces` stays the first import, as in the original file.
# NOTE(review): on ZeroGPU Spaces it is expected to be imported before
# anything initializes CUDA -- confirm before reordering these imports.
import spaces
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
import gradio as gr

device = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_PATH = "zai-org/GLM-OCR"

# Load the processor and the model once at import time so that every request
# reuses the same objects.
processor = AutoProcessor.from_pretrained(MODEL_PATH)
# NOTE(review): `device_map="auto"` presumably already places the weights, so
# the trailing `.to(device)` looks redundant -- confirm before removing it.
model = AutoModelForImageTextToText.from_pretrained(
    pretrained_model_name_or_path=MODEL_PATH,
    torch_dtype="auto",
    device_map="auto",
).to(device)


@spaces.GPU
def read_img(img) -> str:
    """
    Takes in an image file and returns the text recognized from the image.

    Args:
        img: the input image file (the UI passes a file path)

    Returns:
        output_text: a string of the text recognized from the image

    Raises:
        gr.Error: if no image was provided.
    """
    # Submitting with an empty image component passes None; fail early with a
    # readable message instead of an error from inside the processor.
    if not img:
        raise gr.Error("Please provide an image before submitting.")

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "url": img},
                {"type": "text", "text": "Text Recognition:"},
            ],
        }
    ]

    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device)
    # Drop `token_type_ids` (if present) before forwarding the inputs to
    # `generate`.
    inputs.pop("token_type_ids", None)

    generated_ids = model.generate(**inputs, max_new_tokens=8192)

    # Decode only the tokens that follow the prompt. Special tokens are kept
    # in the decoded string (skip_special_tokens=False).
    prompt_length = inputs["input_ids"].shape[1]
    output_text = processor.decode(
        generated_ids[0][prompt_length:],
        skip_special_tokens=False,
    )
    return output_text


with gr.Blocks() as imgsmiles:
    top = gr.Markdown(
        """
        # OCR with ZAI GLM
        """)

    # This radio is displayed but is not connected to any event handler.
    agent_flag_choice = gr.Radio(
        choices=['True', 'False'],
        label="Are you an Agent?",
        interactive=True,
        value='False',
        scale=2,
    )

    with gr.Row():
        inputs = gr.Image(type="filepath")
        text_out = gr.Textbox(lines=2, label="Text Output")

    submit_button = gr.Button("Submit")
    clear_button = gr.ClearButton([inputs, text_out], value="Clear")

    # TODO: an agent-only button (calling an `agent_read_img` handler with
    # `agent_flag_choice` and the image) was sketched here but the handler
    # does not exist yet.
    submit_button.click(read_img, [inputs], [text_out])

imgsmiles.launch(mcp_server=True)