File size: 1,964 Bytes
795b5f3
533ddd5
a3af039
fd726c5
 
 
0bcd9ba
 
 
 
 
 
 
795b5f3
0bcd9ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
795b5f3
0bcd9ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1c1c78f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import spaces
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
import gradio as gr

# Hub identifier of the OCR checkpoint; the processor and the model are both
# loaded from it.
MODEL_PATH = "zai-org/GLM-OCR"

# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(MODEL_PATH)

# dtype and initial placement are left to transformers ("auto"); the model is
# then moved explicitly onto `device`.
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_PATH,
    torch_dtype="auto",
    device_map="auto",
)
model = model.to(device)



@spaces.GPU
def read_img(img):
  """
  Takes in an image file and returns the text recognized from the image.
  Raises a gr.Error if no image is provided.

  Args:
      img: the input image file (the UI supplies it as a file path)
  Returns:
      output_text: a string of the text recognized from the image
  """
  # Submit can be pressed before an image is uploaded; fail with a readable
  # message instead of an opaque error from the processor.
  if not img:
    raise gr.Error("Please provide an image before submitting.")

  # Single-turn chat: the image followed by the OCR instruction prompt.
  messages = [
      {
          "role": "user",
          "content": [
              {"type": "image", "url": img},
              {"type": "text", "text": "Text Recognition:"},
          ],
      }
  ]

  inputs = processor.apply_chat_template(
      messages,
      tokenize=True,
      add_generation_prompt=True,
      return_dict=True,
      return_tensors="pt",
  ).to(device)

  # Dropped before generation -- presumably because generate() does not
  # accept this key for this model; confirm against the model card.
  inputs.pop("token_type_ids", None)

  # The generated sequence begins with the prompt tokens; remember how many
  # there are so that only the newly generated part is decoded.
  prompt_len = inputs["input_ids"].shape[1]
  generated_ids = model.generate(**inputs, max_new_tokens=8192)
  output_text = processor.decode(
      generated_ids[0][prompt_len:],
      skip_special_tokens=False,
  )

  return output_text

# Page layout: a title, an image input beside a text output, and
# Submit / Clear buttons.
with gr.Blocks() as imgsmiles:
  top = gr.Markdown(
      """
      # OCR with ZAI GLM
      """
  )

  # NOTE: this radio is displayed but is not passed to any event handler
  # below; an agent-only button was sketched earlier and never wired up.
  agent_flag_choice = gr.Radio(
      choices=['True', 'False'],
      label="Are you an Agent?",
      interactive=True,
      value='False',
      scale=2,
  )

  with gr.Row():
    inputs = gr.Image(type="filepath")
    text_out = gr.Textbox(lines=2, label="Text Output")

  submit_button = gr.Button("Submit")
  # Resets both the uploaded image and the recognized text.
  clear_button = gr.ClearButton([inputs, text_out], value="Clear")

  # Run OCR on the uploaded image and show the result in the textbox.
  submit_button.click(fn=read_img, inputs=[inputs], outputs=[text_out])

# mcp_server=True asks Gradio to start its MCP server alongside the web UI.
imgsmiles.launch(mcp_server=True)