import spaces
from transformers import AutoProcessor, AutoModelForImageTextToText
import torch
import gradio as gr

# Hugging Face Hub id of the checkpoint; shared by the processor and the model.
MODEL_PATH = "zai-org/GLM-OCR"

# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The processor builds the model inputs from chat messages and decodes
# generated token ids back into text.
processor = AutoProcessor.from_pretrained(MODEL_PATH)

# NOTE(review): device_map="auto" already asks the loader to place the weights,
# and the explicit .to(device) below moves the model again. This looks
# redundant -- confirm which of the two is actually required for the target
# runtime before removing either.
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_PATH,
    torch_dtype="auto",
    device_map="auto",
)
model = model.to(device)


@spaces.GPU
def read_img(img):
    """
    Takes in an image file and returns the text recognized from the image.

    Args:
        img: the input image file (a path or URL string that is handed to the processor)
    Returns:
        output_text: a string of the text recognized from the image
    Raises:
        gr.Error: if no image was provided.
    """
    # The UI wires an image component straight into this function; reject an
    # empty value up front rather than forwarding it to the processor.
    if not img:
        raise gr.Error("Please provide an image before submitting.")

    # Single-turn chat: the image followed by the fixed recognition prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "url": img},
                {"type": "text", "text": "Text Recognition:"},
            ],
        }
    ]

    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(device)

    # Drop token_type_ids (if the processor produced them) so they are not
    # passed on to generate().
    inputs.pop("token_type_ids", None)

    prompt_length = inputs["input_ids"].shape[1]
    generated_ids = model.generate(**inputs, max_new_tokens=8192)

    # generate() returns prompt + completion; decode only the newly generated
    # tokens. Special tokens are intentionally left in the decoded text.
    output_text = processor.decode(
        generated_ids[0][prompt_length:],
        skip_special_tokens=False,
    )

    return output_text

# Gradio front end: one image in, recognized text out.
with gr.Blocks() as imgsmiles:
    top = gr.Markdown("\n      # OCR with ZAI GLM\n      ")

    # NOTE(review): this radio is rendered but is not an input to any active
    # event handler below; only the commented-out agent wiring refers to it.
    agent_flag_choice = gr.Radio(
        choices=["True", "False"],
        label="Are you an Agent?",
        interactive=True,
        value="False",
        scale=2,
    )

    # Input image and recognized text, laid out in one row.
    with gr.Row():
        inputs = gr.Image(type="filepath")
        text_out = gr.Textbox(lines=2, label="Text Output")

    submit_button = gr.Button("Submit")
    clear_button = gr.ClearButton([inputs, text_out], value="Clear")
    # agent_button = gr.Button("Agent use only")

    # Submit runs OCR on the selected image and writes the result to the textbox.
    submit_button.click(fn=read_img, inputs=[inputs], outputs=[text_out])
    # agent_button.click(agent_read_img, [agent_flag_choice, inputs], [text_out, None])

# Start the app with Gradio's MCP server enabled.
imgsmiles.launch(mcp_server=True)