cafierom committed on
Commit
0bcd9ba
·
verified ·
1 Parent(s): 1c1c78f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -68
app.py CHANGED
@@ -1,69 +1,70 @@
1
- from transformers import AutoProcessor, AutoModelForImageTextToText
2
- import torch
3
- import gradio as gr
4
- import spaces
5
-
6
- MODEL_PATH = "zai-org/GLM-OCR"
7
- processor = AutoProcessor.from_pretrained(MODEL_PATH)
8
- model = AutoModelForImageTextToText.from_pretrained(
9
- pretrained_model_name_or_path=MODEL_PATH,
10
- torch_dtype="auto",
11
- device_map="auto",
12
- )
13
-
14
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15
- model.to(device)
16
-
17
- @spaces.GPU
18
- def read_img(img):
19
- '''
20
- Takes in an image file and returns the text recognized from the image.
21
- Args:
22
- img: the input image file
23
- Returns:
24
- output_text: a string of the text recognized from the image
25
- '''
26
- messages = [
27
- {
28
- "role": "user",
29
- "content": [
30
- {"type": "image",
31
- "url": img},
32
- {"type": "text",
33
- "text": "Text Recognition:"}],
34
- }
35
- ]
36
-
37
- inputs = processor.apply_chat_template(
38
- messages,
39
- tokenize=True,
40
- add_generation_prompt=True,
41
- return_dict=True,
42
- return_tensors="pt"
43
- ).to(model.device)
44
-
45
- inputs.pop("token_type_ids", None)
46
- generated_ids = model.generate(**inputs, max_new_tokens=8192)
47
- output_text = processor.decode(generated_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=False)
48
-
49
- return output_text
50
-
51
- with gr.Blocks() as imgsmiles:
52
- top = gr.Markdown(
53
- """
54
- # OCR with ZAI GLM
55
- """)
56
-
57
- agent_flag_choice = gr.Radio(choices = ['True', 'False'],label="Are you an Agent?", interactive=True, value='False', scale = 2)
58
- with gr.Row():
59
- inputs=gr.Image(type="filepath")
60
- text_out = gr.Textbox(lines=2, label="Text Output")
61
-
62
- submit_button = gr.Button("Submit")
63
- clear_button = gr.ClearButton([inputs, text_out], value = "Clear")
64
- # agent_button = gr.Button("Agent use only")
65
-
66
- submit_button.click(read_img, [inputs], [text_out])
67
- # agent_button.click(agent_read_img, [agent_flag_choice, inputs], [text_out, None])
68
-
 
69
  imgsmiles.launch(mcp_server=True)
 
1
+ import spaces
2
+ from transformers import AutoProcessor, AutoModelForImageTextToText
3
+ import torch
4
+ import gradio as gr
5
+
6
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
7
+
8
+ MODEL_PATH = "zai-org/GLM-OCR"
9
+ processor = AutoProcessor.from_pretrained(MODEL_PATH)
10
+ model = AutoModelForImageTextToText.from_pretrained(
11
+ pretrained_model_name_or_path=MODEL_PATH,
12
+ torch_dtype="auto",
13
+ device_map="auto",
14
+ )
15
+
16
+
17
+
18
@spaces.GPU
def read_img(img):
    """Recognize the text in an image with the GLM-OCR model.

    Args:
        img: Path (or URL) of the input image; it is passed to the chat
            template under the ``"url"`` key. ``None`` when no image has
            been supplied.

    Returns:
        The decoded model output for the image (special tokens are kept),
        or an empty string when no image was provided.
    """
    # Nothing to recognize: do not hand a null image to the processor.
    if img is None:
        return ""

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "url": img},
                {"type": "text", "text": "Text Recognition:"},
            ],
        }
    ]

    # The processor returns CPU tensors; move them to wherever the model's
    # weights were placed so that generate() does not hit a device mismatch.
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)

    # Dropped before generation: not forwarded to generate().
    inputs.pop("token_type_ids", None)

    generated_ids = model.generate(**inputs, max_new_tokens=8192)

    # Decode only the newly generated tokens, skipping the echoed prompt.
    prompt_length = inputs["input_ids"].shape[1]
    output_text = processor.decode(
        generated_ids[0][prompt_length:],
        skip_special_tokens=False,
    )

    return output_text
51
+
52
+ with gr.Blocks() as imgsmiles:
53
+ top = gr.Markdown(
54
+ """
55
+ # OCR with ZAI GLM
56
+ """)
57
+
58
+ agent_flag_choice = gr.Radio(choices = ['True', 'False'],label="Are you an Agent?", interactive=True, value='False', scale = 2)
59
+ with gr.Row():
60
+ inputs=gr.Image(type="filepath")
61
+ text_out = gr.Textbox(lines=2, label="Text Output")
62
+
63
+ submit_button = gr.Button("Submit")
64
+ clear_button = gr.ClearButton([inputs, text_out], value = "Clear")
65
+ # agent_button = gr.Button("Agent use only")
66
+
67
+ submit_button.click(read_img, [inputs], [text_out])
68
+ # agent_button.click(agent_read_img, [agent_flag_choice, inputs], [text_out, None])
69
+
70
  imgsmiles.launch(mcp_server=True)