Sereinia committed on
Commit
746e79c
Β·
verified Β·
1 Parent(s): 8d02bd7

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -0
app.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from PIL import Image
import gradio as gr
from transformers import AutoModelForCausalLM, LlamaTokenizer

# CogVLM grounding-generalist checkpoint; it uses Vicuna-7B as its language
# backbone, which is why the tokenizer is loaded from the Vicuna repo below.
MODEL_ID = "zai-org/cogvlm-grounding-generalist-hf"

# Pick device/dtype up front instead of hard-coding "cuda": bfloat16 on GPU,
# float32 on CPU (bfloat16 kernels are poorly supported on many CPUs). This
# lets the app start on CPU-only hosts instead of crashing at import time.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.bfloat16 if DEVICE == "cuda" else torch.float32

tokenizer = LlamaTokenizer.from_pretrained("lmsys/vicuna-7b-v1.5")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=DTYPE,
    low_cpu_mem_usage=True,
    trust_remote_code=True,  # CogVLM ships custom modeling code in the repo
).to(DEVICE).eval()
15
+
16
def describe_and_count(image):
    """Ask CogVLM to count the people in *image* and ground each one.

    Parameters
    ----------
    image : PIL.Image.Image
        The image uploaded through the Gradio UI.

    Returns
    -------
    str
        The raw model response, expected to contain a person count plus
        ``[[x0,y0,x1,y1]]`` coordinates for each detected person.
    """
    # NOTE(review): the grounding checkpoint is prompted for [[x0,y0,x1,y1]]
    # boxes; keep this query text exactly as-is.
    query = "Count the number of people visible in this image and provide coordinates [[x0,y0,x1,y1]] for each detected person."

    # Derive device/dtype from the loaded model rather than hard-coding
    # "cuda"/bfloat16, so the handler also works on CPU-only deployments.
    param = next(model.parameters())
    device, dtype = param.device, param.dtype

    inputs = model.build_conversation_input_ids(tokenizer, query=query, images=[image])
    inputs = {
        "input_ids": inputs["input_ids"].unsqueeze(0).to(device),
        "token_type_ids": inputs["token_type_ids"].unsqueeze(0).to(device),
        "attention_mask": inputs["attention_mask"].unsqueeze(0).to(device),
        # CogVLM expects a nested list of image tensors (one list per batch item).
        "images": [[inputs["images"][0].to(device).to(dtype)]],
    }

    gen_kwargs = {"max_length": 2048, "do_sample": False}
    with torch.no_grad():
        outputs = model.generate(**inputs, **gen_kwargs)
    # Drop the prompt tokens so only the newly generated answer is decoded.
    outputs = outputs[:, inputs["input_ids"].shape[1]:]
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
32
+
33
# Minimal Gradio front end: one PIL image in, the model's text response out.
demo = gr.Interface(
    describe_and_count,
    gr.Image(type="pil"),
    "text",
    title="CogVLM-Grounding for Crowd Counting",
    description="Upload an image to estimate and localize people using CogVLM-Grounding-Generalist.",
)

demo.launch()