jxu124
/

TiO

@@ -9,9 +9,13 @@ language:
 TiO is an Interactive Visual Grounding Model for Disambiguation.  (WIP)
-## Online / offline Demo
 ```python
 from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
 model_id = "jxu124/TiO"
@@ -25,25 +29,32 @@ model = AutoModel.from_pretrained(
 )
 tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
 image_processor = AutoImageProcessor.from_pretrained(model_id)
-# setup gradio demo
-model.get_gradio_demo(tokenizer, image_processor).\
-    queue(max_size=20).launch(server_name="0.0.0.0", server_port=7860)
 ```
 ## Mini-Example
 ```python
 from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
 from PIL import Image
 from io import BytesIO
 import torch
 import requests
-# Load model, tokenizer, image_processor
-tokenizer = AutoTokenizer.from_pretrained("jxu124/TiO", use_fast=False)
-image_processor = AutoImageProcessor.from_pretrained("jxu124/TiO")
-model = AutoModel.from_pretrained("jxu124/TiO", trust_remote_code=True)
-model = model.to(torch.float16).cuda()  # It would be faster.
 # Prepare example
 image = Image.open(BytesIO(requests.get("http://images.cocodataset.org/val2014/COCO_val2014_000000429913.jpg").content))
 text = """\
@@ -64,25 +75,28 @@ print(tokenizer.batch_decode(gen, skip_special_tokens=True).replace("not yet.",
 Guesser (grounding):
 ```python
-text = """ #instruction: which region does the context describe? \n #context: \"\
 human: look that man in white!
 agent: is he the one who just threw the ball?
-human: yes. I mean the pitcher.\"
-"""
 ```
 Questioner (question generation):
 ```python
-text = """ #instruction: guess what I want? \n #context: \"\
-human: look that man in white! \"
-"""
 ```
 Oracle (answering):
 ```python
-text = """ #instruction: answer the question based on the region. \n #context: \"\
 agent: look that man in white!
-human: is he the one who just threw the ball? \"
-#region: <bin_847> <bin_319> <bin_923> <bin_467>
-"""
 ```

 TiO is an Interactive Visual Grounding Model for Disambiguation.  (WIP)
+## Online / Offline Demo
+- [Colab Online Demo](https://colab.research.google.com/drive/195eDITKi6dahnVz8Cum91sNUCF_lFle8?usp=sharing) - runs on the free T4 GPU available on Google Colab.
+- Gradio Offline Demo:
 ```python
+import os; os.system("pip3 install transformers accelerate bitsandbytes gradio fire")
 from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
 model_id = "jxu124/TiO"
 )
 tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
 image_processor = AutoImageProcessor.from_pretrained(model_id)
+# ---- setup gradio demo ----
+model.get_gradio_demo(tokenizer, image_processor).queue(max_size=20).launch(server_name="0.0.0.0", server_port=7860)
 ```
 ## Mini-Example
 ```python
+import os; os.system("pip3 install transformers accelerate bitsandbytes gradio fire")
 from transformers import AutoModel, AutoTokenizer, AutoImageProcessor
+model_id = "jxu124/TiO"
+model = AutoModel.from_pretrained(
+    model_id,
+    trust_remote_code=True,
+    torch_dtype=torch.float16,
+    device_map='cuda'
+)
+tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=False)
+image_processor = AutoImageProcessor.from_pretrained(model_id)
+# ---- mini example ----
 from PIL import Image
 from io import BytesIO
 import torch
 import requests
 # Prepare example
 image = Image.open(BytesIO(requests.get("http://images.cocodataset.org/val2014/COCO_val2014_000000429913.jpg").content))
 text = """\
 Guesser (grounding):
 ```python
+text = """\
+#instruction: which region does the context describe?
+#context:
 human: look that man in white!
 agent: is he the one who just threw the ball?
+human: yes. I mean the pitcher."""
 ```
 Questioner (question generation):
 ```python
+text = """\
+#instruction: guess what I want?
+#context:
+human: look that man in white!"""
 ```
 Oracle (answering):
 ```python
+text = """\
+#instruction: answer the question based on the region.
+#context:
 agent: look that man in white!
+human: is he the one who just threw the ball?
+#region: <bin_847> <bin_319> <bin_923> <bin_467>"""
 ```