monurcan committed on
Commit
4aff560
·
1 Parent(s): baf007d
Files changed (5) hide show
  1. .gitattributes +1 -0
  2. .gitignore +2 -0
  3. README.md +8 -10
  4. app.py +111 -58
  5. requirements.txt +6 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ example_images/้˜ฟ่ƒฝ_129888755.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ /example_images/*
2
+ /env/*
README.md CHANGED
@@ -1,16 +1,14 @@
1
  ---
2
- title: Efficient Test Time Scaling
3
- emoji: ๐Ÿ’ฌ
4
- colorFrom: yellow
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 5.42.0
8
  app_file: app.py
9
  pinned: false
10
- hf_oauth: true
11
- hf_oauth_scopes:
12
- - inference-api
13
- short_description: Efficient Test-Time Scaling for Small Vision-Language Models
14
  ---
15
 
16
- An example chatbot using [Gradio](https://gradio.app), [`huggingface_hub`](https://huggingface.co/docs/huggingface_hub/v0.22.2/en/index), and the [Hugging Face Inference API](https://huggingface.co/docs/api-inference/index).
 
1
  ---
2
+ title: Smolvlm2 500M Illustration Description
3
+ emoji: ๐Ÿ“Š
4
+ colorFrom: red
5
+ colorTo: yellow
6
  sdk: gradio
7
+ sdk_version: 5.33.0
8
  app_file: app.py
9
  pinned: false
10
+ license: apache-2.0
11
+ short_description: Illustration Description
 
 
12
  ---
13
 
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,70 +1,123 @@
1
  import gradio as gr
2
- from huggingface_hub import InferenceClient
3
-
4
-
5
- def respond(
6
- message,
7
- history: list[dict[str, str]],
8
- system_message,
9
- max_tokens,
10
- temperature,
11
- top_p,
12
- hf_token: gr.OAuthToken,
13
- ):
 
 
 
 
 
14
  """
15
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
 
 
 
 
 
 
 
 
 
 
 
 
16
  """
17
- client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
18
 
19
- messages = [{"role": "system", "content": system_message}]
20
 
21
- messages.extend(history)
 
 
 
 
 
 
 
 
 
22
 
23
- messages.append({"role": "user", "content": message})
 
 
24
 
25
- response = ""
 
 
 
 
 
26
 
27
- for message in client.chat_completion(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  messages,
29
- max_tokens=max_tokens,
30
- stream=True,
31
- temperature=temperature,
32
- top_p=top_p,
33
- ):
34
- choices = message.choices
35
- token = ""
36
- if len(choices) and choices[0].delta.content:
37
- token = choices[0].delta.content
38
-
39
- response += token
40
- yield response
41
-
42
-
43
- """
44
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
45
- """
46
- chatbot = gr.ChatInterface(
47
- respond,
48
- type="messages",
49
- additional_inputs=[
50
- gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
51
- gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
52
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
53
- gr.Slider(
54
- minimum=0.1,
55
- maximum=1.0,
56
- value=0.95,
57
- step=0.05,
58
- label="Top-p (nucleus sampling)",
59
- ),
60
- ],
61
- )
62
 
63
- with gr.Blocks() as demo:
64
- with gr.Sidebar():
65
- gr.LoginButton()
66
- chatbot.render()
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
 
69
- if __name__ == "__main__":
70
- demo.launch()
 
1
  import gradio as gr
2
+ import torch
3
+ from transformers import (
4
+ AutoModelForImageTextToText,
5
+ AutoProcessor,
6
+ TextIteratorStreamer,
7
+ )
8
+ from peft import PeftModel
9
+ from transformers.image_utils import load_image
10
+ from threading import Thread
11
+ import time
12
+ import html
13
+
14
+
15
def progress_bar_html(label: str) -> str:
    """
    Returns an HTML snippet for a thin progress bar with a label.

    The progress bar is styled as a dark animated bar: a purple track
    (110px wide, 5px tall) whose indigo fill slides left-to-right forever
    via the CSS ``@keyframes loading`` animation defined inline.
    """
    # Doubled braces ({{ }}) are f-string escapes — they render as single
    # braces in the emitted CSS rule. Only {label} is interpolated.
    return f"""
    <div style="display: flex; align-items: center;">
        <span style="margin-right: 10px; font-size: 14px;">{label}</span>
        <div style="width: 110px; height: 5px; background-color: #9370DB; border-radius: 2px; overflow: hidden;">
            <div style="width: 100%; height: 100%; background-color: #4B0082; animation: loading 1.5s linear infinite;"></div>
        </div>
    </div>
    <style>
    @keyframes loading {{
        0% {{ transform: translateX(-100%); }}
        100% {{ transform: translateX(100%); }}
    }}
    </style>
    """
 
34
 
 
35
 
36
# Checkpoint used for illustration description.
# NOTE(review): the Space title/description say "500M" but this id is the
# 256M variant — confirm which checkpoint is intended.
model_name = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"

# Load the vision-language model in bfloat16; device_map="auto" lets
# accelerate place it on whatever device is available (CPU on this Space).
# eval() disables dropout/training-only behavior for inference.
model = AutoModelForImageTextToText.from_pretrained(
    model_name, dtype=torch.bfloat16, device_map="auto"
).eval()

# Companion processor: chat templating, image preprocessing and tokenization.
processor = AutoProcessor.from_pretrained(model_name)

print(f"Successfully load the model: {model}")
45
+
46
 
47
def model_inference(input_dict, history):
    """
    Gradio ChatInterface handler: stream a model reply for the latest turn.

    Parameters
    ----------
    input_dict : dict
        MultimodalTextbox payload with keys ``"text"`` (str) and
        ``"files"`` (list of image paths/URLs).
    history : list
        Prior conversation turns. Intentionally unused — the interface
        description states the model only sees the last input.

    Yields
    ------
    str
        First a progress-bar HTML snippet, then the growing,
        HTML-escaped response text.

    Raises
    ------
    gr.Error
        If the text query is empty.
    """
    text = input_dict["text"]
    files = input_dict["files"]

    # Load every attached image; an empty `files` list yields no images.
    # (The original had separate >1 / ==1 / else branches that all reduce
    # to this one comprehension.)
    images = [load_image(path) for path in files]

    # Validate up front. Bug fix: the original merely *constructed*
    # gr.Error without raising it, so invalid input silently produced no
    # response — the exception must be raised for Gradio to surface it.
    if text == "" and not images:
        raise gr.Error("Please input a query and optionally image(s).")
    if text == "" and images:
        raise gr.Error("Please input a text query along with the image(s).")

    # Single-turn prompt: all images first, then the text query.
    messages = [
        {
            "role": "user",
            "content": [
                *[{"type": "image", "image": image} for image in images],
                {"type": "text", "text": text},
            ],
        }
    ]
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device, dtype=model.dtype)

    # Run generation on a background thread and stream tokens out of it,
    # so we can yield partial text to the UI as it is produced.
    streamer = TextIteratorStreamer(
        processor, skip_prompt=True, skip_special_tokens=True
    )
    generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    yield progress_bar_html("Processing...")
    for new_text in streamer:
        # Escape model output so raw markup is never rendered in the chat.
        buffer += html.escape(new_text)
        # Tiny pause keeps the UI update cadence smooth.
        time.sleep(0.001)
        yield buffer
    # Streamer exhaustion means generation is done; join for clean shutdown.
    thread.join()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
 
 
 
 
96
 
97
# Example prompts shown under the chat box; both reuse the same sample image.
_EXAMPLE_IMAGE = "example_images/example.png"
_EXAMPLE_PROMPTS = (
    "Write a descriptive caption for this image in a formal tone.",
    "What are the characters wearing?",
)
examples = [
    [{"text": prompt, "files": [_EXAMPLE_IMAGE]}] for prompt in _EXAMPLE_PROMPTS
]

# Multimodal chat UI wired to the streaming handler above.
# NOTE(review): the heading says "500M" while model_name loads the 256M
# checkpoint — confirm which is intended before changing user-facing text.
demo = gr.ChatInterface(
    fn=model_inference,
    description="# **Smolvlm2-500M-illustration-description** \n (running on CPU) The model only sees the last input, it ignores the previous conversation history.",
    examples=examples,
    fill_height=True,
    textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image"]),
    stop_btn="Stop Generation",
    multimodal=True,
    cache_examples=False,
)

demo.launch(debug=True)
 
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ peft
4
+ torch
5
+ num2words
6
+ torchvision