John Ho commited on
Commit
a31ba73
·
unverified ·
0 Parent(s):

Initial commit

Browse files
Files changed (7) hide show
  1. .github/workflows/deploy_to_hf_space.yaml +80 -0
  2. .gitignore +11 -0
  3. .python-version +1 -0
  4. README.md +29 -0
  5. app.py +211 -0
  6. pyproject.toml +13 -0
  7. uv.lock +0 -0
.github/workflows/deploy_to_hf_space.yaml ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
name: CI/CD to Hugging Face Space with uv

on:
  push:
    branches:
      - main # Or your default branch, e.g., 'master', 'dev'

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Check for HF_TOKEN availability
        id: check_hf_token
        env:
          HF_TOKEN_CHECK: ${{ secrets.HF_TOKEN }} # Pass the secret to an env var for shell check
        run: |
          if [ -z "$HF_TOKEN_CHECK" ]; then
            echo "::notice::HF_TOKEN secret is not set. Hugging Face Space push will be skipped."
            echo "push_enabled=false" >> $GITHUB_OUTPUT
          else
            echo "::notice::HF_TOKEN secret is set. Proceeding with Hugging Face Space push."
            echo "push_enabled=true" >> $GITHUB_OUTPUT
          fi

      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          lfs: true

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12" # Recommended: specify a precise version like '3.10', '3.11', or '3.12'

      - name: Install uv
        # Installs the uv tool on the GitHub Actions runner
        uses: astral-sh/setup-uv@v1

      - name: Check for pyproject.toml existence
        id: check_pyproject
        run: |
          if [ -f pyproject.toml ]; then
            echo "::notice::pyproject.toml found. Proceeding with uv pip compile."
            echo "pyproject_exists=true" >> $GITHUB_OUTPUT
          else
            echo "::notice::pyproject.toml not found. Skipping requirements.txt generation via uv pip compile."
            echo "pyproject_exists=false" >> $GITHUB_OUTPUT
          fi

      - name: Generate requirements.txt using uv
        id: generate_reqs
        # This step will only run if pyproject.toml was found in the previous step
        if: ${{ steps.check_pyproject.outputs.pyproject_exists == 'true' }}
        run: |
          # Use uv export to generate a locked requirements.txt from pyproject.toml/uv.lock
          # This ensures reproducibility.
          uv export --no-hashes --format requirements-txt > requirements.txt
          # uv pip compile pyproject.toml -o requirements.txt

          # Check if requirements.txt was created
          if [ -f requirements.txt ]; then
            echo "requirements.txt generated successfully:"
            cat requirements.txt
          else
            echo "Error: requirements.txt was not generated despite pyproject.toml existing."
            exit 1
          fi

      - name: Push to HuggingFace Space
        if: ${{ steps.check_hf_token.outputs.push_enabled == 'true' }}
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          # BUG FIX: the generated requirements.txt was never committed, so the
          # Space would be pushed WITHOUT it. Commit it here (no-op when unchanged).
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git add requirements.txt
          git diff --cached --quiet || git commit -m "chore: regenerate requirements.txt"
          # Force push is intentional (see README): the on-the-fly requirements.txt
          # commit diverges from the Space's history on every run.
          # Replace HF_USERNAME and SPACE_NAME with your own values.
          git push --force https://HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/HF_USERNAME/SPACE_NAME main
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+ .ropeproject
9
+
10
+ # Virtual environments
11
+ .venv
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.10
README.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Name for your Space App
3
+ emoji: 📚
4
+ colorFrom: blue
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 5.32.0
8
+ app_file: app.py
9
+ pinned: false
10
+ short_description: short description for your Space App
11
+ ---
12
+
13
+ # The HuggingFace Space Template
14
+ set up with a [GitHub Action to automatically update your space](https://huggingface.co/docs/hub/spaces-github-actions)
15
+ and manage dependencies with `uv`
16
+
17
+ You will need to update [`deploy_to_hf_space.yaml`](.github/workflows/deploy_to_hf_space.yaml) with the details for your space and
18
+ setup your `HF_TOKEN` in your [Github secret](https://docs.github.com/en/actions/security-guides/encrypted-secrets#creating-encrypted-secrets-for-an-environment)
19
+
20
+ > [!WARNING]
21
+ > The GitHub Action *force*-pushes changes to the HuggingFace Space.
22
+ > This is due to the creation of the requirements.txt that happens on the fly.
23
+ > This template assumes that you are the sole contributor to your space.
24
+
25
+ ## Resources
26
+
27
+ * [Gradio Course](https://huggingface.co/learn/llm-course/chapter9/2?fw=pt)
28
+ * [Gradio Doc](https://www.gradio.app/guides/quickstart)
29
+ * Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import time

import spaces
import torch
import gradio as gr
from transformers import (
    AutoModelForImageTextToText,
    AutoProcessor,
    BitsAndBytesConfig,
)

# Flash Attention for ZeroGPU
import subprocess

# BUG FIX: `env=` REPLACES the child process environment entirely; the original
# call dropped PATH (and everything else), which can make `pip` unresolvable.
# Merge the flag into a copy of the current environment instead.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

# Set target DEVICE and DTYPE.
# bf16 is preferred when the GPU supports it; otherwise fall back to fp16.
DTYPE = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)
# "auto" lets transformers/accelerate decide weight placement via device_map.
DEVICE = "auto"
print(f"Device: {DEVICE}, dtype: {DTYPE}")
26
+
27
+
28
+ def load_model(
29
+ model_name: str = "chancharikm/qwen2.5-vl-7b-cam-motion-preview",
30
+ use_flash_attention: bool = True,
31
+ apply_quantization: bool = True,
32
+ ):
33
+ bnb_config = BitsAndBytesConfig(
34
+ load_in_4bit=True, # Load model weights in 4-bit
35
+ bnb_4bit_quant_type="nf4", # Use NF4 quantization (or "fp4")
36
+ bnb_4bit_compute_dtype=DTYPE, # Perform computations in bfloat16/float16
37
+ bnb_4bit_use_double_quant=True, # Optional: further quantization for slightly more memory saving
38
+ )
39
+
40
+ # Determine model family from model name
41
+ model_family = model_name.split("/")[-1].split("-")[0]
42
+
43
+ # Common model loading arguments
44
+ common_args = {
45
+ "torch_dtype": DTYPE,
46
+ "device_map": DEVICE,
47
+ "low_cpu_mem_usage": True,
48
+ "quantization_config": bnb_config if apply_quantization else None,
49
+ }
50
+ if use_flash_attention:
51
+ common_args["attn_implementation"] = "flash_attention_2"
52
+
53
+ # Load model based on family
54
+ match model_family:
55
+ # case "qwen2.5" | "Qwen2.5":
56
+ # model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
57
+ # model_name, **common_args
58
+ # )
59
+ case "InternVL3":
60
+ model = AutoModelForImageTextToText.from_pretrained(
61
+ model_name, **common_args
62
+ )
63
+ case _:
64
+ raise ValueError(f"Unsupported model family: {model_family}")
65
+
66
+ # Set model to evaluation mode for inference (disables dropout, etc.)
67
+ return model.eval()
68
+
69
+
70
def load_processor(model_name="Qwen/Qwen2.5-VL-7B-Instruct"):
    """Fetch the processor (tokenizer + image/video preprocessing) for `model_name`.

    Uses the fast implementation and the module-level DEVICE/DTYPE settings.
    """
    processor_kwargs = {
        "device_map": DEVICE,
        "use_fast": True,
        "torch_dtype": DTYPE,
    }
    return AutoProcessor.from_pretrained(model_name, **processor_kwargs)
77
+
78
+
79
print("Loading Models and Processors...")

# One spec per UI model key: (HF repo id, use_flash_attention, apply_quantization).
# Only the largest model is quantized to fit in memory.
_MODEL_SPECS = {
    "qwen2.5-vl-7b-instruct": ("Qwen/Qwen2.5-VL-7B-Instruct", False, False),
    "InternVL3-1B-hf": ("OpenGVLab/InternVL3-1B-hf", False, False),
    "InternVL3-2B-hf": ("OpenGVLab/InternVL3-2B-hf", False, False),
    "InternVL3-8B-hf": ("OpenGVLab/InternVL3-8B-hf", False, True),
}

# Eagerly load every model and its processor at startup so inference calls
# never pay the download/initialization cost.
MODEL_ZOO = {
    key: load_model(
        model_name=repo_id,
        use_flash_attention=flash,
        apply_quantization=quantize,
    )
    for key, (repo_id, flash, quantize) in _MODEL_SPECS.items()
}

PROCESSORS = {
    key: load_processor(repo_id)
    for key, (repo_id, _flash, _quantize) in _MODEL_SPECS.items()
}

print("Models and Processors Loaded!")
110
+
111
+
112
# Our Inference Function
@spaces.GPU(duration=120)
def video_inference(
    video_path: str,
    prompt: str,
    model_name: str,
    fps: int = 8,
    max_tokens: int = 512,
    temperature: float = 0.1,
):
    """Answer `prompt` about the video at `video_path` with the chosen model.

    Args:
        video_path: local path of the uploaded video.
        prompt: user question / instruction.
        model_name: key into MODEL_ZOO / PROCESSORS.
        fps: frame sampling rate handed to the processor.
        max_tokens: generation budget (max new tokens).
        temperature: sampling temperature; 0.0 disables sampling (greedy).

    Returns:
        dict with "output_text", "fps", and "inference_time" (seconds).

    Raises:
        ValueError: when the model family has no inference path here.
    """
    started_at = time.time()
    model = MODEL_ZOO[model_name]
    processor = PROCESSORS[model_name]

    # Single-turn chat: one video plus one text prompt.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "video", "video": video_path},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    family = model_name.split("-")[0]
    with torch.no_grad():
        if family == "InternVL3":
            model_inputs = processor.apply_chat_template(
                conversation,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt",
                fps=fps,
                # num_frames = 8
            ).to("cuda", dtype=DTYPE)

            generated = model.generate(
                **model_inputs,
                max_new_tokens=max_tokens,
                temperature=float(temperature),
                do_sample=temperature > 0.0,
            )
            # Decode only the newly generated tokens, skipping the prompt prefix.
            answer = processor.decode(
                generated[0, model_inputs["input_ids"].shape[1] :],
                skip_special_tokens=True,
            )
        else:
            raise ValueError(f"{model_name} is not currently supported")

    return {
        "output_text": answer,
        "fps": fps,
        "inference_time": time.time() - started_at,
    }
167
+
168
+
169
# the Gradio App
app = gr.Interface(
    # BUG FIX: was `fn=inference` — an undefined name (the function is
    # `video_inference`), which raised NameError at startup.
    fn=video_inference,
    inputs=[
        gr.Video(label="Input Video"),
        gr.Textbox(
            label="Prompt",
            lines=3,
            info="Some models like [cam motion](https://huggingface.co/chancharikm/qwen2.5-vl-7b-cam-motion-preview) are trained on specific prompts",
            value="Describe the camera motion in this video.",
        ),
        gr.Dropdown(label="Model", choices=list(MODEL_ZOO.keys())),
        gr.Number(
            label="FPS",
            info="inference sampling rate (Qwen2.5VL is trained on videos with 8 fps); a value of 0 means the FPS of the input video will be used",
            value=8,
            minimum=0,
            step=1,
        ),
        gr.Slider(
            label="Max Tokens",
            info="maximum number of tokens to generate",
            value=128,
            minimum=32,
            maximum=512,
            step=32,
        ),
        gr.Slider(
            label="Temperature",
            value=0.0,
            minimum=0.0,
            maximum=1.0,
            step=0.1,
        ),
    ],
    outputs=gr.JSON(label="Output JSON"),
    title="Video Chat with VLM",
    description='comparing various "small" VLMs on the task of video captioning',
    api_name="video_inference",
)
app.launch(
    mcp_server=True, app_kwargs={"docs_url": "/docs"}  # add FastAPI Swagger API Docs
)
pyproject.toml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
[project]
name = "hfs-template"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "gradio>=5.38.0",
    # Qwen2.5-VL and InternVL (HF-format) require transformers >= 4.49;
    # the previous pin ==4.44.0 predates both model families used in app.py.
    "transformers>=4.49.0",
    "pydantic==2.10.6",
    "loguru>=0.7.3",
    "qwen-vl-utils>=0.0.11",
    # Runtime requirements of app.py that were missing from this list —
    # without them the exported requirements.txt leaves the Space broken:
    "torch>=2.2",          # imported directly in app.py
    "spaces>=0.30",        # `import spaces` / @spaces.GPU decorator
    "accelerate>=0.30",    # needed for device_map="auto" loading
    "bitsandbytes>=0.43",  # needed for BitsAndBytesConfig 4-bit loading
]
uv.lock ADDED
The diff for this file is too large to render. See raw diff