John Ho commited on
Commit
a31ba73
·
unverified ·
0 Parent(s):

Initial commit

Browse files
Files changed (7) hide show
  1. .github/workflows/deploy_to_hf_space.yaml +80 -0
  2. .gitignore +11 -0
  3. .python-version +1 -0
  4. README.md +29 -0
  5. app.py +211 -0
  6. pyproject.toml +13 -0
  7. uv.lock +0 -0
.github/workflows/deploy_to_hf_space.yaml ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
name: CI/CD to Hugging Face Space with uv

on:
  push:
    branches:
      - main # Or your default branch, e.g., 'master', 'dev'

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Check for HF_TOKEN availability
        id: check_hf_token
        env:
          HF_TOKEN_CHECK: ${{ secrets.HF_TOKEN }} # Pass the secret to an env var for shell check
        run: |
          if [ -z "$HF_TOKEN_CHECK" ]; then
            echo "::notice::HF_TOKEN secret is not set. Hugging Face Space push will be skipped."
            echo "push_enabled=false" >> $GITHUB_OUTPUT
          else
            echo "::notice::HF_TOKEN secret is set. Proceeding with Hugging Face Space push."
            echo "push_enabled=true" >> $GITHUB_OUTPUT
          fi

      - name: Checkout repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          lfs: true

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12" # Recommended: specify a precise version like '3.10', '3.11', or '3.12'

      - name: Install uv
        # Installs the uv tool on the GitHub Actions runner
        uses: astral-sh/setup-uv@v1

      - name: Check for pyproject.toml existence
        id: check_pyproject
        run: |
          if [ -f pyproject.toml ]; then
            echo "::notice::pyproject.toml found. Proceeding with uv pip compile."
            echo "pyproject_exists=true" >> $GITHUB_OUTPUT
          else
            echo "::notice::pyproject.toml not found. Skipping requirements.txt generation via uv pip compile."
            echo "pyproject_exists=false" >> $GITHUB_OUTPUT
          fi

      - name: Generate requirements.txt using uv
        id: generate_reqs
        # This step will only run if pyproject.toml was found in the previous step
        if: ${{ steps.check_pyproject.outputs.pyproject_exists == 'true' }}
        run: |
          # Use uv export to generate a locked requirements.txt from pyproject.toml/uv.lock
          # This ensures reproducibility.
          uv export --no-hashes --format requirements-txt > requirements.txt
          # uv pip compile pyproject.toml -o requirements.txt

          # Check if requirements.txt was created
          if [ -f requirements.txt ]; then
            echo "requirements.txt generated successfully:"
            cat requirements.txt
          else
            echo "Error: requirements.txt was not generated despite pyproject.toml existing."
            exit 1
          fi

      - name: Push to HuggingFace Space
        if: ${{ steps.check_hf_token.outputs.push_enabled == 'true' }}
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          # BUG FIX: the generated requirements.txt was never committed, so the
          # Space would be pushed WITHOUT it. Commit it here (no-op when unchanged).
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git add requirements.txt
          git diff --cached --quiet || git commit -m "chore: regenerate requirements.txt"
          # Force push is intentional (see README): the on-the-fly requirements.txt
          # commit diverges from the Space's history on every run.
          # Replace HF_USERNAME and SPACE_NAME with your own values.
          git push --force https://HF_USERNAME:$HF_TOKEN@huggingface.co/spaces/HF_USERNAME/SPACE_NAME main
.gitignore ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python-generated files
2
+ __pycache__/
3
+ *.py[oc]
4
+ build/
5
+ dist/
6
+ wheels/
7
+ *.egg-info
8
+ .ropeproject
9
+
10
+ # Virtual environments
11
+ .venv
.python-version ADDED
@@ -0,0 +1 @@
 
 
1
+ 3.10
README.md ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Name for your Space App
3
+ emoji: 📚
4
+ colorFrom: blue
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 5.32.0
8
+ app_file: app.py
9
+ pinned: false
10
+ short_description: short description for your Space App
11
+ ---
12
+
13
+ # The HuggingFace Space Template
14
+ set up with a [GitHub Action to automatically update your space](https://huggingface.co/docs/hub/spaces-github-actions)
15
+ and manage dependencies with `uv`
16
+
17
+ You will need to update [`deploy_to_hf_space.yaml`](.github/workflows/deploy_to_hf_space.yaml) with the details for your space and
18
+ setup your `HF_TOKEN` in your [Github secret](https://docs.github.com/en/actions/security-guides/encrypted-secrets#creating-encrypted-secrets-for-an-environment)
19
+
20
+ > [!WARNING]
21
+ > The GitHub Action *force*-pushes changes to the HuggingFace Space.
22
+ > This is due to the creation of the requirements.txt that happens on the fly.
23
+ > This template assumes that you are the sole contributor to your space.
24
+
25
+ ## Resources
26
+
27
+ * [Gradio Course](https://huggingface.co/learn/llm-course/chapter9/2?fw=pt)
28
+ * [Gradio Doc](https://www.gradio.app/guides/quickstart)
29
+ * Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import time

import spaces
import torch
import gradio as gr
from transformers import (
    AutoModelForImageTextToText,
    AutoProcessor,
    BitsAndBytesConfig,
)

# Flash Attention for ZeroGPU
import subprocess

# BUG FIX: `env=` REPLACES the child process environment entirely; the original
# call dropped PATH (and everything else), which can make `pip` unresolvable.
# Merge the flag into a copy of the current environment instead.
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

# Set target DEVICE and DTYPE.
# bf16 is preferred when the GPU supports it; otherwise fall back to fp16.
DTYPE = (
    torch.bfloat16
    if torch.cuda.is_available() and torch.cuda.is_bf16_supported()
    else torch.float16
)
# "auto" lets transformers/accelerate decide weight placement via device_map.
DEVICE = "auto"
print(f"Device: {DEVICE}, dtype: {DTYPE}")
26
+
27
+
28
+ def load_model(
29
+ model_name: str = "chancharikm/qwen2.5-vl-7b-cam-motion-preview",
30
+ use_flash_attention: bool = True,
31
+ apply_quantization: bool = True,
32
+ ):
33
+ bnb_config = BitsAndBytesConfig(
34
+ load_in_4bit=True, # Load model weights in 4-bit
35
+ bnb_4bit_quant_type="nf4", # Use NF4 quantization (or "fp4")
36
+ bnb_4bit_compute_dtype=DTYPE, # Perform computations in bfloat16/float16
37
+ bnb_4bit_use_double_quant=True, # Optional: further quantization for slightly more memory saving
38
+ )
39
+
40
+ # Determine model family from model name
41
+ model_family = model_name.split("/")[-1].split("-")[0]
42
+
43
+ # Common model loading arguments
44
+ common_args = {
45
+ "torch_dtype": DTYPE,
46
+ "device_map": DEVICE,
47
+ "low_cpu_mem_usage": True,
48
+ "quantization_config": bnb_config if apply_quantization else None,
49
+ }
50
+ if use_flash_attention:
51
+ common_args["attn_implementation"] = "flash_attention_2"
52
+
53
+ # Load model based on family
54
+ match model_family:
55
+ # case "qwen2.5" | "Qwen2.5":
56
+ # model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
57
+ # model_name, **common_args
58
+ # )
59
+ case "InternVL3":
60
+ model = AutoModelForImageTextToText.from_pretrained(
61
+ model_name, **common_args
62
+ )
63
+ case _:
64
+ raise ValueError(f"Unsupported model family: {model_family}")
65
+
66
+ # Set model to evaluation mode for inference (disables dropout, etc.)
67
+ return model.eval()
68
+
69
+
70
def load_processor(model_name="Qwen/Qwen2.5-VL-7B-Instruct"):
    """Fetch the processor (tokenizer + image/video preprocessing) for `model_name`.

    Uses the fast implementation and the module-level DEVICE/DTYPE settings.
    """
    processor_kwargs = {
        "device_map": DEVICE,
        "use_fast": True,
        "torch_dtype": DTYPE,
    }
    return AutoProcessor.from_pretrained(model_name, **processor_kwargs)
77
+
78
+
79
print("Loading Models and Processors...")

# One spec per UI model key: (HF repo id, use_flash_attention, apply_quantization).
# Only the largest model is quantized to fit in memory.
_MODEL_SPECS = {
    "qwen2.5-vl-7b-instruct": ("Qwen/Qwen2.5-VL-7B-Instruct", False, False),
    "InternVL3-1B-hf": ("OpenGVLab/InternVL3-1B-hf", False, False),
    "InternVL3-2B-hf": ("OpenGVLab/InternVL3-2B-hf", False, False),
    "InternVL3-8B-hf": ("OpenGVLab/InternVL3-8B-hf", False, True),
}

# Eagerly load every model and its processor at startup so inference calls
# never pay the download/initialization cost.
MODEL_ZOO = {
    key: load_model(
        model_name=repo_id,
        use_flash_attention=flash,
        apply_quantization=quantize,
    )
    for key, (repo_id, flash, quantize) in _MODEL_SPECS.items()
}

PROCESSORS = {
    key: load_processor(repo_id)
    for key, (repo_id, _flash, _quantize) in _MODEL_SPECS.items()
}

print("Models and Processors Loaded!")
110
+
111
+
112
# Our Inference Function
@spaces.GPU(duration=120)
def video_inference(
    video_path: str,
    prompt: str,
    model_name: str,
    fps: int = 8,
    max_tokens: int = 512,
    temperature: float = 0.1,
):
    """Answer `prompt` about the video at `video_path` with the chosen model.

    Args:
        video_path: local path of the uploaded video.
        prompt: user question / instruction.
        model_name: key into MODEL_ZOO / PROCESSORS.
        fps: frame sampling rate handed to the processor.
        max_tokens: generation budget (max new tokens).
        temperature: sampling temperature; 0.0 disables sampling (greedy).

    Returns:
        dict with "output_text", "fps", and "inference_time" (seconds).

    Raises:
        ValueError: when the model family has no inference path here.
    """
    started_at = time.time()
    model = MODEL_ZOO[model_name]
    processor = PROCESSORS[model_name]

    # Single-turn chat: one video plus one text prompt.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "video", "video": video_path},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    family = model_name.split("-")[0]
    with torch.no_grad():
        if family == "InternVL3":
            model_inputs = processor.apply_chat_template(
                conversation,
                add_generation_prompt=True,
                tokenize=True,
                return_dict=True,
                return_tensors="pt",
                fps=fps,
                # num_frames = 8
            ).to("cuda", dtype=DTYPE)

            generated = model.generate(
                **model_inputs,
                max_new_tokens=max_tokens,
                temperature=float(temperature),
                do_sample=temperature > 0.0,
            )
            # Decode only the newly generated tokens, skipping the prompt prefix.
            answer = processor.decode(
                generated[0, model_inputs["input_ids"].shape[1] :],
                skip_special_tokens=True,
            )
        else:
            raise ValueError(f"{model_name} is not currently supported")

    return {
        "output_text": answer,
        "fps": fps,
        "inference_time": time.time() - started_at,
    }
167
+
168
+
169
# the Gradio App
app = gr.Interface(
    # BUG FIX: was `fn=inference` — an undefined name (the function is
    # `video_inference`), which raised NameError at startup.
    fn=video_inference,
    inputs=[
        gr.Video(label="Input Video"),
        gr.Textbox(
            label="Prompt",
            lines=3,
            info="Some models like [cam motion](https://huggingface.co/chancharikm/qwen2.5-vl-7b-cam-motion-preview) are trained on specific prompts",
            value="Describe the camera motion in this video.",
        ),
        gr.Dropdown(label="Model", choices=list(MODEL_ZOO.keys())),
        gr.Number(
            label="FPS",
            info="inference sampling rate (Qwen2.5VL is trained on videos with 8 fps); a value of 0 means the FPS of the input video will be used",
            value=8,
            minimum=0,
            step=1,
        ),
        gr.Slider(
            label="Max Tokens",
            info="maximum number of tokens to generate",
            value=128,
            minimum=32,
            maximum=512,
            step=32,
        ),
        gr.Slider(
            label="Temperature",
            value=0.0,
            minimum=0.0,
            maximum=1.0,
            step=0.1,
        ),
    ],
    outputs=gr.JSON(label="Output JSON"),
    title="Video Chat with VLM",
    description='comparing various "small" VLMs on the task of video captioning',
    api_name="video_inference",
)
app.launch(
    mcp_server=True, app_kwargs={"docs_url": "/docs"}  # add FastAPI Swagger API Docs
)
pyproject.toml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
[project]
name = "hfs-template"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "gradio>=5.38.0",
    # Qwen2.5-VL and InternVL (HF-format) require transformers >= 4.49;
    # the previous pin ==4.44.0 predates both model families used in app.py.
    "transformers>=4.49.0",
    "pydantic==2.10.6",
    "loguru>=0.7.3",
    "qwen-vl-utils>=0.0.11",
    # Runtime requirements of app.py that were missing from this list —
    # without them the exported requirements.txt leaves the Space broken:
    "torch>=2.2",          # imported directly in app.py
    "spaces>=0.30",        # `import spaces` / @spaces.GPU decorator
    "accelerate>=0.30",    # needed for device_map="auto" loading
    "bitsandbytes>=0.43",  # needed for BitsAndBytesConfig 4-bit loading
]
uv.lock ADDED
The diff for this file is too large to render. See raw diff