gemma-demo / src /app.py
AC2513's picture
added test for framing
eab0adb
raw
history blame
1.62 kB
import torch
from huggingface_hub import login
from collections.abc import Iterator
from transformers import Gemma3ForConditionalGeneration, TextIteratorStreamer, Gemma3Processor
import spaces
from threading import Thread
import gradio as gr
import os
from dotenv import load_dotenv, find_dotenv
import cv2
from loguru import logger
from PIL import Image
# Load settings from a .env file before any environment lookups below.
# NOTE(review): find_dotenv() presumably returns an empty path when no .env
# file exists, making load_dotenv a no-op — confirm against python-dotenv.
dotenv_path = find_dotenv()
load_dotenv(dotenv_path)
# Checkpoint to load; override with the MODEL_ID environment variable,
# otherwise the default Gemma 3 4B instruction-tuned checkpoint is used.
model_id = os.getenv("MODEL_ID", "google/gemma-3-4b-it")
# The processor and the model are both created once, at import time, from the
# same checkpoint identifier.
input_processor = Gemma3Processor.from_pretrained(model_id)
model = Gemma3ForConditionalGeneration.from_pretrained(
model_id,
# Request bfloat16 weights.
torch_dtype=torch.bfloat16,
# NOTE(review): "auto" presumably delegates device placement to the
# library — confirm against the transformers documentation.
device_map="auto",
# Explicitly select the "eager" attention implementation.
attn_implementation="eager",
)
def get_frames(video_path: str, max_images: int) -> list[tuple[Image.Image, float]]:
    """Sample up to ``max_images`` evenly spaced frames from a video file.

    Frames are taken at indices ``0, k, 2k, ...`` where
    ``k = max(total_frames // max_images, 1)``. Frames that fail to decode
    are skipped, so fewer than ``max_images`` entries may be returned.

    Args:
        video_path: Path of the video file to read.
        max_images: Maximum number of frames to return. Non-positive values
            yield an empty list.

    Returns:
        A list of ``(image, timestamp)`` tuples in playback order, where
        ``image`` is an RGB PIL image and ``timestamp`` is the frame's
        position in seconds (frame index divided by the reported frame
        rate), rounded to two decimals.

    Raises:
        ValueError: If the video cannot be opened, or if it contains frames
            but does not report a positive frame rate.
    """
    capture = cv2.VideoCapture(video_path)
    if not capture.isOpened():
        raise ValueError(f"Could not open video file: {video_path}")

    frames: list[tuple[Image.Image, float]] = []
    # try/finally guarantees the capture handle is released even when frame
    # decoding or conversion raises part-way through.
    try:
        total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
        # Nothing to sample; also avoids dividing by max_images == 0 below.
        if max_images <= 0 or total_frames <= 0:
            return frames

        fps = capture.get(cv2.CAP_PROP_FPS)
        # `not fps > 0` also rejects NaN, which `fps <= 0` would let through.
        if not fps > 0:
            raise ValueError(
                f"Could not determine frame rate of video file: {video_path}"
            )

        frame_interval = max(total_frames // max_images, 1)
        # The stop bound caps the number of sampled indices at max_images.
        stop = min(total_frames, max_images * frame_interval)
        for frame_index in range(0, stop, frame_interval):
            capture.set(cv2.CAP_PROP_POS_FRAMES, frame_index)
            success, image = capture.read()
            if not success:
                continue
            # OpenCV decodes to BGR channel order; PIL expects RGB.
            rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            timestamp = round(frame_index / fps, 2)
            frames.append((Image.fromarray(rgb_image), timestamp))
    finally:
        capture.release()
    return frames