Spaces:

anaspro
/

chatbox

Runtime error

App Files Files Community

anaspro committed on Oct 29, 2025

Commit

f202e6b

1 Parent(s): 6b28f41

updatE

Browse files

Files changed (4) hide show

app.py +147 -203
app2.py +10 -4
examples/image1.jpg +0 -0
requirements.txt +7 -8

app.py CHANGED Viewed

@@ -1,234 +1,178 @@
-import os
-import pathlib
-import tempfile
-from collections.abc import Iterator
-from threading import Thread
-import av
 import gradio as gr
-import spaces
 import torch
-from transformers import AutoModelForImageTextToText, AutoProcessor
-from transformers.generation.streamers import TextIteratorStreamer
-# Model configuration
-model_id = "anaspro/Shako-4B-it-v3"
-processor = AutoProcessor.from_pretrained(model_id)
-model = AutoModelForImageTextToText.from_pretrained(
-    model_id,
     device_map="auto",
     torch_dtype=torch.bfloat16
-)
-# Supported file types
-IMAGE_FILE_TYPES = (".jpg", ".jpeg", ".png", ".webp")
-VIDEO_FILE_TYPES = (".mp4", ".mov", ".webm")
-AUDIO_FILE_TYPES = (".mp3", ".wav")
-# Video processing settings
-TARGET_FPS = int(os.getenv("TARGET_FPS", "3"))
-MAX_FRAMES = int(os.getenv("MAX_FRAMES", "30"))
-MAX_INPUT_TOKENS = int(os.getenv("MAX_INPUT_TOKENS", "10_000"))
-def get_file_type(path: str) -> str:
-    if path.endswith(IMAGE_FILE_TYPES):
-        return "image"
-    if path.endswith(VIDEO_FILE_TYPES):
-        return "video"
-    if path.endswith(AUDIO_FILE_TYPES):
-        return "audio"
-    error_message = f"Unsupported file type: {path}"
-    raise ValueError(error_message)
-def count_files_in_new_message(paths: list[str]) -> tuple[int, int]:
-    video_count = 0
-    non_video_count = 0
-    for path in paths:
-        if path.endswith(VIDEO_FILE_TYPES):
-            video_count += 1
-        else:
-            non_video_count += 1
-    return video_count, non_video_count
-def validate_media_constraints(message: dict) -> bool:
-    video_count, non_video_count = count_files_in_new_message(message["files"])
-    if video_count > 1:
-        gr.Warning("Only one video is supported.")
-        return False
-    if video_count == 1 and non_video_count > 0:
-        gr.Warning("Mixing images and videos is not allowed.")
-        return False
-    return True
-def extract_frames_to_tempdir(
-    video_path: str,
-    target_fps: float,
-    max_frames: int | None = None,
-    parent_dir: str | None = None,
-    prefix: str = "frames_",
-) -> str:
-    temp_dir = tempfile.mkdtemp(prefix=prefix, dir=parent_dir)
-    container = av.open(video_path)
-    video_stream = container.streams.video[0]
-    if video_stream.duration is None or video_stream.time_base is None:
-        raise ValueError("video_stream is missing duration or time_base")
-    time_base = video_stream.time_base
-    duration = float(video_stream.duration * time_base)
-    interval = 1.0 / target_fps
-    total_frames = int(duration * target_fps)
-    if max_frames is not None:
-        total_frames = min(total_frames, max_frames)
-    target_times = [i * interval for i in range(total_frames)]
-    target_index = 0
-    for frame in container.decode(video=0):
-        if frame.pts is None:
-            continue
-        timestamp = float(frame.pts * time_base)
-        if target_index < len(target_times) and abs(timestamp - target_times[target_index]) < (interval / 2):
-            frame_path = pathlib.Path(temp_dir) / f"frame_{target_index:04d}.jpg"
-            frame.to_image().save(frame_path)
-            target_index += 1
-            if max_frames is not None and target_index >= max_frames:
-                break
-    container.close()
-    return temp_dir
-def process_new_user_message(message: dict) -> list[dict]:
-    if not message["files"]:
-        return [{"type": "text", "text": message["text"]}]
-    file_types = [get_file_type(path) for path in message["files"]]
-    if len(file_types) == 1 and file_types[0] == "video":
-        gr.Info(f"Video will be processed at {TARGET_FPS} FPS, max {MAX_FRAMES} frames in this Space.")
-        temp_dir = extract_frames_to_tempdir(
-            message["files"][0],
-            target_fps=TARGET_FPS,
-            max_frames=MAX_FRAMES,
-        )
-        paths = sorted(pathlib.Path(temp_dir).glob("*.jpg"))
-        return [
-            {"type": "text", "text": message["text"]},
-            *[{"type": "image", "image": path.as_posix()} for path in paths],
-        ]
-    return [
-        {"type": "text", "text": message["text"]},
-        *[{"type": file_type, file_type: path} for path, file_type in zip(message["files"], file_types, strict=True)],
-    ]
-def process_history(history: list[dict]) -> list[dict]:
     messages = []
-    current_user_content: list[dict] = []
-    for item in history:
-        if item["role"] == "assistant":
-            if current_user_content:
-                messages.append({"role": "user", "content": current_user_content})
-                current_user_content = []
-            messages.append({"role": "assistant", "content": [{"type": "text", "text": item["content"]}]})
-        else:
-            content = item["content"]
             if isinstance(content, str):
                 current_user_content.append({"type": "text", "text": content})
             else:
-                filepath = content[0]
-                file_type = get_file_type(filepath)
-                current_user_content.append({"type": file_type, file_type: filepath})
     return messages
-@spaces.GPU()
-@torch.inference_mode()
-def generate(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
-    if not validate_media_constraints(message):
-        yield ""
-        return
-    messages = []
-    if system_prompt:
-        messages.append({"role": "system", "content": [{"type": "text", "text": system_prompt}]})
-    messages.extend(process_history(history))
-    messages.append({"role": "user", "content": process_new_user_message(message)})
     inputs = processor.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=True,
-        return_dict=True,
         return_tensors="pt",
-    )
-    n_tokens = inputs["input_ids"].shape[1]
-    if n_tokens > MAX_INPUT_TOKENS:
-        gr.Warning(
-            f"Input too long. Max {MAX_INPUT_TOKENS} tokens. Got {n_tokens} tokens. This limit is set to avoid CUDA out-of-memory errors in this Space."
-        )
-        yield ""
-        return
-    inputs = inputs.to(device=model.device, dtype=torch.bfloat16)
-    streamer = TextIteratorStreamer(processor, timeout=30.0, skip_prompt=True, skip_special_tokens=True)
-    generate_kwargs = dict(
         inputs,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
-        temperature=1.0,
-        top_k=64,
-        top_p=0.95,
-        min_p=0.0,
-        disable_compile=True,
     )
-    t = Thread(target=model.generate, kwargs=generate_kwargs)
-    t.start()
-    output = ""
-    for delta in streamer:
-        output += delta
-        yield output
-# Examples for the chat interface (with additional inputs: system_prompt, max_new_tokens)
-examples = [
-    ["What is the capital of France?", "You are a helpful assistant.", 700],
-    ["Explain quantum computing in simple terms", "You are a helpful assistant.", 512],
-    ["Write a short story about a robot learning to paint", "You are a helpful assistant.", 1000]
-]
-# Create the chat interface
 demo = gr.ChatInterface(
-    fn=generate,
     type="messages",
     textbox=gr.MultimodalTextbox(
-        file_types=list(IMAGE_FILE_TYPES + VIDEO_FILE_TYPES + AUDIO_FILE_TYPES),
         file_count="multiple",
-        autofocus=True,
     ),
     multimodal=True,
-    additional_inputs=[
-        gr.Textbox(label="System Prompt", value="انت موديل عراقي عادي من بغداد، ذكي ومرح. تتحدث بالعراقي فقط وتجاوب بتفصيل حسب السؤال. ما تستخدم فصحى ابدا."),
-        gr.Slider(label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700),
-    ],
-    title="Shako IRAQI AI",
-    examples=examples,
-    stop_btn=False,
 )
 if __name__ == "__main__":

 import gradio as gr
+import cv2
 import torch
+from PIL import Image
+from pathlib import Path
+from threading import Thread
+from transformers import AutoProcessor, Gemma3ForConditionalGeneration, TextIteratorStreamer
+import spaces
+import time
+# Model configuration.
+# Both Gemma 3 checkpoints and their processors are loaded eagerly at import
+# time; generate_response picks one pair per request based on the UI dropdown.
+# NOTE(review): this keeps the 12B and 4B models resident at the same time in
+# bfloat16 — confirm the Space hardware has enough memory for both.
+model_12b_name = "google/gemma-3-12b-it"
+model_4b_name = "google/gemma-3-4b-it"
+model_12b = Gemma3ForConditionalGeneration.from_pretrained(
+    model_12b_name,
     device_map="auto",
     torch_dtype=torch.bfloat16
+).eval()
+processor_12b = AutoProcessor.from_pretrained(model_12b_name)
+model_4b = Gemma3ForConditionalGeneration.from_pretrained(
+    model_4b_name,
+    device_map="auto",
+    torch_dtype=torch.bfloat16
+).eval()
+processor_4b = AutoProcessor.from_pretrained(model_4b_name)
+# I will add timestamp later
+def extract_video_frames(video_path, num_frames=8):
+    cap = cv2.VideoCapture(video_path)
+    frames = []
+    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+    step = max(total_frames // num_frames, 1)
+    for i in range(num_frames):
+        cap.set(cv2.CAP_PROP_POS_FRAMES, i * step)
+        ret, frame = cap.read()
+        if ret:
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frames.append(Image.fromarray(frame))
+    cap.release()
+    return frames
+def format_message(content, files):
+    message_content = []
+    if content:
+        parts = content.split('<image>')
+        for i, part in enumerate(parts):
+            if part.strip():
+                message_content.append({"type": "text", "text": part.strip()})
+            if i < len(parts) - 1 and files:
+                img = Image.open(files.pop(0))
+                message_content.append({"type": "image", "image": img})
+    for file in files:
+        file_path = file if isinstance(file, str) else file.name
+        if Path(file_path).suffix.lower() in ['.jpg', '.jpeg', '.png']:
+            img = Image.open(file_path)
+            message_content.append({"type": "image", "image": img})
+        elif Path(file_path).suffix.lower() in ['.mp4', '.mov']:
+            frames = extract_video_frames(file_path)
+            for frame in frames:
+                message_content.append({"type": "image", "image": frame})
+    return message_content
+def format_conversation_history(chat_history):
+    """Convert the chat history into role/content messages for the processor.
+
+    Consecutive user entries are merged into a single user message; the merged
+    message is emitted when the next assistant entry is seen, or at the end of
+    the history.
+
+    Args:
+        chat_history: Iterable of dicts with "role" and "content" keys.
+
+    Returns:
+        A list of {"role": ..., "content": [...]} dicts.
+    """
     messages = []
+    # Content parts of the user turn(s) not yet flushed into `messages`.
+    current_user_content = []
+    for item in chat_history:
+        role = item["role"]
+        content = item["content"]
+        if role == "user":
             if isinstance(content, str):
                 current_user_content.append({"type": "text", "text": content})
+            elif isinstance(content, list):
+                # Lists are assumed to already hold typed content parts — TODO confirm.
+                current_user_content.extend(content)
             else:
+                # NOTE(review): any other content type (possibly an uploaded-file
+                # entry from the chat UI — confirm what Gradio passes here) is
+                # stringified and sent as text, not as an image.
+                current_user_content.append({"type": "text", "text": str(content)})
+        elif role == "assistant":
+            # Flush the pending user turn before recording the assistant reply.
+            if current_user_content:
+                messages.append({"role": "user", "content": current_user_content})
+                current_user_content = []
+            messages.append({"role": "assistant", "content": [{"type": "text", "text": str(content)}]})
+    # Trailing user content that no assistant reply followed.
+    if current_user_content:
+        messages.append({"role": "user", "content": current_user_content})
     return messages
+# NOTE(review): presumably requests a GPU slot for up to 120 s per call —
+# confirm against the `spaces` package documentation.
+@spaces.GPU(duration=120)
+def generate_response(input_data, chat_history, model_choice, max_new_tokens, system_prompt, temperature, top_p, top_k, repetition_penalty):
+    """Stream a model reply for the new message plus the prior chat history.
+
+    Args:
+        input_data: Either a dict with "text" and optional "files", or any
+            other value, which is converted to text with ``str``.
+        chat_history: Prior turns, passed to ``format_conversation_history``.
+        model_choice: "Gemma 3 12B" selects the 12B model; any other value
+            selects the 4B model.
+        max_new_tokens: Forwarded to ``model.generate``.
+        system_prompt: If non-empty, sent as a leading system message.
+        temperature: Forwarded to ``model.generate``.
+        top_p: Forwarded to ``model.generate``.
+        top_k: Forwarded to ``model.generate``.
+        repetition_penalty: Forwarded to ``model.generate``.
+
+    Yields:
+        The accumulated reply text after each chunk produced by the streamer.
+    """
+    if isinstance(input_data, dict) and "text" in input_data:
+        text = input_data["text"]
+        files = input_data.get("files", [])
+    else:
+        text = str(input_data)
+        files = []
+    new_message_content = format_message(text, files)
+    new_message = {"role": "user", "content": new_message_content}
+    system_message = [{"role": "system", "content": [{"type": "text", "text": system_prompt}]}] if system_prompt else []
+    processed_history = format_conversation_history(chat_history)
+    messages = system_message + processed_history
+    # If the history ends with an unanswered user turn, merge the new content
+    # into it so there are never two consecutive user messages.
+    if messages and messages[-1]["role"] == "user":
+        messages[-1]["content"].extend(new_message["content"])
+    else:
+        messages.append(new_message)
+    if model_choice == "Gemma 3 12B":
+        model = model_12b
+        processor = processor_12b
+    else:
+        model = model_4b
+        processor = processor_4b
     inputs = processor.apply_chat_template(
         messages,
         add_generation_prompt=True,
         tokenize=True,
         return_tensors="pt",
+        return_dict=True
+    ).to(model.device)
+    # NOTE(review): no timeout is passed to the streamer; if generation fails
+    # inside the worker thread the loop below may wait indefinitely — confirm
+    # against the TextIteratorStreamer documentation.
+    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
+    # dict(inputs, ...) merges the processor output with the generation options
+    # into one keyword-argument mapping for model.generate.
+    generation_kwargs = dict(
         inputs,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
         do_sample=True,
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        repetition_penalty=repetition_penalty
     )
+    # Generation runs in a background thread so this generator can consume the
+    # streamer while tokens are being produced.
+    thread = Thread(target=model.generate, kwargs=generation_kwargs)
+    thread.start()
+    outputs = []
+    # The loop variable reuses the name `text`; the earlier message text is no
+    # longer needed at this point.
+    for text in streamer:
+        outputs.append(text)
+        yield "".join(outputs)
+# Chat UI definition: a multimodal chat interface backed by generate_response.
 demo = gr.ChatInterface(
+    fn=generate_response,
+    # The order of these components matches generate_response's parameters
+    # after (input_data, chat_history): model_choice, max_new_tokens,
+    # system_prompt, temperature, top_p, top_k, repetition_penalty.
+    additional_inputs=[
+        gr.Dropdown(
+            label="Model",
+            choices=["Gemma 3 12B", "Gemma 3 4B"],
+            value="Gemma 3 12B"
+        ),
+        gr.Slider(label="Max new tokens", minimum=100, maximum=2000, step=1, value=512),
+        gr.Textbox(
+            label="System Prompt",
+            value="You are a friendly chatbot. ",
+            lines=4,
+            placeholder="Change system prompt"
+        ),
+        gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, step=0.1, value=0.7),
+        gr.Slider(label="Top-p", minimum=0.05, maximum=1.0, step=0.05, value=0.9),
+        gr.Slider(label="Top-k", minimum=1, maximum=100, step=1, value=50),
+        gr.Slider(label="Repetition Penalty", minimum=1.0, maximum=2.0, step=0.05, value=1.0),
+    ],
+    # The example references examples/image1.jpg, which this commit adds.
+    examples=[
+        [{"text": "Explain this image", "files": ["examples/image1.jpg"]}],
+    ],
+    cache_examples=False,
     type="messages",
+    description="""
+    # Gemma 3
+    You can pick your model 12B or 4B, upload images or videos, and adjust settings below to customize your experience.
+    """,
+    fill_height=True,
     textbox=gr.MultimodalTextbox(
+        label="Query Input",
+        # NOTE(review): the textbox accepts any image/video type, while
+        # format_message only handles .jpg/.jpeg/.png/.mp4/.mov among the
+        # files not consumed by an <image> placeholder.
+        file_types=["image", "video"],
         file_count="multiple",
+        placeholder="Type your message or upload media"
     ),
+    stop_btn="Stop Generation",
     multimodal=True,
+    theme=gr.themes.Soft(),
 )
 if __name__ == "__main__":

app2.py CHANGED Viewed

@@ -12,7 +12,7 @@ from transformers import AutoModelForImageTextToText, AutoProcessor
 from transformers.generation.streamers import TextIteratorStreamer
 # Model configuration
-model_id = "anaspro/Shako-4B-it-v2"
 processor = AutoProcessor.from_pretrained(model_id)
 model = AutoModelForImageTextToText.from_pretrained(
     model_id,
@@ -189,7 +189,11 @@ def generate(message: dict, history: list[dict], system_prompt: str = "", max_ne
         inputs,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
-        do_sample=False,
         disable_compile=True,
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
@@ -203,7 +207,9 @@ def generate(message: dict, history: list[dict], system_prompt: str = "", max_ne
 # Examples for the chat interface (with additional inputs: system_prompt, max_new_tokens)
 examples = [
-    ["انت موديل عراقي تحكي هعراقي فقط وتكون ترفيهي", 700]
 ]
 # Create the chat interface
@@ -217,7 +223,7 @@ demo = gr.ChatInterface(
     ),
     multimodal=True,
     additional_inputs=[
-        gr.Textbox(label="System Prompt", value="انت ذكاء صناعي يتحدث باللهجة العراقية بس ما تستخدم فصحى ابدا"),
         gr.Slider(label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700),
     ],
     title="Shako IRAQI AI",

 from transformers.generation.streamers import TextIteratorStreamer
 # Model configuration
+model_id = "anaspro/Shako-4B-it-v3"
 processor = AutoProcessor.from_pretrained(model_id)
 model = AutoModelForImageTextToText.from_pretrained(
     model_id,
         inputs,
         streamer=streamer,
         max_new_tokens=max_new_tokens,
+        do_sample=True,
+        temperature=1.0,
+        top_k=64,
+        top_p=0.95,
+        min_p=0.0,
         disable_compile=True,
     )
     t = Thread(target=model.generate, kwargs=generate_kwargs)
 # Examples for the chat interface (with additional inputs: system_prompt, max_new_tokens)
 examples = [
+    ["What is the capital of France?", "You are a helpful assistant.", 700],
+    ["Explain quantum computing in simple terms", "You are a helpful assistant.", 512],
+    ["Write a short story about a robot learning to paint", "You are a helpful assistant.", 1000]
 ]
 # Create the chat interface
     ),
     multimodal=True,
     additional_inputs=[
+        gr.Textbox(label="System Prompt", value="انت موديل عراقي عادي من بغداد، ذكي ومرح. تتحدث بالعراقي فقط وتجاوب بتفصيل حسب السؤال. ما تستخدم فصحى ابدا."),
         gr.Slider(label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700),
     ],
     title="Shako IRAQI AI",

examples/image1.jpg ADDED Viewed

requirements.txt CHANGED Viewed

@@ -1,8 +1,7 @@
-gradio>=4.0.0
-spaces[huggingface]>=0.28.0
-transformers>=4.35.0
-torch>=2.1.0
-av
-accelerate>=0.25.0
-timm
-gTTS>=2.5.0

+# NOTE(review): transformers is declared twice below (once unpinned, once as a
+# git reference to the v4.49.0-Gemma-3 tag). pip may reject or mis-resolve the
+# duplicate — keep only the intended entry.
+transformers
+spaces
+torch
+transformers @ git+https://github.com/huggingface/transformers@v4.49.0-Gemma-3
+pillow
+opencv-python
+accelerate