Spaces:
Build error
Build error
File size: 3,582 Bytes
7766a5c 028a367 49d3ba7 f275d7c 7766a5c 49d3ba7 71b45b9 49d3ba7 7766a5c 028a367 851e8b5 49d3ba7 028a367 851e8b5 7766a5c 851e8b5 49d3ba7 028a367 49d3ba7 7766a5c 49d3ba7 7766a5c 49d3ba7 028a367 49d3ba7 028a367 49d3ba7 028a367 49d3ba7 7766a5c 49d3ba7 71b45b9 49d3ba7 851e8b5 71b45b9 7766a5c 851e8b5 71b45b9 49d3ba7 71b45b9 49d3ba7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 | import os
import io
import sys
import time
import requests
from PIL import Image, ImageSequence
import gradio as gr
# llama-cpp-python provides the local GGUF runtime; abort start-up with a clear
# message when it cannot be imported.
try:
    from llama_cpp import Llama
except Exception as e:
    # The broad catch is deliberate: any import failure aborts start-up.
    # Chain the original error so its traceback is kept as the direct cause.
    raise RuntimeError(
        "llama-cpp-python import failed; ensure requirements installed and wheel built: " + str(e)
    ) from e

# Path of the GGUF weights, relative to the working directory
# (start.sh places the GGUF here).
MODEL_PATH = os.path.join("model", "model.gguf")

# Fail fast at import time when the weights are missing.
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Model not found at {MODEL_PATH}. Set correct GGUF in start.sh and redeploy.")
# Helper: fetch the raw bytes behind a URL, refusing oversized responses
def download_bytes(url: str, timeout: int = 30, max_bytes: int = 50 * 1024 * 1024) -> bytes:
    """Download *url* and return the response body as bytes.

    The body is read in chunks so that an unexpectedly large response is
    rejected before it is fully buffered in memory.

    Args:
        url: Address to fetch with an HTTP GET request.
        timeout: Timeout in seconds passed to ``requests.get``.
        max_bytes: Largest body size accepted, in bytes.

    Returns:
        The complete response body.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the body is larger than *max_bytes*.
    """
    with requests.get(url, stream=True, timeout=timeout) as resp:
        resp.raise_for_status()

        # Cheap early rejection when the server declares the size up front.
        declared = resp.headers.get("Content-Length", "")
        if declared.isdigit() and int(declared) > max_bytes:
            raise ValueError(f"Response too large: {declared} bytes (limit {max_bytes})")

        # The header may be absent or wrong, so enforce the cap while reading.
        body = bytearray()
        for chunk in resp.iter_content(chunk_size=64 * 1024):
            body.extend(chunk)
            if len(body) > max_bytes:
                raise ValueError(f"Response too large: more than {max_bytes} bytes")
        return bytes(body)
def load_first_frame_from_bytes(raw: bytes):
    """Decode *raw* with Pillow and return its first frame in RGB mode.

    Animated inputs (those whose ``is_animated`` attribute is truthy) are
    reduced to the first frame yielded by ``ImageSequence.Iterator``.
    """
    picture = Image.open(io.BytesIO(raw))

    animated = getattr(picture, "is_animated", False)
    if animated:
        frames = ImageSequence.Iterator(picture)
        picture = next(frames)

    # Only convert when the image is not already RGB.
    return picture if picture.mode == "RGB" else picture.convert("RGB")
# Minimal image caption prompt template; adjust it to the prompt format your model expects
def make_prompt_for_image(image_path: str, user_prompt: str = "Describe the image."):
    """Build the text prompt that references an image file and a user request.

    The result has three newline-separated parts: the image path wrapped in
    ``<img>`` tags, a ``User:`` line carrying *user_prompt*, and a trailing
    ``Assistant:`` line for the model to complete.
    """
    # NOTE(review): this tag layout is assumed to suit llama.cpp-based
    # multimodal GGUFs; confirm against the model actually deployed.
    image_tag = "<img>" + image_path + "</img>"
    user_turn = "User: " + user_prompt
    return "\n".join((image_tag, user_turn, "Assistant:"))
# Load the model once at import time so every request reuses the same instance.
# Only n_ctx and n_threads are set explicitly; every other option (such as
# use_mlock or n_gpu_layers) is left at the library default.
print(
    "Loading model (this may take a minute)...",
    file=sys.stderr,
)
llm = Llama(
    n_threads=2,
    n_ctx=2048,
    model_path=MODEL_PATH,
)
def generate_caption_from_url(url: str, prompt: str = "Describe the image."):
    """Download the image at *url* and return a caption from the local model.

    The image is fetched, reduced to its first frame, written to a temporary
    JPEG, and referenced by path in the prompt sent to the model. Failures are
    reported as short message strings instead of being raised, so the result
    can always be shown in the UI textbox.

    Args:
        url: Address of the image to caption. Blank or missing values are rejected.
        prompt: Instruction for the model; a blank value falls back to
            "Describe the image.".

    Returns:
        The generated caption, or a message describing what went wrong.
    """
    import tempfile  # local import: only needed for the collision-free temp file

    url = (url or "").strip()
    if not url:
        return "No URL provided."
    # The UI marks the prompt as optional, so tolerate an emptied textbox.
    prompt = (prompt or "").strip() or "Describe the image."

    try:
        raw = download_bytes(url)
    except Exception as e:
        return f"Download error: {e}"

    try:
        img = load_first_frame_from_bytes(raw)
    except Exception as e:
        return f"Image processing error: {e}"

    # Save a temporary JPEG locally so the model can be pointed at a file path.
    tmp_dir = "/tmp/joycap"
    tmp_path = None
    try:
        try:
            os.makedirs(tmp_dir, exist_ok=True)
            # mkstemp gives a unique name, so concurrent requests cannot
            # overwrite or delete each other's file.
            fd, tmp_path = tempfile.mkstemp(suffix=".jpg", dir=tmp_dir)
            os.close(fd)
            img.save(tmp_path, format="JPEG", quality=85)
        except Exception as e:
            return f"Failed to save temp image: {e}"

        # NOTE(review): whether the loaded model resolves <img>path</img> tags
        # cannot be confirmed from this file; verify against the deployed GGUF.
        prompt_full = make_prompt_for_image(tmp_path, prompt)
        try:
            # Llama exposes create_completion(); it has no create() method.
            resp = llm.create_completion(
                prompt=prompt_full,
                max_tokens=256,
                temperature=0.2,
                top_p=0.95,
                stop=["User:", "Assistant:"],
            )
            # Treat a missing or empty choices list as "nothing generated".
            choices = resp.get("choices") or [{}]
            text = (choices[0].get("text") or "").strip()
            return text or "No caption generated."
        except Exception as e:
            return f"Inference error: {e}"
    finally:
        # Best-effort cleanup on every exit path, including a failed save.
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass
# Gradio UI: two text inputs (URL and prompt) mapped onto
# generate_caption_from_url, with the caption shown in a single textbox.
# NOTE(review): the URL label mentions MP4, but decoding goes through Pillow's
# Image.open; confirm MP4 input is really supported.
iface = gr.Interface(
    title="JoyCaption - local GGUF (Q4)",
    description="Runs a quantized GGUF model locally via llama.cpp (no external APIs). Ensure the GGUF in start.sh is a multimodal model that supports <img> tags.",
    fn=generate_caption_from_url,
    inputs=[
        gr.Textbox(
            placeholder="https://example.com/photo.jpg",
            label="Image / GIF / MP4 URL",
        ),
        gr.Textbox(
            value="Describe the image.",
            label="Prompt (optional)",
        ),
    ],
    outputs=gr.Textbox(label="Generated caption"),
)

# Serve on all interfaces, port 7860, only when executed as a script.
if __name__ == "__main__":
    iface.launch(
        server_port=7860,
        server_name="0.0.0.0",
    )
|