File size: 3,582 Bytes
7766a5c
028a367
49d3ba7
 
f275d7c
7766a5c
 
 
49d3ba7
 
 
 
 
 
 
 
 
71b45b9
49d3ba7
7766a5c
028a367
 
 
851e8b5
49d3ba7
028a367
851e8b5
 
 
 
7766a5c
851e8b5
49d3ba7
 
 
 
 
 
 
 
 
 
028a367
49d3ba7
7766a5c
 
 
 
 
 
 
 
 
 
 
49d3ba7
 
 
 
 
7766a5c
49d3ba7
 
 
028a367
49d3ba7
028a367
49d3ba7
 
 
 
 
 
 
028a367
49d3ba7
 
7766a5c
 
49d3ba7
 
 
 
 
71b45b9
49d3ba7
851e8b5
71b45b9
7766a5c
851e8b5
71b45b9
 
49d3ba7
 
71b45b9
 
 
49d3ba7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
import io
import os
import sys
import tempfile
import time

import requests
from PIL import Image, ImageSequence
import gradio as gr

# Try to import llama-cpp-python
try:
    from llama_cpp import Llama
except Exception as e:
    # Fail fast with an actionable message: the llama-cpp-python wheel is
    # compiled at install time and is the most common deployment failure.
    raise RuntimeError("llama-cpp-python import failed; ensure requirements installed and wheel built: " + str(e))

MODEL_PATH = os.path.join("model", "model.gguf")  # start.sh places GGUF here
if not os.path.exists(MODEL_PATH):
    # Abort at import time rather than failing on the first request.
    raise FileNotFoundError(f"Model not found at {MODEL_PATH}. Set correct GGUF in start.sh and redeploy.")

# Helper: download the raw bytes of a remote resource
def download_bytes(url: str, timeout: int = 30) -> bytes:
    """Fetch *url* over HTTP(S) and return the entire response body.

    Raises requests.HTTPError for non-2xx status codes; network failures
    and timeouts surface as requests.RequestException subclasses.
    """
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        body = response.content
    return body

def load_first_frame_from_bytes(raw: bytes):
    """Decode *raw* image bytes and return the first frame as an RGB PIL image."""
    frame = Image.open(io.BytesIO(raw))
    # Animated formats (e.g. GIF): keep only the first frame.
    if getattr(frame, "is_animated", False):
        frame = next(ImageSequence.Iterator(frame))
    # Normalize palette/greyscale/alpha modes so the frame can be saved as JPEG.
    return frame if frame.mode == "RGB" else frame.convert("RGB")

# Minimal image caption prompt template — adjust for your model's expected prompt
def make_prompt_for_image(image_path: str, user_prompt: str = "Describe the image."):
    """Build the multimodal prompt string for an <img>-tag style GGUF model.

    Many llama.cpp-based multimodal GGUFs accept the pattern
    "<img>{path}</img>\\nUser: {prompt}\\nAssistant:"; adjust the template
    if your model expects a different chat format.
    """
    template = "<img>{path}</img>\nUser: {prompt}\nAssistant:"
    return template.format(path=image_path, prompt=user_prompt)

# Start model (llama-cpp-python will mmap model and run inference)
# Use low-memory opts: n_ctx small, use_mlock=0, n_gpu_layers=0
# NOTE(review): only n_ctx and n_threads are passed below; use_mlock and
# n_gpu_layers rely on llama-cpp-python defaults — confirm they match intent.
print("Loading model (this may take a minute)...", file=sys.stderr)
# Loaded once at module import and shared by every Gradio request.
llm = Llama(model_path=MODEL_PATH, n_ctx=2048, n_threads=2)

def generate_caption_from_url(url: str, prompt: str = "Describe the image."):
    """Download an image from *url*, caption it with the local GGUF model.

    All failures are returned as human-readable strings (displayed in the
    Gradio output textbox) instead of being raised, so one bad URL cannot
    crash the UI.

    Args:
        url: HTTP(S) URL of an image; the first frame is used for animated
            inputs.
        prompt: Instruction passed to the model alongside the image.

    Returns:
        The generated caption, or an error-message string.
    """
    if not url:
        return "No URL provided."
    try:
        raw = download_bytes(url)
    except Exception as e:
        return f"Download error: {e}"
    try:
        img = load_first_frame_from_bytes(raw)
    except Exception as e:
        return f"Image processing error: {e}"

    # Save a temporary JPEG locally so the gguf image token loader can access
    # it. mkstemp guarantees a unique path even for concurrent requests that
    # land in the same millisecond (the old time.time()-based name could
    # collide and let one request overwrite another's image).
    tmp_dir = "/tmp/joycap"
    os.makedirs(tmp_dir, exist_ok=True)
    fd, tmp_path = tempfile.mkstemp(suffix=".jpg", dir=tmp_dir)
    os.close(fd)  # PIL re-opens the path itself; keep only the name.
    try:
        img.save(tmp_path, format="JPEG", quality=85)
    except Exception as e:
        # Clean up the empty placeholder file created by mkstemp.
        try:
            os.remove(tmp_path)
        except Exception:
            pass
        return f"Failed to save temp image: {e}"

    prompt_full = make_prompt_for_image(tmp_path, prompt)
    try:
        # BUG FIX: llama_cpp.Llama has no `create` method — the completion
        # API is `create_completion` (equivalently, calling the object).
        resp = llm.create_completion(
            prompt=prompt_full,
            max_tokens=256,
            temperature=0.2,
            top_p=0.95,
            stop=["User:", "Assistant:"],
        )
        text = resp.get("choices", [{}])[0].get("text", "").strip()
        return text or "No caption generated."
    except Exception as e:
        return f"Inference error: {e}"
    finally:
        # Always remove the temp file, even when inference fails.
        try:
            os.remove(tmp_path)
        except Exception:
            pass

# Gradio UI: one URL input plus an optional prompt, returning caption text.
# NOTE(review): the label advertises MP4, but decoding goes through PIL
# (load_first_frame_from_bytes), which does not read MP4 — confirm intent.
iface = gr.Interface(
    fn=generate_caption_from_url,
    inputs=[
        gr.Textbox(label="Image / GIF / MP4 URL", placeholder="https://example.com/photo.jpg"),
        gr.Textbox(label="Prompt (optional)", value="Describe the image."),
    ],
    outputs=gr.Textbox(label="Generated caption"),
    title="JoyCaption - local GGUF (Q4)",
    description="Runs a quantized GGUF model locally via llama.cpp (no external APIs). Ensure the GGUF in start.sh is a multimodal model that supports <img> tags.",
)

if __name__ == "__main__":
    # Bind on all interfaces at 7860, the conventional Gradio/HF Spaces port.
    iface.launch(server_name="0.0.0.0", server_port=7860)