File size: 3,007 Bytes
7766a5c
028a367
49d3ba7
d125cdc
f275d7c
7766a5c
 
 
d125cdc
49d3ba7
 
 
d125cdc
49d3ba7
d125cdc
49d3ba7
d125cdc
71b45b9
7766a5c
d125cdc
 
 
851e8b5
49d3ba7
028a367
851e8b5
 
 
 
7766a5c
851e8b5
49d3ba7
d125cdc
49d3ba7
 
d125cdc
 
49d3ba7
028a367
49d3ba7
7766a5c
 
 
 
 
 
 
 
 
 
 
49d3ba7
 
 
 
7766a5c
49d3ba7
 
 
028a367
49d3ba7
028a367
49d3ba7
 
 
 
 
 
028a367
49d3ba7
 
7766a5c
 
49d3ba7
 
 
 
 
71b45b9
49d3ba7
851e8b5
71b45b9
d125cdc
851e8b5
71b45b9
 
d125cdc
 
71b45b9
 
 
49d3ba7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import io
import os
import sys
import tempfile
import time

import gradio as gr
import requests
from PIL import Image, ImageSequence

# llama-cpp-python import: fail fast with a readable message when the
# bindings are missing or cannot be loaded.
try:
    from llama_cpp import Llama
except Exception as e:
    # Chain explicitly so the original failure stays attached as __cause__.
    raise RuntimeError("llama-cpp-python import failed: " + str(e)) from e

# Path of the quantized GGUF weights, relative to the working directory.
MODEL_PATH = os.path.join("model", "llama-joycaption-q4_k_m.gguf")
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Model not found at {MODEL_PATH}. Ensure start.sh downloaded the GGUF.")

def download_bytes(url: str, timeout: int = 30, max_bytes: int = 50 * 1024 * 1024) -> bytes:
    """Fetch *url* over HTTP and return the response body as bytes.

    The body is read in chunks so that an oversized response is rejected
    instead of being buffered completely in memory.

    Args:
        url: Address to download.
        timeout: Timeout in seconds passed to ``requests.get``.
        max_bytes: Largest body size accepted, in bytes.

    Returns:
        The downloaded body.

    Raises:
        requests.RequestException: On connection errors or a non-2xx status
            (via ``raise_for_status``).
        ValueError: If the body is larger than ``max_bytes``.
    """
    with requests.get(url, stream=True, timeout=timeout) as r:
        r.raise_for_status()

        # Cheap early rejection when the server announces the size up front.
        declared = r.headers.get("Content-Length")
        if declared is not None and declared.isdigit() and int(declared) > max_bytes:
            raise ValueError(f"Response too large: {declared} bytes (limit {max_bytes}).")

        chunks = []
        received = 0
        for chunk in r.iter_content(chunk_size=64 * 1024):
            received += len(chunk)
            # The header may be absent or wrong, so enforce the cap while reading.
            if received > max_bytes:
                raise ValueError(f"Response too large: more than {max_bytes} bytes.")
            chunks.append(chunk)
        return b"".join(chunks)

def load_first_frame_from_bytes(raw: bytes):
    """Decode *raw* image bytes and return a single RGB frame.

    For images that report ``is_animated``, only the first frame yielded by
    ``ImageSequence.Iterator`` is used. Any mode other than ``"RGB"`` is
    converted to ``"RGB"``.
    """
    buffer = io.BytesIO(raw)
    frame = Image.open(buffer)

    animated = getattr(frame, "is_animated", False)
    if animated:
        frame = next(ImageSequence.Iterator(frame))

    return frame if frame.mode == "RGB" else frame.convert("RGB")

def make_prompt_for_image(image_path: str, user_prompt: str = "Describe the image."):
    """Build the completion prompt: an ``<img>`` tag, the user turn, and an open assistant turn."""
    # JoyCaption-style multimodal GGUFs accept <img>{path}</img>
    prompt_lines = (
        "<img>{}</img>".format(image_path),
        "User: {}".format(user_prompt),
        "Assistant:",
    )
    return "\n".join(prompt_lines)

# Load the model once at import time so every request reuses the same
# instance. The status line goes to stderr; the constructor options are the
# low-resource settings (2048-token context, 2 threads).
print("Loading GGUF model (this can take 30–120s)...", file=sys.stderr)
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,
    n_threads=2,
)

def generate_caption_from_url(url: str, prompt: str = "Describe the image.") -> str:
    """Download an image, store it as a temporary JPEG and ask the model for a caption.

    Every failure is reported as a human-readable string rather than raised,
    because the return value is shown directly in the UI.

    Args:
        url: Address of the image. Empty or whitespace-only input is rejected.
        prompt: Instruction for the model. A blank value falls back to
            ``"Describe the image."``.

    Returns:
        The generated caption, or a message describing what went wrong.
    """
    url = (url or "").strip()
    if not url:
        return "No URL provided."
    # The prompt is optional for the caller; never send an empty user turn.
    prompt = (prompt or "").strip() or "Describe the image."

    try:
        raw = download_bytes(url)
    except Exception as e:
        return f"Download error: {e}"
    try:
        img = load_first_frame_from_bytes(raw)
    except Exception as e:
        return f"Image processing error: {e}"

    tmp_dir = "/tmp/joycap"
    try:
        os.makedirs(tmp_dir, exist_ok=True)
        # mkstemp guarantees a unique name; a timestamp-based name can collide
        # when two requests arrive within the same millisecond.
        fd, tmp_path = tempfile.mkstemp(suffix=".jpg", dir=tmp_dir)
        os.close(fd)
    except OSError as e:
        return f"Failed to save temp image: {e}"

    try:
        try:
            img.save(tmp_path, format="JPEG", quality=85)
        except Exception as e:
            return f"Failed to save temp image: {e}"

        # NOTE(review): the image is passed only as a path inside the prompt
        # text; confirm that the loaded model/runtime really reads it.
        prompt_full = make_prompt_for_image(tmp_path, prompt)
        try:
            # Llama exposes create_completion(); it has no create() method.
            resp = llm.create_completion(
                prompt=prompt_full,
                max_tokens=256,
                temperature=0.2,
                top_p=0.95,
                stop=["User:", "Assistant:"],
            )
            choices = resp.get("choices") or [{}]
            text = (choices[0].get("text") or "").strip()
        except Exception as e:
            return f"Inference error: {e}"
        return text or "No caption generated."
    finally:
        # Best-effort cleanup on every exit path, including a failed save.
        try:
            os.remove(tmp_path)
        except OSError:
            pass

# UI definition: two text inputs (image URL and prompt) feeding
# generate_caption_from_url, and one text output for the caption.
_url_box = gr.Textbox(label="Image URL", placeholder="https://example.com/photo.jpg")
_prompt_box = gr.Textbox(label="Prompt (optional)", value="Describe the image.")
_caption_box = gr.Textbox(label="Generated caption")

iface = gr.Interface(
    fn=generate_caption_from_url,
    inputs=[_url_box, _prompt_box],
    outputs=_caption_box,
    title="JoyCaption GGUF (Q4_K_M)",
    description="Runs a quantized JoyCaption GGUF locally via llama.cpp (no external API).",
)

# When run as a script, serve the interface on all network interfaces
# (0.0.0.0) at port 7860.
if __name__ == "__main__":
    iface.launch(server_name="0.0.0.0", server_port=7860)