File size: 7,131 Bytes
7766a5c
028a367
49d3ba7
d125cdc
e76c937
e4bf697
7766a5c
 
 
d125cdc
49d3ba7
 
 
d125cdc
49d3ba7
e4bf697
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e76c937
 
e4bf697
e76c937
 
 
e4bf697
 
e76c937
e4bf697
 
 
 
 
 
 
 
 
 
 
 
e76c937
 
 
e4bf697
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e76c937
49d3ba7
e76c937
71b45b9
7766a5c
d125cdc
 
 
851e8b5
49d3ba7
028a367
851e8b5
 
 
 
7766a5c
851e8b5
49d3ba7
d125cdc
49d3ba7
 
d125cdc
 
e4bf697
49d3ba7
028a367
49d3ba7
7766a5c
 
 
 
 
 
 
 
 
 
 
49d3ba7
 
 
 
7766a5c
49d3ba7
 
 
028a367
49d3ba7
028a367
49d3ba7
 
 
 
 
 
028a367
49d3ba7
 
7766a5c
 
49d3ba7
 
 
 
 
71b45b9
49d3ba7
851e8b5
71b45b9
d125cdc
851e8b5
71b45b9
 
e76c937
d125cdc
71b45b9
 
 
49d3ba7
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import os
import io
import time
import sys
import subprocess
import requests
from PIL import Image, ImageSequence
import gradio as gr

# llama-cpp-python import
try:
    from llama_cpp import Llama
except Exception as e:
    # Fail fast: nothing below can run without llama-cpp-python, and the
    # wrapped message is easier to spot in Space logs than a bare ImportError.
    raise RuntimeError("llama-cpp-python import failed: " + str(e))

# Directory the GGUF lives in, and the canonical filename the rest of this
# module loads from (other downloaded variants get linked/copied here).
MODEL_DIR = "model"
EXPECTED_TARGET = os.path.join(MODEL_DIR, "llama-joycaption-q4_k_m.gguf")

# Candidate direct-download URLs (try in order)
# Each entry is (source_url, local_destination); ensure_model() walks the
# list until one downloads and passes the GGUF header check.
CANDIDATES = [
    # Jasaga7818 copy (often a direct GGUF)
    ("https://huggingface.co/Jasaga7818/llama-joycaption-beta-one-hf-llava-Q4_K_M-GGUF/resolve/main/llama-joycaption-beta-one-hf-llava-q4_k_m.gguf",
     EXPECTED_TARGET),
    # mradermacher (alternate host)
    ("https://huggingface.co/mradermacher/llama-joycaption-beta-one-hf-llava-GGUF/resolve/main/llama-joycaption-beta-one-hf-llava-q4_k_m.gguf",
     EXPECTED_TARGET),
    # Fallback to Q4_K_S (Jasaga)
    ("https://huggingface.co/Jasaga7818/llama-joycaption-beta-one-hf-llava-Q4_K_M-GGUF/resolve/main/llama-joycaption-beta-one-hf-llava-q4_k_s.gguf",
     os.path.join(MODEL_DIR, "llama-joycaption-q4_k_s.gguf")),
    # Fallback to Q4_K_S (mradermacher mirror)
    ("https://huggingface.co/mradermacher/llama-joycaption-beta-one-hf-llava-GGUF/resolve/main/llama-joycaption-beta-one-hf-llava-q4_k_s.gguf",
     os.path.join(MODEL_DIR, "llama-joycaption-q4_k_s.gguf")),
]

def download_curl(url: str, path: str) -> bool:
    """Download *url* to *path* with the curl CLI.

    Returns True on success, False on any failure. A partial file left by a
    failed transfer is removed so a later retry against a *different*
    candidate URL cannot resume into a mix of two sources.

    The parent directory of *path* is created if needed.
    """
    os.makedirs(os.path.dirname(path), exist_ok=True)
    try:
        # curl gives us resume support (-C -) and progress output in the logs.
        subprocess.check_call(["curl", "-L", "-C", "-", "-o", path, url])
        return True
    # CalledProcessError: curl ran but exited non-zero.
    # OSError: curl binary missing / not executable.
    # (Previously a bare `except Exception` hid unrelated bugs such as a
    # bad argument list.)
    except (subprocess.CalledProcessError, OSError):
        try:
            if os.path.exists(path):
                os.remove(path)
        except OSError:
            pass  # best-effort cleanup only
        return False

def is_valid_gguf(path: str) -> bool:
    """Return True if *path* starts with the 4-byte GGUF magic.

    GGUF files begin with ASCII "GGUF" (0x47 0x47 0x55 0x46) at offset 0.
    Hugging Face "resolve" URLs occasionally serve an HTML error or redirect
    page instead of the model; those fail this header check.
    """
    try:
        with open(path, "rb") as handle:
            magic = handle.read(8)[:4]
    except Exception:
        # Missing or unreadable file counts as invalid.
        return False
    return magic == b"GGUF"

def ensure_model() -> str:
    """Return the path of a verified GGUF model, downloading it if needed.

    Tries each (url, dest) pair in CANDIDATES in order until one downloads
    and passes the GGUF header check. When the winning candidate is not the
    canonical filename, a symlink (or copy, as fallback) is created at
    EXPECTED_TARGET so the rest of the module has one stable path.

    Returns:
        EXPECTED_TARGET (always — even when the bytes physically live at a
        candidate-specific dest; see the symlink/copy step).

    Raises:
        FileNotFoundError: when every candidate fails to download or verify.
    """
    # If already present (and valid), use it.
    if os.path.exists(EXPECTED_TARGET) and is_valid_gguf(EXPECTED_TARGET):
        sys.stderr.write(f"Model already present and valid at {EXPECTED_TARGET}\n")
        return EXPECTED_TARGET

    sys.stderr.write("Model not found locally or invalid, attempting download (several GB)...\n")
    for url, dest in CANDIDATES:
        sys.stderr.write(f"Attempting download: {url} -> {dest}\n")
        if download_curl(url, dest):
            sys.stderr.write(f"Downloaded candidate to {dest}; verifying header...\n")
            if is_valid_gguf(dest):
                # If candidate wasn't the expected filename, create symlink so rest of code can use EXPECTED_TARGET.
                if os.path.abspath(dest) != os.path.abspath(EXPECTED_TARGET):
                    try:
                        if os.path.exists(EXPECTED_TARGET):
                            os.remove(EXPECTED_TARGET)
                        # Relative link target works because dest and
                        # EXPECTED_TARGET share the same MODEL_DIR directory.
                        os.symlink(os.path.basename(dest), EXPECTED_TARGET)
                        sys.stderr.write(f"Created symlink {EXPECTED_TARGET} -> {os.path.basename(dest)}\n")
                    except Exception:
                        # fallback: copy (symlinks may be unsupported on the filesystem)
                        try:
                            import shutil
                            shutil.copyfile(dest, EXPECTED_TARGET)
                            sys.stderr.write(f"Copied {dest} to {EXPECTED_TARGET}\n")
                        except Exception:
                            # NOTE(review): on this path EXPECTED_TARGET is
                            # returned below but may not exist; the module-level
                            # existence check after ensure_model() catches it.
                            sys.stderr.write("Warning: failed to symlink or copy candidate to expected filename.\n")
                sys.stderr.write("Model verified as GGUF and ready.\n")
                return EXPECTED_TARGET
            else:
                sys.stderr.write("Downloaded file is not a valid GGUF (header mismatch). Removing and trying next.\n")
                try:
                    os.remove(dest)
                except Exception:
                    pass
        else:
            sys.stderr.write("Download failed for candidate; trying next.\n")

    raise FileNotFoundError("Failed to download a valid GGUF model from candidates. Check URLs and repo availability.")

# Ensure model exists and is a GGUF before importing/initializing Llama
# (runs at import time; on a fresh Space this blocks for the full download).
MODEL_PATH = ensure_model()
if not os.path.exists(MODEL_PATH):
    # Belt and braces: ensure_model() returns EXPECTED_TARGET even when the
    # symlink/copy fallback failed, so re-check before handing it to Llama.
    raise FileNotFoundError(f"Model not found at {MODEL_PATH} after download attempt.")

def download_bytes(url: str, timeout: int = 30) -> bytes:
    """Fetch *url* and return the full response body as bytes.

    Args:
        url: HTTP(S) URL of the image to fetch.
        timeout: per-request timeout in seconds.

    Raises:
        requests.RequestException (incl. HTTPError on non-2xx status);
        generate_caption_from_url() turns these into user-visible messages.
    """
    # The old version passed stream=True and then read .content, which loads
    # the whole body anyway — a plain GET is equivalent and less misleading.
    with requests.get(url, timeout=timeout) as resp:
        resp.raise_for_status()
        return resp.content

def load_first_frame_from_bytes(raw: bytes):
    """Decode *raw* image bytes into an RGB PIL image.

    For animated sources (is_animated is truthy) only the first frame is
    kept. The result is always converted to RGB so it can be re-encoded as
    JPEG downstream.
    """
    image = Image.open(io.BytesIO(raw))
    if getattr(image, "is_animated", False):
        # Grab frame 0 of the animation.
        image = next(ImageSequence.Iterator(image))
    return image if image.mode == "RGB" else image.convert("RGB")

def make_prompt_for_image(image_path: str, user_prompt: str = "Describe the image."):
    """Build the chat-style prompt for a JoyCaption multimodal GGUF.

    The <img>...</img> tag carries the on-disk path of the image the model
    loader should ingest, followed by a User/Assistant turn skeleton.
    """
    # JoyCaption-style multimodal GGUFs accept <img>{path}</img>
    parts = (
        f"<img>{image_path}</img>",
        f"User: {user_prompt}",
        "Assistant:",
    )
    return "\n".join(parts)

# Initialize model (low-resource options)
print("Loading GGUF model (this can take 30–120s)...", file=sys.stderr)
# Adjust n_threads for the Space CPU; increase if you know you have more cores available.
# n_ctx=2048 keeps memory modest; raise it if prompts/captions get truncated.
llm = Llama(model_path=MODEL_PATH, n_ctx=2048, n_threads=2)

def generate_caption_from_url(url: str, prompt: str = "Describe the image."):
    """Download an image from *url*, run the model on it, return a caption.

    Every failure mode is returned as a human-readable string rather than
    raised, because the return value feeds a Gradio textbox directly.
    """
    if not url:
        return "No URL provided."
    try:
        raw = download_bytes(url)
    except Exception as e:
        return f"Download error: {e}"
    try:
        img = load_first_frame_from_bytes(raw)
    except Exception as e:
        return f"Image processing error: {e}"

    # Re-encode to a temp JPEG so the prompt can reference an on-disk path.
    tmp_dir = "/tmp/joycap"
    os.makedirs(tmp_dir, exist_ok=True)
    ts = int(time.time() * 1000)
    tmp_path = os.path.join(tmp_dir, f"{ts}.jpg")
    try:
        img.save(tmp_path, format="JPEG", quality=85)
    except Exception as e:
        return f"Failed to save temp image: {e}"

    prompt_full = make_prompt_for_image(tmp_path, prompt)
    try:
        # BUG FIX: llama_cpp.Llama has no .create() method — the completion
        # API is create_completion() (also callable as llm(...)). The old
        # call raised AttributeError, so every request returned
        # "Inference error: 'Llama' object has no attribute 'create'".
        # NOTE(review): the <img> tag is only honored when the model/loader
        # is multimodal-aware (clip handler); plain llama.cpp treats it as
        # literal text — confirm against the deployed llama-cpp-python setup.
        resp = llm.create_completion(
            prompt=prompt_full,
            max_tokens=256,
            temperature=0.2,
            top_p=0.95,
            stop=["User:", "Assistant:"],
        )
        text = resp.get("choices", [{}])[0].get("text", "").strip()
        return text or "No caption generated."
    except Exception as e:
        return f"Inference error: {e}"
    finally:
        # Best-effort removal of the temp JPEG, even after inference errors.
        try:
            os.remove(tmp_path)
        except Exception:
            pass

# Gradio UI: image URL + optional prompt in, caption text out.
iface = gr.Interface(
    fn=generate_caption_from_url,
    inputs=[
        gr.Textbox(label="Image URL", placeholder="https://example.com/photo.jpg"),
        gr.Textbox(label="Prompt (optional)", value="Describe the image."),
    ],
    outputs=gr.Textbox(label="Generated caption"),
    title="JoyCaption GGUF (Q4_K)",
    description="Runs a quantized JoyCaption GGUF locally via llama.cpp (no external API).",
)

if __name__ == "__main__":
    # Bind to all interfaces on 7860, the standard Hugging Face Spaces port.
    iface.launch(server_name="0.0.0.0", server_port=7860)