File size: 7,535 Bytes
cd5ca02
b3b505f
cd5ca02
e4bf697
a8bd35c
7766a5c
 
cd5ca02
b3b505f
e4bf697
b3b505f
 
 
e4bf697
b3b505f
 
71b45b9
cd5ca02
b3b505f
cd5ca02
 
b3b505f
cd5ca02
b3b505f
 
 
 
 
cd5ca02
 
b3b505f
cd5ca02
b3b505f
 
cd5ca02
b3b505f
cd5ca02
b3b505f
cd5ca02
 
 
b3b505f
 
 
 
 
 
 
 
 
 
 
 
 
a8bd35c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cd5ca02
b3b505f
cd5ca02
 
 
 
 
 
b3b505f
cd5ca02
b3b505f
 
 
 
 
 
cd5ca02
b3b505f
 
cd5ca02
 
b3b505f
 
 
 
cd5ca02
b3b505f
 
cd5ca02
b3b505f
cd5ca02
 
b3b505f
 
cd5ca02
 
b3b505f
cd5ca02
 
b3b505f
 
 
cd5ca02
b3b505f
cd5ca02
b3b505f
 
 
cd5ca02
b3b505f
7766a5c
 
 
b3b505f
7766a5c
b3b505f
7766a5c
b3b505f
cd5ca02
 
 
 
b3b505f
a8bd35c
 
b3b505f
a8bd35c
 
7766a5c
a8bd35c
7766a5c
b3b505f
a8bd35c
 
 
 
028a367
b3b505f
 
 
a8bd35c
 
 
 
 
b3b505f
 
cd5ca02
b3b505f
 
 
 
 
 
 
7766a5c
b3b505f
cd5ca02
b3b505f
851e8b5
b3b505f
71b45b9
b3b505f
 
71b45b9
 
 
a8bd35c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/usr/bin/env python3
import os, io, re, sys, subprocess, hashlib, pathlib, time
from typing import Optional
import requests
from PIL import Image, ImageSequence, UnidentifiedImageError # Import UnidentifiedImageError
import gradio as gr

# Directory (relative to the current working directory) that holds downloaded
# GGUF weights. It is created as a side effect of importing this module.
MODEL_DIR = pathlib.Path("model")
MODEL_DIR.mkdir(exist_ok=True, parents=True)

# Public mradermacher GGUF links (no tokens)
# ensure_model() tries PRIMARY_URL first and FALLBACK_URL if that download or load fails.
PRIMARY_URL = "https://huggingface.co/mradermacher/llama-joycaption-beta-one-hf-llava-GGUF/resolve/main/llama-joycaption-q4_k_s.gguf"
FALLBACK_URL = "https://huggingface.co/mradermacher/llama-joycaption-beta-one-hf-llava-GGUF/resolve/main/llama-joycaption-q4_k_m.gguf"

# Local destinations inside MODEL_DIR for the two downloads above.
PRIMARY_NAME = MODEL_DIR / "llama-joycaption-q4_k_s.gguf"
FALLBACK_NAME = MODEL_DIR / "llama-joycaption-q4_k_m.gguf"

# Generation params
# Forwarded unchanged to the llama-cpp call in generate_caption_from_url().
MAX_TOKENS = 128
TEMPERATURE = 0.2
TOP_P = 0.95
STOP = ["\n"]  # stop sequences passed as `stop=`: generation ends at the first newline

def download_file(url: str, dest: pathlib.Path, timeout=120):
    """Download *url* to *dest*, doing nothing if *dest* already exists.

    The body is streamed into a sibling ``<name>.part`` file and moved onto
    *dest* only once the transfer has finished. An interrupted download
    therefore never leaves a truncated file behind that a later call would
    mistake for a complete one.

    Args:
        url: HTTP(S) location of the file.
        dest: Final path of the downloaded file.
        timeout: Timeout in seconds handed to ``requests.get``.

    Raises:
        requests.RequestException: On connection errors or a non-2xx status.
        OSError: If the file cannot be written, or fewer bytes arrive than the
            server announced.
    """
    if dest.exists():
        return
    print("Downloading", url)
    partial = dest.with_name(dest.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            total = int(r.headers.get("content-length", 0) or 0)
            # Content-Length counts encoded bytes, so the size check below is
            # only meaningful when the body is not transfer-compressed.
            compressed = bool(r.headers.get("content-encoding"))
            done = 0
            shown_pct = -1
            with open(partial, "wb") as f:
                # 1 MiB chunks: the files are model weights, so tiny chunks
                # only add per-iteration overhead.
                for chunk in r.iter_content(1024 * 1024):
                    if not chunk:
                        continue
                    f.write(chunk)
                    done += len(chunk)
                    if total:
                        pct = done * 100 // total
                        # Redraw only when the visible number changes.
                        if pct != shown_pct:
                            shown_pct = pct
                            print(f"\r{dest.name}: {pct}% ", end="", flush=True)
        if total and not compressed and done < total:
            raise OSError(
                f"Incomplete download of {dest.name}: got {done} of {total} bytes"
            )
        # Atomic on the same filesystem: dest appears only when complete.
        os.replace(partial, dest)
    except BaseException:
        # Best-effort cleanup of the partial file; the original error matters more.
        try:
            partial.unlink()
        except OSError:
            pass
        raise
    print()

def mp4_to_gif(mp4_bytes: bytes) -> bytes:
    """Convert MP4 bytes to GIF bytes through the ezgif web page.

    The video is posted to ezgif's video-to-gif endpoint, the returned HTML is
    searched for a ``.gif`` image source, and that GIF is downloaded.

    Args:
        mp4_bytes: Raw contents of the MP4 file.

    Returns:
        The raw bytes of the GIF referenced by the response page.

    Raises:
        RuntimeError: If no GIF URL can be found in the response page.
        requests.RequestException: If either HTTP request fails.
    """
    upload = {"new-file": ("video.mp4", mp4_bytes, "video/mp4")}
    page = requests.post(
        "https://s.ezgif.com/video-to-gif",
        files=upload,
        data={"file":"video.mp4"},
        timeout=120,
    )
    page.raise_for_status()

    # Try the <img> pattern first, then the looser /tmp/ pattern.
    found = None
    for pattern in (r'<img[^>]+src="([^"]+\.gif)"', r'src="([^"]+?/tmp/[^"]+\.gif)"'):
        found = re.search(pattern, page.text)
        if found:
            break
    if found is None:
        raise RuntimeError("GIF URL not found")

    # Turn protocol-relative and root-relative links into absolute URLs.
    link = found.group(1)
    if link.startswith("//"):
        link = "https:" + link
    elif link.startswith("/"):
        link = "https://s.ezgif.com" + link

    gif_response = requests.get(link, timeout=60)
    gif_response.raise_for_status()
    return gif_response.content

def load_first_frame(raw: bytes):
    """Decode *raw* image bytes and return the first frame as an RGB image.

    Each failure is reported once, with the underlying exception chained,
    rather than a frame-extraction error being wrapped a second time by a
    catch-all handler.

    Args:
        raw: Encoded image bytes (still or animated).

    Returns:
        A PIL image in ``"RGB"`` mode.

    Raises:
        ValueError: If the bytes cannot be identified or opened, the first
            frame cannot be extracted, or the conversion to RGB fails.
    """
    try:
        img = Image.open(io.BytesIO(raw))
        # Evaluated inside the try: is_animated is looked up on the freshly
        # opened image and may itself fail on damaged data.
        animated = getattr(img, "is_animated", False)
    except UnidentifiedImageError as e:
        raise ValueError("Could not identify image format or image is corrupted.") from e
    except Exception as e:
        # Other PIL errors raised while opening/parsing the data.
        raise ValueError(f"Failed to load or process image with PIL: {e}") from e

    if animated:
        try:
            img = next(ImageSequence.Iterator(img))
        except Exception as e:
            raise ValueError(f"Could not extract first frame from animated image: {e}") from e

    try:
        if img.mode != "RGB":
            img = img.convert("RGB")
    except Exception as e:
        raise ValueError(f"Failed to load or process image with PIL: {e}") from e
    return img


def rebuild_llama_cpp():
    """Reinstall ``llama-cpp-python`` from source with pip.

    A plain ``pip install --upgrade`` is a no-op when the installed version is
    already the newest one, so the package is reinstalled with
    ``--force-reinstall`` (and ``--no-cache-dir`` so a previously built wheel is
    not reused). ``--no-deps`` keeps that forced step from reinstalling
    unrelated packages; a final ordinary install then only adds dependencies
    that are missing.

    Raises:
        subprocess.CalledProcessError: If any pip invocation exits non-zero.
    """
    env = os.environ.copy()
    # Forbid prebuilt wheels for this one package so pip compiles it locally.
    env["PIP_NO_BINARY"] = "llama-cpp-python"

    pip_install = [sys.executable, "-m", "pip", "install"]
    steps = (
        ["--upgrade", "pip"],
        ["--upgrade", "cmake", "wheel", "setuptools"],
        ["--upgrade", "--force-reinstall", "--no-deps", "--no-cache-dir", "llama-cpp-python"],
        # Requirement is already satisfied here; this only pulls missing dependencies.
        ["llama-cpp-python"],
    )
    for args in steps:
        subprocess.check_call(pip_install + args, env=env)

# Lazily created llama-cpp model instance shared by all requests.
_llama = None
# Set once a source rebuild has been tried, so repeated failing calls do not
# each start another lengthy pip build.
_rebuild_attempted = False


def _load_llama(url, path):
    """Download *path* from *url* if needed and return a ``Llama`` loaded from it."""
    import importlib

    download_file(url, path)
    llama_cpp = importlib.import_module("llama_cpp")
    print("Loading", path)
    return llama_cpp.Llama(model_path=str(path), n_ctx=2048, n_gpu_layers=0, verbose=False)


def _forget_llama_cpp_modules():
    """Drop cached ``llama_cpp`` modules so the next import sees the rebuilt package."""
    import importlib

    # Reloading only the top-level package would keep its submodules cached.
    stale = [name for name in sys.modules
             if name == "llama_cpp" or name.startswith("llama_cpp.")]
    for name in stale:
        del sys.modules[name]
    # Needed for packages installed after the interpreter started.
    importlib.invalidate_caches()
    # NOTE(review): a native library already loaded into this process may not be
    # replaced by a re-import; a process restart may still be required - confirm.


def ensure_model():
    """Make sure the module-level ``_llama`` model is loaded.

    Tries the primary GGUF, then the fallback. If neither loads, rebuilds
    ``llama-cpp-python`` from source (at most once per process) and tries both
    candidates again. Returns immediately when a model is already loaded.

    Raises:
        RuntimeError: If the rebuild fails, or no candidate loads after it.
    """
    global _llama, _rebuild_attempted
    if _llama is not None:
        return

    candidates = ((PRIMARY_URL, PRIMARY_NAME), (FALLBACK_URL, FALLBACK_NAME))

    # try primary then fallback
    last_error = None
    for url, path in candidates:
        try:
            _llama = _load_llama(url, path)
            print("Loaded model:", path.name)
            return
        except Exception as e:
            last_error = e
            print("Load failed for", path.name, ":", e)

    # rebuild once
    if _rebuild_attempted:
        raise RuntimeError(
            "Load failed and a rebuild was already attempted: " + str(last_error)
        ) from last_error
    _rebuild_attempted = True
    try:
        print("Rebuilding llama-cpp-python from source...")
        rebuild_llama_cpp()
    except Exception as e:
        raise RuntimeError("Rebuild failed: " + str(e)) from e

    # retry both candidates against the freshly installed package
    _forget_llama_cpp_modules()
    for url, path in candidates:
        try:
            _llama = _load_llama(url, path)
            print("Loaded after rebuild.")
            return
        except Exception as e:
            last_error = e
            print("Load failed for", path.name, ":", e)
    raise RuntimeError("Load after rebuild failed: " + str(last_error)) from last_error

def build_prompt(img_tag: str, user_prompt: str):
    """Return the prompt text: the image tag wrapped in ``<image>`` markers,
    the user's request on the next line, then a trailing ``Answer:`` cue."""
    template = "<image>{}</image>\n{}\nAnswer:"
    return template.format(img_tag, user_prompt)

# Major brands of ISO-BMFF still-image formats. They start with an "ftyp" box
# like MP4 does, but must not be sent through the video converter.
_ISOBMFF_IMAGE_BRANDS = (b"avif", b"avis", b"heic", b"heix", b"mif1", b"msf1")


def _looks_like_mp4(url, raw):
    """Return True if the URL path ends in ``.mp4`` or *raw* starts like an MP4 container."""
    from urllib.parse import urlsplit

    # urlsplit drops both the query string and the fragment from the path.
    if urlsplit(url).path.lower().endswith(".mp4"):
        return True
    head = raw[:16].lower()
    pos = head.find(b"ftyp")
    if pos == -1:
        return False
    # The four bytes after "ftyp" are the major brand of the container.
    return head[pos + 4:pos + 8] not in _ISOBMFF_IMAGE_BRANDS


def _png_base64(img):
    """Encode *img* as PNG and return the base64 text of the encoded bytes."""
    import base64

    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return base64.b64encode(buf.getvalue()).decode()


def generate_caption_from_url(url: str, prompt: str="Describe the image."):
    """Download an image, GIF or MP4 from *url* and return a model-generated caption.

    Every failure is reported as a human-readable string instead of an
    exception, because the return value is shown directly in the UI text box.

    Args:
        url: Location of the media file. Empty, ``None`` or whitespace-only
            values are rejected without a network request.
        prompt: Instruction for the model; falls back to
            ``"Describe the image."`` when empty.

    Returns:
        The stripped caption text, or a message describing what went wrong.
    """
    url = (url or "").strip()
    if not url:
        return "No URL provided."
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        raw = r.content
    except Exception as e:
        return "Download error: " + str(e)

    try:
        if _looks_like_mp4(url, raw):
            try:
                raw = mp4_to_gif(raw)
            except Exception as e:
                return "MP4→GIF conversion failed: " + str(e)
        img = load_first_frame(raw)
    except ValueError as e:  # raised by load_first_frame for undecodable data
        return "Image processing error: " + str(e)
    except Exception as e:  # anything else unexpected while preparing the image
        return "An unexpected image processing error occurred: " + str(e)

    # Resizing is best-effort: on failure the original-size image is used.
    try:
        img = img.resize((512,512), resample=Image.BICUBIC)
    except Exception as e:
        print(f"Warning: Image resizing failed: {e}. Attempting to proceed without resizing.")

    # create a tiny base64 tag to signal image presence (model must understand this format)
    # NOTE(review): the whole PNG is embedded as base64 text in the prompt, which
    # looks far larger than the n_ctx=2048 window used when loading the model -
    # confirm whether a multimodal handler is required instead.
    try:
        img_tag = _png_base64(img)
    except Exception as e:
        return "Failed to encode image to base64: " + str(e)

    prompt_text = build_prompt(img_tag, prompt or "Describe the image.")
    try:
        ensure_model()
        # call llama-cpp model
        out = _llama(prompt_text, max_tokens=MAX_TOKENS, temperature=TEMPERATURE, top_p=TOP_P, stop=STOP)
        # Tolerate a missing or empty "choices" list instead of raising IndexError.
        choices = out.get("choices") or [{}]
        text = choices[0].get("text", "") or ""
        return text.strip()
    except Exception as e:
        return "Inference error: " + str(e)

# Gradio UI: a URL box and a prompt box feed generate_caption_from_url, whose
# returned string (caption or error message) is shown in a single text box.
iface = gr.Interface(
    fn=generate_caption_from_url,
    inputs=[
        gr.Textbox(label="Image / GIF / MP4 URL"),
        gr.Textbox(label="Prompt", value="Describe the image."),
    ],
    outputs=gr.Textbox(label="Generated caption"),
    title="JoyCaption (minimal GGUF, auto-rebuild)",
    description="No tokens required. Downloads a public GGUF and runs locally via llama-cpp.",
)

# Start the web server only when run as a script; listens on all interfaces, port 7860.
if __name__ == "__main__":
    iface.launch(
        server_name="0.0.0.0",
        server_port=7860,
    )