Hug0endob committed on
Commit
b3b505f
·
verified ·
1 Parent(s): 17b564d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -264
app.py CHANGED
@@ -1,331 +1,158 @@
1
  #!/usr/bin/env python3
2
- # -*- coding: utf-8 -*-
3
-
4
- import os
5
- import io
6
- import re
7
- import sys
8
- import time
9
- import hashlib
10
- import pathlib
11
- import subprocess
12
  from typing import Optional
13
-
14
  import requests
15
  from PIL import Image, ImageSequence
16
  import gradio as gr
17
 
18
- # If you still want to use HF AutoProcessor / LlavaForConditionalGeneration for decoding,
19
- # keep transformers installed and uncomment the imports below. This file instead uses
20
- # llama-cpp-python for model inference (GGUF).
21
- from transformers import AutoProcessor
22
-
23
# ----------------------------------------------------------------------
# Configuration: model locations, optional checksums, generation settings
# ----------------------------------------------------------------------
# Directory the GGUF weights are downloaded into; created at import time.
MODEL_DIR = pathlib.Path("model")
MODEL_DIR.mkdir(exist_ok=True, parents=True)

# GGUF download locations. Replace with any other preferred variant
# (mradermacher or TheBloke builds).
Q4_K_M_URL = "https://huggingface.co/mradermacher/joycaption-llama/resolve/main/llama-joycaption-q4_k_m.gguf"
Q4_K_S_URL = "https://huggingface.co/mradermacher/joycaption-llama/resolve/main/llama-joycaption-q4_k_s.gguf"

# Optional SHA-256 digests used to validate the downloads; None skips the check.
Q4_K_M_SHA256: Optional[str] = None
Q4_K_S_SHA256: Optional[str] = None

# Generation params
MAX_NEW_TOKENS = 128
TEMPERATURE = 0.2
TOP_P = 0.95
STOP_STRS = ["\n"]  # generation stops at the first newline

# HF processor/model name used for tokenization and the chat template
HF_PROCESSOR_NAME = "fancyfeast/llama-joycaption-beta-one-hf-llava"
HF_TOKEN = os.getenv("HF_TOKEN")  # optional; None when the variable is unset
50
-
51
- # ----------------------------------------------------------------------
52
- # Utilities: downloads, checksum, mp4->gif, image load
53
- # ----------------------------------------------------------------------
54
def download_bytes(url: str, timeout: int = 30) -> bytes:
    """Fetch *url* with an HTTP GET and return the whole response body.

    Args:
        url: Address to download.
        timeout: Timeout in seconds handed to ``requests.get``.

    Raises:
        requests.HTTPError: The server answered with an error status
            (raised by ``raise_for_status``).
    """
    response = requests.get(url, stream=True, timeout=timeout)
    with response:
        response.raise_for_status()
        payload = response.content
    return payload
58
-
59
-
60
def mp4_to_gif(mp4_bytes: bytes) -> bytes:
    """Convert an MP4 clip to a GIF through the ezgif.com upload form.

    The clip is posted to ``https://s.ezgif.com/video-to-gif``; the returned
    HTML is scanned for the first ``.gif`` image source, which is then
    downloaded and returned.

    NOTE(review): this uploads the caller's media to a third-party service and
    depends on the layout of its HTML -- confirm that is acceptable.

    Args:
        mp4_bytes: Raw bytes of the MP4 file.

    Returns:
        Raw bytes of the resulting GIF.

    Raises:
        RuntimeError: No GIF URL could be located in the response page.
        requests.HTTPError: The upload or the GIF download returned an error
            status.
    """
    from urllib.parse import urljoin

    endpoint = "https://s.ezgif.com/video-to-gif"
    resp = requests.post(
        endpoint,
        files={"new-file": ("video.mp4", mp4_bytes, "video/mp4")},
        data={"file": "video.mp4"},
        timeout=120,
    )
    resp.raise_for_status()

    # Prefer an <img> tag; fall back to any src pointing into a /tmp/ folder.
    match = re.search(r'<img[^>]+src="([^"]+\.gif)"', resp.text)
    if not match:
        match = re.search(r'src="([^"]+?/tmp/[^"]+\.gif)"', resp.text)
    if not match:
        raise RuntimeError("Failed to extract GIF URL from ezgif response")

    # urljoin resolves protocol-relative ("//host/x.gif"), root-relative
    # ("/tmp/x.gif") and plain relative sources against the endpoint, and
    # leaves absolute URLs untouched.
    gif_url = urljoin(endpoint, match.group(1))
    with requests.get(gif_url, timeout=60) as gif_resp:
        gif_resp.raise_for_status()
        return gif_resp.content
82
-
83
 
84
def load_first_frame_from_bytes(raw: bytes) -> Image.Image:
    """Decode *raw* image bytes and return the first frame in RGB mode.

    Animated images (anything Pillow reports as ``is_animated``) are reduced
    to their first frame; other modes are converted to ``"RGB"``.
    """
    frame = Image.open(io.BytesIO(raw))
    animated = getattr(frame, "is_animated", False)
    if animated:
        frame = next(ImageSequence.Iterator(frame))
    if frame.mode == "RGB":
        return frame
    return frame.convert("RGB")
91
-
92
-
93
def sha256_of_file(path: pathlib.Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at *path*.

    The file is consumed in 64 KiB blocks so it is never loaded whole.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while True:
            block = stream.read(65536)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest()
99
-
100
-
101
def download_file(url: str, dest: pathlib.Path, expected_sha256: Optional[str] = None) -> None:
    """Download *url* to *dest*, optionally verifying a SHA-256 checksum.

    An existing *dest* is reused when no checksum is configured or when its
    digest matches; otherwise it is removed and fetched again. Data is written
    to a ``.part`` file next to *dest* and moved into place only after the
    transfer (and checksum, if any) succeeded, so *dest* never holds a
    truncated or mismatching download.

    Args:
        url: Source URL.
        dest: Target path.
        expected_sha256: Hex digest to validate against (case-insensitive);
            ``None`` or empty disables validation.

    Raises:
        ValueError: The downloaded data does not match ``expected_sha256``.
        requests.HTTPError: The server answered with an error status.
    """
    expected = expected_sha256.lower() if expected_sha256 else None

    if dest.is_file():
        if expected is None:
            # Nothing to validate against: trust the file instead of fetching
            # several gigabytes again on every start.
            return
        try:
            if sha256_of_file(dest) == expected:
                return
        except OSError:
            pass  # unreadable file: treat as corrupt and replace it
        # remove possibly corrupted/old file
        dest.unlink()

    print(f"Downloading model from {url} -> {dest}")
    partial = dest.with_name(dest.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=120) as r:
            r.raise_for_status()
            total = int(r.headers.get("content-length", 0) or 0)
            downloaded = 0
            last_pct = -1
            with open(partial, "wb") as f:
                for chunk in r.iter_content(chunk_size=1 << 20):
                    if not chunk:
                        continue
                    f.write(chunk)
                    downloaded += len(chunk)
                    if total:
                        pct = downloaded * 100 // total
                        if pct != last_pct:  # one log line per percent, not per chunk
                            print(f"\r{dest.name}: {pct}% ", end="", flush=True)
                            last_pct = pct
            print()
        if expected is not None:
            got = sha256_of_file(partial)
            if got != expected:
                raise ValueError(f"Checksum mismatch for {dest}: got {got}, expected {expected_sha256}")
        partial.replace(dest)
    finally:
        # Only present here when something above failed.
        if partial.exists():
            partial.unlink()
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
- # ----------------------------------------------------------------------
133
- # llama-cpp loading + automated rebuild
134
- # ----------------------------------------------------------------------
135
def rebuild_llama_cpp() -> None:
    """Reinstall ``llama-cpp-python`` from source using the running interpreter.

    Upgrades pip and the build tools first, then reinstalls the package with
    prebuilt wheels disabled for it.

    Raises:
        subprocess.CalledProcessError: A pip invocation exited non-zero.
    """
    env = os.environ.copy()
    env["PIP_NO_BINARY"] = "llama-cpp-python"  # no prebuilt wheel for this package
    pip_upgrade = [sys.executable, "-m", "pip", "install", "--upgrade"]

    subprocess.check_call(pip_upgrade + ["pip"], env=env)
    subprocess.check_call(pip_upgrade + ["cmake", "wheel", "setuptools"], env=env)
    # --force-reinstall: a plain --upgrade is a no-op when the installed
    #   version is already the newest, so nothing would be rebuilt.
    # --no-cache-dir: do not reuse a wheel built by an earlier attempt.
    # --no-deps: limit the forced reinstall to this one package.
    subprocess.check_call(
        pip_upgrade + ["--force-reinstall", "--no-deps", "--no-cache-dir", "llama-cpp-python"],
        env=env,
    )
142
 
 
143
 
144
def try_load_gguf() -> "llama_cpp.Llama":
    """Download and load a GGUF model, rebuilding llama-cpp-python if needed.

    Tries the Q4_K_M file and then the Q4_K_S file. When neither loads,
    ``rebuild_llama_cpp()`` is run once and the first candidate is tried again.

    Returns:
        The loaded ``llama_cpp.Llama`` instance.

    Raises:
        Exception: The last load error when the rebuild itself fails (chained
            to the rebuild error), or the error from the post-rebuild load.
    """
    import importlib

    candidates = [
        (Q4_K_M_URL, MODEL_DIR / "llama-joycaption-q4_k_m.gguf", Q4_K_M_SHA256),
        (Q4_K_S_URL, MODEL_DIR / "llama-joycaption-q4_k_s.gguf", Q4_K_S_SHA256),
    ]

    def _load(path, reload=False):
        # Imported lazily so an import-time failure is caught by the caller
        # and can trigger the rebuild.
        module = importlib.import_module("llama_cpp")
        if reload:
            module = importlib.reload(module)
        # minimal params; adjust n_ctx or gpu settings if available
        return module.Llama(model_path=str(path), n_ctx=2048, n_gpu_layers=0, verbose=False)

    last_exc = None
    for url, path, sha in candidates:
        try:
            download_file(url, path, expected_sha256=sha)
            print(f"Attempting to load GGUF: {path}")
            lm = _load(path)
        except Exception as e:
            print(f"Loading {path.name} failed: {e}")
            last_exc = e
        else:
            print("Model loaded successfully.")
            return lm

    # Every candidate failed: rebuild, then retry the first candidate once.
    print("Both GGUF variants failed to load. Rebuilding llama-cpp-python from source...")
    try:
        rebuild_llama_cpp()
    except Exception as e:
        print(f"Rebuild failed: {e}")
        if last_exc is not None:
            raise last_exc from e
        raise

    # Packages were installed while the interpreter was running; make the
    # import system rescan for them.
    # NOTE(review): reloading may not swap an already-loaded native library --
    # confirm whether a process restart is required after a rebuild.
    importlib.invalidate_caches()
    url, path, sha = candidates[0]
    try:
        if not path.is_file():
            download_file(url, path, expected_sha256=sha)
        lm = _load(path, reload=True)
    except Exception as e:
        print(f"Load after rebuild failed: {e}")
        raise
    print("Model loaded successfully after rebuild.")
    return lm
197
-
198
-
199
# ----------------------------------------------------------------------
# Processor and model wrapper
# ----------------------------------------------------------------------
# AutoProcessor is kept so the chat-template behaviour used previously can be
# reused when building the prompt.
# NOTE(review): trust_remote_code=True lets the model repository run its own
# code at load time -- confirm the repository is trusted.
_processor_kwargs = {"trust_remote_code": True, "num_additional_image_tokens": 1}
if HF_TOKEN:
    # Only pass a token when one is configured.
    _processor_kwargs["token"] = HF_TOKEN
processor = AutoProcessor.from_pretrained(HF_PROCESSOR_NAME, **_processor_kwargs)
209
-
210
# Lazy model holder
class ModelWrapper:
    """Owns the llama-cpp model and loads it the first time it is needed."""

    def __init__(self):
        # llama-cpp Llama instance; None until ensure_model() has run.
        self.llm = None

    def ensure_model(self):
        """Load the model through try_load_gguf() unless it is already loaded."""
        if self.llm is not None:
            return
        self.llm = try_load_gguf()

    def generate(self, prompt: str, max_new_tokens: int = MAX_NEW_TOKENS):
        """Complete *prompt* and return the text of the first choice.

        Sampling uses the module-level TEMPERATURE, TOP_P and STOP_STRS. An
        empty string is returned when the first choice carries no ``"text"``.
        """
        self.ensure_model()
        completion = self.llm(
            prompt,
            max_tokens=max_new_tokens,
            temperature=TEMPERATURE,
            top_p=TOP_P,
            stop=STOP_STRS,
        )
        # llama-cpp-python responses usually carry the text in
        # completion["choices"][0]["text"].
        first_choice = completion.get("choices", [{}])[0]
        return first_choice.get("text", "")


MODEL = ModelWrapper()
228
-
229
- # ----------------------------------------------------------------------
230
- # Inference: convert URL->image, build prompt via processor chat template, run llama-cpp
231
- # ----------------------------------------------------------------------
232
def generate_caption_from_url(url: str, prompt: str = "Describe the image.") -> str:
    """Download the media at *url* and return a caption, or an error message.

    Steps: download, convert MP4 to GIF when needed, take the first frame,
    resize to 512x512, build a prompt through the processor's chat template,
    and run it through the llama-cpp model. Failures are reported as text
    ("Download error: ...", "Image processing error: ...", ...) instead of
    being raised, so the UI always receives a string.

    NOTE(review): only the decoded *text* prompt is handed to the model below;
    no pixel data is passed along -- confirm this is intended.

    Args:
        url: Direct link to an image, GIF or MP4.
        prompt: Instruction for the model; an empty value falls back to
            "Describe the image.".
    """
    if not url:
        return "No URL provided."
    prompt = prompt or "Describe the image."

    try:
        raw = download_bytes(url)
    except Exception as e:
        return f"Download error: {e}"

    # Ignore query string and fragment when looking at the file extension.
    lower = url.lower().split("?")[0].split("#")[0]
    looks_like_mp4 = lower.endswith(".mp4") or b"ftyp" in raw[:16].lower()
    if looks_like_mp4:
        try:
            raw = mp4_to_gif(raw)
        except Exception as e:
            return f"MP4→GIF conversion failed: {e}"
    try:
        img = load_first_frame_from_bytes(raw)
    except Exception as e:
        return f"Image processing error: {e}"

    # Resize to a conservative size (512) expected by many VLMs
    try:
        img = img.resize((512, 512), resample=Image.BICUBIC)
    except Exception as e:
        # Best effort: continue with the original size but leave a trace.
        print(f"Resize failed, using original size: {e}")

    try:
        # Produce conversation so the processor inserts image token correctly
        conversation = [
            {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}
        ]
        inputs = processor.apply_chat_template(
            conversation,
            add_generation_prompt=True,
            return_tensors="pt",
            return_dict=True,
            images=img,
        )

        # Decode the templated input_ids back to plain text for llama-cpp.
        text_prompt = None
        if hasattr(processor, "tokenizer") and getattr(inputs, "input_ids", None) is not None:
            try:
                input_ids = inputs["input_ids"][0]
                if hasattr(input_ids, "cpu"):
                    ids = input_ids.cpu().numpy().tolist()  # tensor -> list of ints
                else:
                    ids = list(input_ids)
                text_prompt = processor.tokenizer.decode(ids, skip_special_tokens=True)
            except Exception:
                text_prompt = None  # handled by the fallback template below

        if not text_prompt:
            # Fallback: simple textual template with a tag where the image is referenced.
            text_prompt = f"<img> [image here] </img>\n{prompt}\nAnswer:"

        # Debug prints (Space logs)
        print("Prompt to model (truncated):", text_prompt[:512].replace("\n", "\\n"))

        out_text = MODEL.generate(text_prompt, max_new_tokens=MAX_NEW_TOKENS)
        return out_text.strip()
    except Exception as e:
        return f"Inference error: {e}"
300
-
301
 
302
# ----------------------------------------------------------------------
# Gradio UI (URL + prompt -> text)
# ----------------------------------------------------------------------
gradio_kwargs = {
    "fn": generate_caption_from_url,
    "inputs": [
        gr.Textbox(label="Image / GIF / MP4 URL", placeholder="https://example.com/photo.jpg"),
        gr.Textbox(label="Prompt (optional)", value="Describe the image."),
    ],
    "outputs": gr.Textbox(label="Generated caption"),
    "title": "JoyCaption - URL input (GGUF + auto-rebuild)",
    "description": "Paste a direct link to an image/GIF/MP4 (MP4 will be converted).",
}

# Retry without allow_flagging when gr.Interface rejects it with a TypeError.
try:
    iface = gr.Interface(allow_flagging="never", **gradio_kwargs)
except TypeError:
    iface = gr.Interface(**gradio_kwargs)

if __name__ == "__main__":
    try:
        iface.launch(server_name="0.0.0.0", server_port=7860)
    finally:
        # Best-effort cleanup: close the asyncio event loop if it is still open.
        try:
            import asyncio

            event_loop = asyncio.get_event_loop()
            if not event_loop.is_closed():
                event_loop.close()
        except Exception:
            pass
 
1
  #!/usr/bin/env python3
2
+ import os, io, re, sys, subprocess, hashlib, pathlib, time
 
 
 
 
 
 
 
 
 
3
  from typing import Optional
 
4
  import requests
5
  from PIL import Image, ImageSequence
6
  import gradio as gr
7
 
 
 
 
 
 
 
 
 
# Directory the GGUF weights are downloaded into; created at import time.
MODEL_DIR = pathlib.Path("model")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Public mradermacher GGUF links (no tokens)
# NOTE(review): confirm these file names exist in the repository.
PRIMARY_URL = "https://huggingface.co/mradermacher/llama-joycaption-beta-one-hf-llava-GGUF/resolve/main/llama-joycaption-q4_k_s.gguf"
FALLBACK_URL = "https://huggingface.co/mradermacher/llama-joycaption-beta-one-hf-llava-GGUF/resolve/main/llama-joycaption-q4_k_m.gguf"

# Local paths the two files are stored under.
PRIMARY_NAME = MODEL_DIR / "llama-joycaption-q4_k_s.gguf"
FALLBACK_NAME = MODEL_DIR / "llama-joycaption-q4_k_m.gguf"

# Generation params
MAX_TOKENS = 128
TEMPERATURE = 0.2
TOP_P = 0.95
STOP = ["\n"]  # generation stops at the first newline
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
def download_file(url: str, dest: pathlib.Path, timeout=120):
    """Download *url* to *dest* unless *dest* already exists.

    Data is streamed into a ``.part`` file next to *dest* and renamed only
    once the transfer has finished. Because an existing *dest* short-circuits
    every later call, it must never be a truncated download.

    Args:
        url: Source URL.
        dest: Target path; an existing file is left untouched.
        timeout: Timeout in seconds handed to ``requests.get``.

    Raises:
        requests.HTTPError: The server answered with an error status.
    """
    if dest.exists():
        return
    print("Downloading", url)
    partial = dest.with_name(dest.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            total = int(r.headers.get("content-length", 0) or 0)
            done = 0
            last_pct = -1
            with open(partial, "wb") as f:
                for chunk in r.iter_content(1 << 20):
                    if not chunk:
                        continue
                    f.write(chunk)
                    done += len(chunk)
                    if total:
                        pct = done * 100 // total
                        if pct != last_pct:  # one log line per percent, not per chunk
                            print(f"\r{dest.name}: {pct}% ", end="", flush=True)
                            last_pct = pct
            print()
        partial.replace(dest)
    finally:
        # Only present here when the transfer failed part-way.
        if partial.exists():
            partial.unlink()
 
 
 
 
41
 
42
def mp4_to_gif(mp4_bytes: bytes) -> bytes:
    """Convert an MP4 clip to a GIF through the ezgif.com upload form.

    The clip is posted to ``https://s.ezgif.com/video-to-gif``; the returned
    HTML is scanned for the first ``.gif`` image source, which is then
    downloaded and returned.

    NOTE(review): this uploads the caller's media to a third-party service and
    depends on the layout of its HTML -- confirm that is acceptable.

    Args:
        mp4_bytes: Raw bytes of the MP4 file.

    Returns:
        Raw bytes of the resulting GIF.

    Raises:
        RuntimeError: No GIF URL could be located in the response page.
        requests.HTTPError: The upload or the GIF download returned an error
            status.
    """
    from urllib.parse import urljoin

    endpoint = "https://s.ezgif.com/video-to-gif"
    files = {"new-file": ("video.mp4", mp4_bytes, "video/mp4")}
    resp = requests.post(endpoint, files=files, data={"file": "video.mp4"}, timeout=120)
    resp.raise_for_status()

    # Prefer an <img> tag; fall back to any src pointing into a /tmp/ folder.
    m = re.search(r'<img[^>]+src="([^"]+\.gif)"', resp.text) or re.search(
        r'src="([^"]+?/tmp/[^"]+\.gif)"', resp.text
    )
    if not m:
        raise RuntimeError("GIF URL not found")

    # urljoin resolves protocol-relative ("//host/x.gif"), root-relative
    # ("/tmp/x.gif") and plain relative sources against the endpoint, and
    # leaves absolute URLs untouched.
    gif_url = urljoin(endpoint, m.group(1))
    r2 = requests.get(gif_url, timeout=60)
    r2.raise_for_status()
    return r2.content
53
+
54
def load_first_frame(raw: bytes):
    """Decode *raw* image bytes and return the first frame in RGB mode.

    Animated images (anything Pillow reports as ``is_animated``) are reduced
    to their first frame; other modes are converted to ``"RGB"``.
    """
    frame = Image.open(io.BytesIO(raw))
    animated = getattr(frame, "is_animated", False)
    if animated:
        frame = next(ImageSequence.Iterator(frame))
    if frame.mode == "RGB":
        return frame
    return frame.convert("RGB")
60
 
61
def rebuild_llama_cpp():
    """Reinstall ``llama-cpp-python`` from source using the running interpreter.

    Upgrades pip and the build tools first, then reinstalls the package with
    prebuilt wheels disabled for it.

    Raises:
        subprocess.CalledProcessError: A pip invocation exited non-zero.
    """
    env = os.environ.copy()
    env["PIP_NO_BINARY"] = "llama-cpp-python"  # no prebuilt wheel for this package
    pip_upgrade = [sys.executable, "-m", "pip", "install", "--upgrade"]

    subprocess.check_call(pip_upgrade + ["pip"], env=env)
    subprocess.check_call(pip_upgrade + ["cmake", "wheel", "setuptools"], env=env)
    # --force-reinstall: a plain --upgrade is a no-op when the installed
    #   version is already the newest, so nothing would be rebuilt.
    # --no-cache-dir: do not reuse a wheel built by an earlier attempt.
    # --no-deps: limit the forced reinstall to this one package.
    subprocess.check_call(
        pip_upgrade + ["--force-reinstall", "--no-deps", "--no-cache-dir", "llama-cpp-python"],
        env=env,
    )
67
 
68
# Loaded llama-cpp model; None until ensure_model() succeeds.
_llama = None


def _load_llama(path, reload=False):
    """Import ``llama_cpp`` (optionally reloading it) and load the GGUF at *path*."""
    import importlib

    module = importlib.import_module("llama_cpp")
    if reload:
        module = importlib.reload(module)
    return module.Llama(model_path=str(path), n_ctx=2048, n_gpu_layers=0, verbose=False)


def ensure_model():
    """Make sure the module-level ``_llama`` holds a loaded model.

    Tries the primary file, then the fallback. If neither loads,
    ``rebuild_llama_cpp()`` is run once and the primary file is tried again.
    Does nothing when a model is already loaded.

    Raises:
        RuntimeError: The rebuild failed, or loading still failed after the
            rebuild. The underlying error is attached as ``__cause__``.
    """
    global _llama
    if _llama is not None:
        return
    import importlib

    # try primary then fallback
    for url, path in ((PRIMARY_URL, PRIMARY_NAME), (FALLBACK_URL, FALLBACK_NAME)):
        try:
            download_file(url, path)
            print("Loading", path)
            _llama = _load_llama(path)
            print("Loaded model:", path.name)
            return
        except Exception as e:
            print("Load failed for", path.name, ":", e)

    # rebuild once
    print("Rebuilding llama-cpp-python from source...")
    try:
        rebuild_llama_cpp()
    except Exception as e:
        raise RuntimeError("Rebuild failed: " + str(e)) from e

    # Packages were installed while the interpreter was running; make the
    # import system rescan for them.
    # NOTE(review): reloading may not swap an already-loaded native library --
    # confirm whether a process restart is required after a rebuild.
    importlib.invalidate_caches()

    # retry primary
    try:
        download_file(PRIMARY_URL, PRIMARY_NAME)
        _llama = _load_llama(PRIMARY_NAME, reload=True)
    except Exception as e:
        raise RuntimeError("Load after rebuild failed: " + str(e)) from e
    print("Loaded after rebuild.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
def build_prompt(img_tag: str, user_prompt: str):
    """Return the minimal prompt text sent to the model.

    The image payload is wrapped in ``<image>...</image>``, followed by the
    user's request on its own line and a trailing ``Answer:`` cue.
    """
    template = "<image>{}</image>\n{}\nAnswer:"
    return template.format(img_tag, user_prompt)
108
 
109
def generate_caption_from_url(url: str, prompt: str = "Describe the image."):
    """Download the media at *url* and return a caption, or an error message.

    Steps: download, convert MP4 to GIF when needed, take the first frame,
    resize to 512x512, base64-encode it as PNG into the prompt, and run the
    prompt through the llama-cpp model. Failures are reported as text
    ("Download error: ...", "Image processing error: ...", ...) instead of
    being raised, so the UI always receives a string.

    NOTE(review): the whole base64 PNG is pasted into a text prompt while the
    model is opened with n_ctx=2048; presumably this overflows the context for
    typical images -- confirm, and check whether the model understands this
    format at all.

    Args:
        url: Direct link to an image, GIF or MP4.
        prompt: Instruction for the model; an empty value falls back to
            "Describe the image.".
    """
    import base64

    if not url:
        return "No URL provided."
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        raw = r.content
    except Exception as e:
        return "Download error: " + str(e)

    # Treat as MP4 by extension (query string ignored) or by an "ftyp" marker
    # near the start of the data.
    lower = url.lower().split("?")[0]
    looks_like_mp4 = lower.endswith(".mp4") or b"ftyp" in raw[:16].lower()
    if looks_like_mp4:
        try:
            raw = mp4_to_gif(raw)
        except Exception as e:
            return "MP4→GIF conversion failed: " + str(e)
    try:
        img = load_first_frame(raw)
    except Exception as e:
        return "Image processing error: " + str(e)

    try:
        img = img.resize((512, 512), resample=Image.BICUBIC)
    except Exception as e:
        # Best effort: continue with the original size but leave a trace.
        print("Resize failed, using original size:", e)

    # Encode the frame as base64 PNG to signal image presence in the prompt.
    # Guarded like the other steps so a failure becomes an error string.
    try:
        buf = io.BytesIO()
        img.save(buf, format="PNG")
        img_tag = base64.b64encode(buf.getvalue()).decode()
    except Exception as e:
        return "Image processing error: " + str(e)

    prompt_text = build_prompt(img_tag, prompt or "Describe the image.")
    try:
        ensure_model()
        # call llama-cpp model
        out = _llama(prompt_text, max_tokens=MAX_TOKENS, temperature=TEMPERATURE, top_p=TOP_P, stop=STOP)
        choices = out.get("choices") or [{}]  # tolerate a missing or empty list
        return choices[0].get("text", "").strip()
    except Exception as e:
        return "Inference error: " + str(e)
 
148
 
149
# Gradio UI: URL + prompt in, caption text out.
_url_box = gr.Textbox(label="Image / GIF / MP4 URL")
_prompt_box = gr.Textbox(label="Prompt", value="Describe the image.")

iface = gr.Interface(
    fn=generate_caption_from_url,
    inputs=[_url_box, _prompt_box],
    outputs=gr.Textbox(label="Generated caption"),
    title="JoyCaption (minimal GGUF, auto-rebuild)",
    description="No tokens required. Downloads a public GGUF and runs locally via llama-cpp.",
)

if __name__ == "__main__":
    # Listen on all interfaces, port 7860.
    iface.launch(server_name="0.0.0.0", server_port=7860)