|
|
import spaces

import os

# These environment variables are set BEFORE llama_cpp is imported below.
# NOTE(review): LLAMA_CUBLAS / GGML_CUDA_FORCE_DMMV are historically
# build-time flags for llama.cpp — confirm the installed wheel actually
# honors them at runtime.
os.environ["LLAMA_CUBLAS"] = "1"

os.environ["GGML_CUDA_FORCE_DMMV"] = "1"

import json

import subprocess

# Internal module is needed to monkeypatch LlamaModel.close/__del__ below.
import llama_cpp._internals as internals

from llama_cpp.llama_chat_format import Qwen3VLChatHandler
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Keep a handle to the library's original close so the patched version
# can delegate to it.
_original_close = internals.LlamaModel.close


def safe_close(self):
    """Close the model only when a sampler is attached, never raising.

    Workaround shim: when no sampler is present (or close itself fails),
    the native close is skipped entirely so teardown cannot crash.
    """
    sampler = getattr(self, "sampler", None)
    if sampler is None:
        return None
    try:
        return _original_close(self)
    except Exception:
        return None


internals.LlamaModel.close = safe_close
|
|
|
|
|
|
|
|
def safe_del(self):
    """Destructor shim that swallows every error during teardown."""
    close = getattr(self, "close", None)
    if close is None:
        return
    try:
        close()
    except Exception:
        pass


# Install the shim so GC / interpreter shutdown cannot crash on a
# half-initialized model.
internals.LlamaModel.__del__ = safe_del
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from llama_cpp import Llama |
|
|
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType |
|
|
from llama_cpp_agent.providers import LlamaCppPythonProvider |
|
|
from llama_cpp_agent.chat_history import BasicChatHistory |
|
|
from llama_cpp_agent.chat_history.messages import Roles |
|
|
import gradio as gr |
|
|
from huggingface_hub import hf_hub_download |
|
|
|
|
|
import llama_cpp

# Debug: show which llama_cpp build (path and version) is actually loaded.
print(llama_cpp.__file__)

print(llama_cpp.__version__)

# Token for gated/private Hugging Face repos; may be None in local runs.
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Fetch the quantized weights and the matching multimodal projector.
# Note: the weights come from unsloth's mirror while the mmproj file
# comes from Qwen's official GGUF repo.
for _repo_id, _filename in (
    ("unsloth/Qwen3-VL-32B-Thinking-GGUF", "Qwen3-VL-32B-Thinking-Q8_0.gguf"),
    ("Qwen/Qwen3-VL-32B-Thinking-GGUF", "mmproj-Qwen3VL-32B-Thinking-F16.gguf"),
):
    hf_hub_download(
        repo_id=_repo_id,
        filename=_filename,
        local_dir="./models",
    )
|
|
from huggingface_hub import snapshot_download |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os


def print_tree(start_path="models"):
    """Print a unicode tree of *start_path* and return the printed lines.

    Fix: the previous version derived depth with
    ``root.replace(start_path, "").count(os.sep)``, which replaces ALL
    occurrences of the start_path string anywhere in the path and so
    miscounts depth when that substring reappears in a subdirectory name.
    ``os.path.relpath`` computes the depth robustly.

    Args:
        start_path: Directory to walk; a nonexistent path prints nothing.

    Returns:
        list[str]: the lines that were printed (useful for testing/logging).
    """
    lines = []
    for root, dirs, files in os.walk(start_path):
        rel = os.path.relpath(root, start_path)
        # Depth 0 for the root itself; below it, 1 + number of separators.
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1
        indent = "│   " * level
        lines.append(f"{indent}├── {os.path.basename(root)}/")

        subindent = "│   " * (level + 1)
        for f in files:
            lines.append(f"{subindent}├── {f}")

    for line in lines:
        print(line)
    return lines


print_tree("models")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import gc |
|
|
import torch |
|
|
|
|
|
def delete_llama_model(llm):
    """Best-effort teardown of a llama_cpp model and its GPU memory.

    Closes the model (reporting but ignoring close errors), drops the
    local reference, forces a GC pass, then asks torch to release cached
    CUDA memory. Safe to call with None.

    Args:
        llm: The Llama instance to unload, or None.

    Returns:
        None.
    """
    if llm is not None:
        try:
            llm.close()
        except Exception as e:
            # Close can fail on partially initialized models; log and move on.
            print("Close error:", e)

    # Drop our reference so the collector can actually free the model.
    llm = None

    gc.collect()

    # CUDA cleanup is optional: these calls raise on CPU-only builds.
    # Fix: was a bare `except:` which also swallowed KeyboardInterrupt.
    try:
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        torch.cuda.synchronize()
    except Exception:
        pass

    print("Model fully unloaded.")
|
|
|
|
|
|
|
|
|
|
|
# Lazily-constructed model handles; populated on first use in respond().
llm = llm_model_glm = llm_model_qwen = None
|
|
|
|
|
|
|
|
|
|
|
# Extension -> MIME type map used by image_to_base64_data_uri().
_IMAGE_MIME_TYPES = {
    # Common web formats.
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.gif': 'image/gif',
    '.webp': 'image/webp',
    # Modern / JPEG 2000 family.
    '.avif': 'image/avif',
    '.jp2': 'image/jp2',
    '.j2k': 'image/jp2',
    '.jpx': 'image/jp2',
    # Legacy raster formats.
    '.bmp': 'image/bmp',
    '.ico': 'image/x-icon',
    '.pcx': 'image/x-pcx',
    '.tga': 'image/x-tga',
    '.icns': 'image/icns',
    # Print / professional formats.
    '.tif': 'image/tiff',
    '.tiff': 'image/tiff',
    '.eps': 'application/postscript',
    '.dds': 'image/vnd-ms.dds',
    '.dib': 'image/dib',
    '.sgi': 'image/sgi',
    # Netpbm family.
    '.pbm': 'image/x-portable-bitmap',
    '.pgm': 'image/x-portable-graymap',
    '.ppm': 'image/x-portable-pixmap',
    # Misc / Pillow-specific.
    '.xbm': 'image/x-xbitmap',
    '.mpo': 'image/mpo',
    '.msp': 'image/msp',
    '.im': 'image/x-pillow-im',
    '.qoi': 'image/qoi',
}


def image_to_base64_data_uri(
    file_path: str,
    *,
    fallback_mime: str = "application/octet-stream"
) -> str:
    """
    Convert a local image file to a base64-encoded data URI with the correct MIME type.

    Supports 20+ image formats (PNG, JPEG, WebP, AVIF, BMP, ICO, TIFF, etc.).

    Args:
        file_path: Path to the image file on disk.
        fallback_mime: MIME type used when the file extension is unknown.

    Returns:
        A valid data URI string (e.g., data:image/webp;base64,...).

    Raises:
        FileNotFoundError: If the file does not exist.
        OSError: If reading the file fails.
    """
    # Fix: `base64` was never imported at module level anywhere in this
    # file, so the original function raised NameError on first use.
    import base64

    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Image file not found: {file_path}")

    extension = os.path.splitext(file_path)[1].lower()
    mime_type = _IMAGE_MIME_TYPES.get(extension, fallback_mime)

    if mime_type == fallback_mime:
        print(f"Warning: Unknown extension '{extension}' for '{file_path}'. "
              f"Using fallback MIME type: {fallback_mime}")

    try:
        with open(file_path, "rb") as img_file:
            encoded_data = base64.b64encode(img_file.read()).decode("utf-8")
    except OSError as e:
        raise OSError(f"Failed to read image file '{file_path}': {e}") from e

    return f"data:{mime_type};base64,{encoded_data}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@spaces.GPU(duration=120)
def respond(
    message,
    history: list[tuple[str, str]],
    model,
    system_message,
    max_tokens,
    temperature,
    top_p,
    top_k,
    repeat_penalty,
):
    """Gradio chat handler: lazily loads the selected GGUF model and yields a reply.

    Fixes vs the original: the system message and all sampling controls
    (max_tokens, temperature, top_p, top_k, repeat_penalty) were accepted
    but ignored, the system prompt was hardcoded to "hi", and the raw
    completion dict was yielded instead of the assistant's text.

    Args:
        message: Current user turn (stringified before sending).
        history: Prior (user, assistant) turns — currently unused.
        model: Filename of the GGUF model selected in the UI.
        system_message: System prompt from the UI textbox.
        max_tokens / temperature / top_p / top_k / repeat_penalty:
            Sampling parameters forwarded to create_chat_completion.

    Yields:
        str: The assistant reply text.
    """
    global llm
    global llm_model_qwen

    if model == "Qwen3-VL-32B-Thinking-Q8_0.gguf":
        # Load once and cache in the module-level handle.
        if llm_model_qwen is None:
            llm_model_qwen = Llama(
                model_path="models/Qwen3-VL-32B-Thinking-Q8_0.gguf",
                flash_attn=True,
                n_gpu_layers=-1,
                n_batch=2048,
                # NOTE(review): 8196 looks like a typo for 8192 — confirm.
                n_ctx=8196,
                n_threads=16,
                use_mlock=True,
                verbose=True,
                chat_handler=Qwen3VLChatHandler(
                    clip_model_path="models/mmproj-Qwen3VL-32B-Thinking-F16.gguf",
                    force_reasoning=True,
                    image_min_tokens=1024,
                ),
            )

        response = llm_model_qwen.create_chat_completion(
            messages=[
                {"role": "system", "content": str(system_message)},
                {"role": "user", "content": str(message)},
            ],
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            top_k=top_k,
            repeat_penalty=repeat_penalty,
        )
        print(response)

        try:
            # Non-streaming completion: extract the assistant text.
            yield response["choices"][0]["message"]["content"]
        except (KeyError, IndexError, TypeError):
            # Fall back to the raw payload so the UI still shows something.
            yield str(response)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# User-facing description shown under the chat UI.
# Fix: the original text and links described Gemma 2 (2B/9B/27B) models,
# but this app downloads and serves Qwen3-VL-32B-Thinking GGUF.
description = """<p align="center">Qwen3-VL-32B-Thinking (Q8_0 GGUF) served with llama.cpp</p>
<p><center>
<a href="https://huggingface.co/Qwen/Qwen3-VL-32B-Thinking" target="_blank">[Qwen3-VL-32B-Thinking Model]</a>
<a href="https://huggingface.co/unsloth/Qwen3-VL-32B-Thinking-GGUF" target="_blank">[Unsloth GGUF weights]</a>
<a href="https://huggingface.co/Qwen/Qwen3-VL-32B-Thinking-GGUF" target="_blank">[Official GGUF + mmproj]</a>
</center></p>
"""
|
|
import gradio as gr |
|
|
|
|
|
# Gradio chat UI; `respond` receives these additional inputs in order.
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Dropdown(
            [
                "Qwen3-VL-32B-Thinking-Q8_0.gguf",
            ],
            value="Qwen3-VL-32B-Thinking-Q8_0.gguf",
            label="Model",
        ),
        gr.Textbox(
            value="You are a helpful assistant.",
            label="System message",
        ),
        gr.Slider(1, 4096, value=2048, step=1, label="Max tokens"),
        gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p"),
        gr.Slider(0, 100, value=40, step=1, label="Top-k"),
        gr.Slider(0.0, 2.0, value=1.1, step=0.1, label="Repetition penalty"),
    ],
    # Fix: the title previously said "Gemma 2" although the app serves Qwen3-VL.
    title="Chat with Qwen3-VL using llama.cpp",
    description=description,
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Script entry point: start the Gradio server when run directly.
if __name__ == "__main__":

    demo.launch()