| import spaces |
| import os |
# Configure llama.cpp build/runtime flags.
#   LLAMA_CUBLAS=1         -> enable cuBLAS (GPU) acceleration
#   GGML_CUDA_FORCE_DMMV=1 -> force the DMMV CUDA kernels
# NOTE(review): these appear to be read at library load time, so they must be
# set before `import llama_cpp` further down this file — confirm on upgrade.
os.environ["LLAMA_CUBLAS"] = "1"
os.environ["GGML_CUDA_FORCE_DMMV"] = "1"
| import json |
| import subprocess |
|
|
|
|
| from cachetools import LRUCache |
| import threading |
|
|
# Keep at most 10 loaded models alive at once; least-recently-used entries
# are evicted when the cache fills.
MODEL_CACHE = LRUCache(maxsize=10)
# Guards MODEL_CACHE against concurrent lookup/insert from request threads.
CACHE_LOCK = threading.Lock()
|
|
| |
|
|
| import llama_cpp._internals as internals |
| from llama_cpp.llama_chat_format import Qwen35ChatHandler |
|
|
|
|
| |
|
|
# --- Workaround: make LlamaModel.close() safe during teardown. ---
# Keep a reference to the library's original close() so the patched
# version can delegate to it.
_original_close = internals.LlamaModel.close


def safe_close(self):
    """Close the underlying model only when it looks fully initialised.

    Delegates to the original close() only when a non-None ``sampler``
    attribute exists, and swallows any exception raised during cleanup.
    NOTE(review): assumes ``sampler`` is a reliable "fully constructed"
    marker for this llama_cpp version — confirm when upgrading.
    """
    try:
        if hasattr(self, "sampler") and self.sampler is not None:
            return _original_close(self)
    except Exception:
        pass


# Install the guarded close() in place of the original.
internals.LlamaModel.close = safe_close
|
|
|
|
def safe_del(self):
    """Destructor that never raises: best-effort close, errors ignored.

    Exceptions escaping __del__ are reported by the interpreter and can
    mask real errors during shutdown, so they are suppressed here.
    """
    try:
        self.close()
    except Exception:
        pass


# Replace LlamaModel's destructor with the non-raising variant.
internals.LlamaModel.__del__ = safe_del
|
|
|
|
|
|
|
|
| |
|
|
| |
| |
| |
| |
| |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| from llama_cpp import Llama |
| from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType |
| from llama_cpp_agent.providers import LlamaCppPythonProvider |
| from llama_cpp_agent.chat_history import BasicChatHistory |
| from llama_cpp_agent.chat_history.messages import Roles |
| import gradio as gr |
| from huggingface_hub import hf_hub_download |
|
|
import llama_cpp
# Log which llama_cpp build is actually in use (install path + version).
print(llama_cpp.__file__)
print(llama_cpp.__version__)


# Token for gated/private Hugging Face downloads; may be None on public Spaces.
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
|
|
|
|
| import base64 |
| |
| |
| |
| |
| |
| |
|
|
|
|
# Download the quantized Qwen3.5-9B weights (Q8_K_XL GGUF) into ./models.
hf_hub_download(
    repo_id="unsloth/Qwen3.5-9B-GGUF",
    filename="Qwen3.5-9B-UD-Q8_K_XL.gguf",
    local_dir="./models"
)


# Download the matching multimodal projector (vision tower) for that model.
hf_hub_download(
    repo_id="unsloth/Qwen3.5-9B-GGUF",
    filename="mmproj-BF16.gguf",
    local_dir="./models"
)
| from huggingface_hub import snapshot_download |
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
|
|
|
|
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
|
|
|
|
|
|
| |
| |
|
|
| |
| |
|
|
| |
| |
| |
| |
| |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
|
|
|
|
| |
| |
| |
| |
| |
| |
|
|
| import os |
|
|
def print_tree(start_path="models"):
    """Print a tree-style listing of *start_path* (each dir, then its files).

    Args:
        start_path: Root directory to walk. If it does not exist, os.walk
            yields nothing and nothing is printed.
    """
    for root, dirs, files in os.walk(start_path):
        # Depth relative to the root. os.path.relpath is used instead of
        # root.replace(start_path, ""), which stripped *every* occurrence
        # of the root name anywhere in the path and could miscount depth
        # (e.g. a subdirectory itself named "models").
        rel = os.path.relpath(root, start_path)
        level = 0 if rel == "." else rel.count(os.sep) + 1
        indent = "│   " * level
        print(f"{indent}├── {os.path.basename(root)}/")

        subindent = "│   " * (level + 1)
        for f in files:
            print(f"{subindent}├── {f}")


print_tree("models")
|
|
|
|
|
|
|
|
| import gc |
| import torch |
|
|
def delete_llama_model(llm):
    """Best-effort teardown of a llama.cpp model and its GPU memory.

    Args:
        llm: The model instance to unload (anything with ``close()``),
            or None, in which case the close step is skipped.

    Note:
        Rebinding the ``llm`` parameter below only drops *this* function's
        reference; the caller must also drop its own reference for the
        object to become collectable.
    """
    if llm is not None:
        try:
            llm.close()
        except Exception as e:
            print("Close error:", e)

    # Drop our local reference so gc.collect() can reclaim the model
    # (provided the caller has dropped theirs too).
    llm = None

    gc.collect()

    # Flush CUDA caches. Guarded because torch may be CPU-only here;
    # narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
    # are no longer swallowed.
    try:
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        torch.cuda.synchronize()
    except Exception:
        pass

    print("Model fully unloaded.")
|
|
|
|
|
|
# Module-level model handles, populated lazily at request time.
llm = None            # generic handle (not used by respond(); kept for compat)
llm_model_glm = None  # GLM variant slot (currently unused)
llm_model_qwen= None  # Qwen handle assigned inside respond()
|
|
|
|
|
|
# File-extension -> MIME-type lookup for building data URIs.
_IMAGE_MIME_TYPES = {
    # Web-native formats
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.gif': 'image/gif',
    '.webp': 'image/webp',

    # Modern / JPEG 2000 family
    '.avif': 'image/avif',
    '.jp2': 'image/jp2',
    '.j2k': 'image/jp2',
    '.jpx': 'image/jp2',

    # Legacy raster formats
    '.bmp': 'image/bmp',
    '.ico': 'image/x-icon',
    '.pcx': 'image/x-pcx',
    '.tga': 'image/x-tga',
    '.icns': 'image/icns',

    # Print / professional formats
    '.tif': 'image/tiff',
    '.tiff': 'image/tiff',
    '.eps': 'application/postscript',
    '.dds': 'image/vnd-ms.dds',
    '.dib': 'image/dib',
    '.sgi': 'image/sgi',

    # Netpbm family
    '.pbm': 'image/x-portable-bitmap',
    '.pgm': 'image/x-portable-graymap',
    '.ppm': 'image/x-portable-pixmap',

    # Misc
    '.xbm': 'image/x-xbitmap',
    '.mpo': 'image/mpo',
    '.msp': 'image/msp',
    '.im': 'image/x-pillow-im',
    '.qoi': 'image/qoi',
}
import os
import base64


def image_to_base64_data_uri(
    file_path: str,
    *,
    fallback_mime: str = "application/octet-stream"
) -> str:
    """Encode a local image file as a base64 ``data:`` URI.

    The MIME type is resolved from the file extension via
    ``_IMAGE_MIME_TYPES``; unrecognised extensions fall back to
    *fallback_mime* and print a warning.

    Args:
        file_path: Path of the image on disk.
        fallback_mime: MIME type used when the extension is unknown.

    Returns:
        A ``data:<mime>;base64,<payload>`` string.

    Raises:
        FileNotFoundError: *file_path* is not an existing regular file.
        OSError: the file could not be read.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Image file not found: {file_path}")

    ext = os.path.splitext(file_path)[1].lower()
    mime = _IMAGE_MIME_TYPES.get(ext, fallback_mime)
    if mime == fallback_mime:
        print(f"Warning: Unknown extension '{ext}' for '{file_path}'. "
              f"Using fallback MIME type: {fallback_mime}")

    try:
        with open(file_path, "rb") as fh:
            payload = base64.b64encode(fh.read()).decode("utf-8")
    except OSError as exc:
        raise OSError(f"Failed to read image file '{file_path}': {exc}") from exc

    return f"data:{mime};base64,{payload}"
|
|
|
|
|
|
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
|
|
|
|
| |
| |
| |
| |
| |
|
|
| |
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
| |
| import os |
| import tempfile |
| import requests |
| from urllib.parse import urlparse |
|
|
|
|
|
|
|
|
|
|
def handle_image_input(image_input):
    """Normalise an image reference to a base64 ``data:`` URI.

    Args:
        image_input: Either an http(s) URL or an existing data URI
            (``data:image/png;base64,...``).

    Returns:
        A ``data:`` URI string for the image.

    Raises:
        ValueError: the input is neither a data URI nor an http(s) URL.
        requests.HTTPError: the URL download returned an error status.
    """
    # Already a data URI: nothing to fetch or convert.
    # (Bug fix: this branch previously called an undefined `process_image`,
    # raising NameError for every data-URI input.)
    if image_input.startswith("data:"):
        print("Data URI detected. No download needed.")
        return image_input

    parsed = urlparse(image_input)

    if parsed.scheme in ("http", "https"):
        print("URL detected. Downloading temporarily...")

        # Timeout added so a stalled server cannot hang the request thread.
        response = requests.get(image_input, timeout=30)
        response.raise_for_status()

        # Buffer to a temp file so image_to_base64_data_uri can read it;
        # the file is always removed, even if encoding fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".png") as tmp_file:
            tmp_file.write(response.content)
            temp_path = tmp_file.name

        try:
            return image_to_base64_data_uri(temp_path)
        finally:
            os.remove(temp_path)

    raise ValueError("Unsupported image format.")
|
|
|
|
| |
| |
|
|
|
|
| import json |
|
|
def str_to_json(str_obj):
    """Parse a JSON string and return the resulting Python object."""
    return json.loads(str_obj)
|
|
|
|
# Sampling presets keyed by mode name; selected in respond() via the
# optional "mode" field of the incoming JSON payload.
MODES = {

    # Thinking mode, general chat: high temperature with a presence penalty
    # applied (presumably to discourage loops in long reasoning traces —
    # verify against model card recommendations).
    "thinking_general": {
        "temperature": 1.0,
        "top_p": 0.95,
        "top_k": 20,
        "min_p": 0.0,
        "presence_penalty": 1.5,
        "repeat_penalty": 1.0,
        "enable_thinking": True,
    },

    # Thinking mode, code generation: lower temperature, no presence penalty.
    "thinking_coding": {
        "temperature": 0.6,
        "top_p": 0.95,
        "top_k": 20,
        "min_p": 0.0,
        "presence_penalty": 0.0,
        "repeat_penalty": 1.0,
        "enable_thinking": True,
    },

    # Instruct (non-thinking) mode, general chat.
    "instruct_general": {
        "temperature": 0.7,
        "top_p": 0.8,
        "top_k": 20,
        "min_p": 0.0,
        "presence_penalty": 1.5,
        "repeat_penalty": 1.0,
        "enable_thinking": False,
    },

    # Instruct mode tuned for reasoning-style answers, thinking disabled.
    "instruct_reasoning": {
        "temperature": 1.0,
        "top_p": 0.95,
        "top_k": 20,
        "min_p": 0.0,
        "presence_penalty": 1.5,
        "repeat_penalty": 1.0,
        "enable_thinking": False,
    },
}
|
|
| import os |
|
|
|
|
def load_llama_model(model_path, enable_thinking):
    """Return a cached Llama instance for (model_path, enable_thinking).

    Loading happens at most once per key while the LRU cache retains the
    entry; evicted models are re-loaded on next use.

    NOTE(review): ``model_path`` is part of the cache key but is NOT used
    below — the weights file is hard-coded to the downloaded
    Qwen3.5-9B-UD-Q8_K_XL.gguf. Confirm whether this is intentional.
    """
    key = (model_path, enable_thinking)

    # Hold the lock across lookup *and* load so two concurrent requests
    # cannot load the same model twice.
    with CACHE_LOCK:

        if key in MODEL_CACHE:
            return MODEL_CACHE[key]

        print("Loading model:", key)

        llm = Llama(
            model_path=f"models/Qwen3.5-9B-UD-Q8_K_XL.gguf",
            flash_attn=True,
            n_gpu_layers=-1,  # offload all layers to GPU
            n_batch=2048,
            n_ctx= 16384,
            n_threads=os.cpu_count(),
            n_threads_batch=os.cpu_count(),
            mul_mat_q=True,
            use_mmap=True,
            # Multimodal chat handler: pairs the weights with the vision
            # projector downloaded alongside them.
            chat_handler=Qwen35ChatHandler(
                clip_model_path=f"models/mmproj-BF16.gguf",
                enable_thinking=enable_thinking ,
                image_min_tokens=1024,
            ),
        )

        MODEL_CACHE[key] = llm

        return llm
|
|
|
|
|
|
|
|
|
|
|
|
| |
| |
| |
@spaces.GPU(duration=70)
def respond(
    message,
    history: list[tuple[str, str]],
    model,
    system_message,
    max_tokens,
    temperature,
    top_p,
    top_k,
    repeat_penalty,
):
    """Chat handler for the Gradio interface.

    ``message`` may be either a plain prompt string or a JSON payload of
    the form ``{"messages": [...], "max_token": int, "mode": str}``.
    Image entries inside structured message content are normalised to
    OpenAI-style ``image_url`` parts (base64 data URIs) before inference.

    Yields:
        The assistant's reply text (single yield).

    Note:
        The Gradio slider inputs (max_tokens, temperature, top_p, top_k,
        repeat_penalty) and system_message/history are currently unused;
        sampling settings come from the MODES presets instead.
    """
    global MODES

    # Parse the incoming payload; fall back to treating it as a raw prompt.
    try:
        j_r = str_to_json(message)
        messages = j_r['messages']
        max_token = j_r['max_token']
        # .get replaces a bare try/except: that also swallowed unrelated errors.
        mode = j_r.get('mode', "thinking_general")
    except Exception as e:
        print(e)
        print("not valid json")
        max_token = 8196
        mode = "instruct_general"
        messages = [
            {
                "role": "user",
                "content": str(message)
            }
        ]

    # Normalise message content: plain strings pass through unchanged;
    # structured content has its image entries resolved to data URIs.
    final_messages = []

    for message in messages:
        content = message['content']
        if isinstance(content, str):
            final_messages.append(message)
            continue

        tmp_list = []
        for item in content:
            if item['type'] == "text":
                tmp_list.append(item)
                continue

            if item['type'] == "image":
                # Accept either "url" or "uri" as the image source.
                # (Bug fix: the old code checked item['uri'] but then passed
                # item['url'], and could append the same image twice when
                # both keys were present.)
                source = item.get('url') or item.get('uri')
                if not source:
                    continue
                try:
                    data_uri = handle_image_input(source)
                    tmp_list.append({"type": "image_url", "image_url": {"url": data_uri}})
                except Exception as e:
                    # Skip images that fail to resolve rather than failing the turn.
                    print(e)

        final_messages.append({"role": message['role'], "content": tmp_list})

    # Unknown mode names fall back to the default preset instead of KeyError.
    config = MODES.get(mode) or MODES["thinking_general"]
    llm_model_qwen = load_llama_model(
        model_path=f"models/{model}",
        enable_thinking=config["enable_thinking"],
    )

    x = llm_model_qwen.create_chat_completion(
        messages=final_messages,
        max_tokens=max_token,
        temperature=config["temperature"],
        top_p=config["top_p"],
        top_k=config["top_k"],
        min_p=config["min_p"],
        repeat_penalty=config["repeat_penalty"],
    )
    print(x)

    x = x['choices'][0]['message']['content']
    yield str(x)
|
|
| |
| |
| |
| |
| |
| |
|
|
| |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| |
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
# UI description shown under the ChatInterface title.
# NOTE(review): this copy (and the title passed to gr.ChatInterface below)
# still references Gemma 2 2B/9B/27B, but this app downloads and serves
# Qwen3.5 GGUF weights — the user-facing text should be updated.
description = """<p align="center">Defaults to 2B (you can switch to 9B or 27B from additional inputs)</p>
<p><center>
<a href="https://huggingface.co/google/gemma-2-27b-it" target="_blank">[27B it Model]</a>
<a href="https://huggingface.co/google/gemma-2-9b-it" target="_blank">[9B it Model]</a>
<a href="https://huggingface.co/google/gemma-2-2b-it" target="_blank">[2B it Model]</a>
<a href="https://huggingface.co/bartowski/gemma-2-27b-it-GGUF" target="_blank">[27B it Model GGUF]</a>
<a href="https://huggingface.co/bartowski/gemma-2-9b-it-GGUF" target="_blank">[9B it Model GGUF]</a>
<a href="https://huggingface.co/google/gemma-2-2b-it-GGUF" target="_blank">[2B it Model GGUF]</a>
</center></p>
"""
import gradio as gr


# Chat UI wiring. additional_inputs map positionally onto respond()'s
# parameters after (message, history): model, system_message, max_tokens,
# temperature, top_p, top_k, repeat_penalty.
# NOTE(review): respond() currently reads only `model` from these inputs;
# sampling values come from the MODES presets instead.
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Dropdown(
            [
                "Qwen3-VL-32B-Thinking-Q8_0.gguf",
            ],
            value="Qwen3-VL-32B-Thinking-Q8_0.gguf",
            label="Model",
        ),
        gr.Textbox(
            value="You are a helpful assistant.",
            label="System message",
        ),
        gr.Slider(1, 4096, value=2048, step=1, label="Max tokens"),
        gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p"),
        gr.Slider(0, 100, value=40, step=1, label="Top-k"),
        gr.Slider(0.0, 2.0, value=1.1, step=0.1, label="Repetition penalty"),
    ],
    title="Chat with Gemma 2 using llama.cpp",
    description=description,
)
|
|
|
|
| |
|
|
# Launch the Gradio app only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()