import spaces
import os

# CUDA toggles must be set BEFORE llama_cpp is imported: the native
# llama.cpp library reads them when it is first loaded.
os.environ["LLAMA_CUBLAS"] = "1"
os.environ["GGML_CUDA_FORCE_DMMV"] = "1"

import json
import subprocess

### monkey patch
import llama_cpp._internals as internals
from llama_cpp.llama_chat_format import Qwen3VLChatHandler

# Monkey patch BEFORE creating Llama(): wrap LlamaModel teardown so a
# partially-initialised model (no sampler yet) cannot crash on close/GC.
_original_close = internals.LlamaModel.close


def safe_close(self):
    # Only run the real close when a sampler exists; swallow any teardown
    # error — this is deliberately best-effort to avoid interpreter-exit
    # crashes from llama.cpp native cleanup.
    try:
        if hasattr(self, "sampler") and self.sampler is not None:
            return _original_close(self)
    except Exception:
        pass


internals.LlamaModel.close = safe_close


def safe_del(self):
    # __del__ must never raise; route through the guarded close above.
    try:
        self.close()
    except Exception:
        pass


internals.LlamaModel.__del__ = safe_del

##### final verdict
# GLM 4.7 flash fast inference
# qwen 3 VL
# mini max 2.5
# qwen 3 coder next
# gpt oss 120B

from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
import gradio as gr
from huggingface_hub import hf_hub_download
import llama_cpp

# Sanity check: confirm which llama_cpp build was actually picked up.
print(llama_cpp.__file__)
print(llama_cpp.__version__)

huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

# hf_hub_download(
#     repo_id="bartowski/gemma-2-9b-it-GGUF",
#     filename="gemma-2-9b-it-Q5_K_M.gguf",
#     local_dir="./models"
# )
# hf_hub_download(
#     repo_id="bartowski/gemma-2-27b-it-GGUF",
#     filename="gemma-2-27b-it-Q5_K_M.gguf",
#     local_dir="./models"
# )
# hf_hub_download(
#     repo_id="google/gemma-2-2b-it-GGUF",
#     filename="2b_it_v2.gguf",
#     local_dir="./models",
#     token=huggingface_token
# )
# hf_hub_download(
#     repo_id="unsloth/GLM-4.7-Flash-GGUF",
#     filename="GLM-4.7-Flash-Q8_0.gguf",
#     local_dir="./models",
#     token=huggingface_token
# )
# hf_hub_download(
#     repo_id="unsloth/gpt-oss-20b-GGUF",
#     filename="gpt-oss-20b-Q4_K_M.gguf",
#     local_dir="./models",
#     token=huggingface_token
# )
# hf_hub_download(
#     repo_id="unsloth/gpt-oss-20b-GGUF",
#     filename="gpt-oss-20b-Q4_K_M.gguf",
#     local_dir="./models",
#     token=huggingface_token
# )

# Alternative checkpoints that were evaluated; kept as notes instead of
# dead commented-out download code:
#   - unsloth/Qwen3-Next-80B-A3B-Instruct-GGUF (Q4_K_M, hf_hub_download)
#   - unsloth/Qwen3-Coder-Next-GGUF (Q5_K_M folder, snapshot_download)
#   - unsloth/gpt-oss-120b-GGUF (Q8_0 folder, snapshot_download; MiniMax 2.5
#     was considered as a larger/more recent replacement)
#   - stepfun-ai/Step-3.5-Flash-GGUF-Q4_K_S (12 shards, Llama.from_pretrained)

# Fetch the model weights and the matching multimodal projector (mmproj).
hf_hub_download(
    repo_id="unsloth/Qwen3-VL-32B-Thinking-GGUF",
    filename="Qwen3-VL-32B-Thinking-Q8_0.gguf",
    local_dir="./models"
)
hf_hub_download(
    repo_id="Qwen/Qwen3-VL-32B-Thinking-GGUF",
    filename="mmproj-Qwen3VL-32B-Thinking-F16.gguf",
    local_dir="./models"
)

from huggingface_hub import snapshot_download  # noqa: F401  (see notes above)

import os
import base64  # FIX: image_to_base64_data_uri() used base64 without importing it
import gc

import torch


def print_tree(start_path="models"):
    """Print a directory tree of *start_path* (download debugging aid)."""
    for root, dirs, files in os.walk(start_path):
        level = root.replace(start_path, "").count(os.sep)
        # FIX: restored proper box-drawing characters (were mojibake).
        indent = "\u2502   " * level
        print(f"{indent}\u251c\u2500\u2500 {os.path.basename(root)}/")
        subindent = "\u2502   " * (level + 1)
        for f in files:
            print(f"{subindent}\u251c\u2500\u2500 {f}")


print_tree("models")


def delete_llama_model(llm):
    """Best-effort teardown of a llama_cpp.Llama instance and its GPU memory.

    NOTE(review): rebinding ``llm = None`` only clears the local reference;
    the caller must also drop its own (global) reference for the model to
    become collectable.
    """
    if llm is not None:
        try:
            llm.close()  # releases the native llama.cpp context — very important
        except Exception as e:
            print("Close error:", e)
    llm = None
    # Force Python garbage collection before asking CUDA to release caches.
    gc.collect()
    # Clear GPU caches; harmless no-op when CUDA is unavailable.
    try:
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        torch.cuda.synchronize()
    except Exception:  # FIX: narrowed from a bare `except:`
        pass
    print("Model fully unloaded.")


# Lazily-initialised model singletons (populated inside respond()).
llm = None
llm_model_glm = None
llm_model_qwen = None

# File-extension -> MIME type lookup for image_to_base64_data_uri().
_IMAGE_MIME_TYPES = {
    # Most common formats
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.gif': 'image/gif',
    '.webp': 'image/webp',
    # Next-generation formats
    '.avif': 'image/avif',
    '.jp2': 'image/jp2',
    '.j2k': 'image/jp2',
    '.jpx': 'image/jp2',
    # Legacy / Windows formats
    '.bmp': 'image/bmp',
    '.ico': 'image/x-icon',
    '.pcx': 'image/x-pcx',
    '.tga': 'image/x-tga',
    '.icns': 'image/icns',
    # Professional / Scientific imaging
    '.tif': 'image/tiff',
    '.tiff': 'image/tiff',
    '.eps': 'application/postscript',
    '.dds': 'image/vnd-ms.dds',
    '.dib': 'image/dib',
    '.sgi': 'image/sgi',
    # Portable Map formats (PPM/PGM/PBM)
    '.pbm': 'image/x-portable-bitmap',
    '.pgm': 'image/x-portable-graymap',
    '.ppm': 'image/x-portable-pixmap',
    # Miscellaneous / Older formats
    '.xbm': 'image/x-xbitmap',
    '.mpo': 'image/mpo',
    '.msp': 'image/msp',
    '.im': 'image/x-pillow-im',
    '.qoi': 'image/qoi',
}


def image_to_base64_data_uri(
    file_path: str,
    *,
    fallback_mime: str = "application/octet-stream"
) -> str:
    """
    Convert a local image file to a base64-encoded data URI with the
    correct MIME type. Supports 20+ image formats (PNG, JPEG, WebP, AVIF,
    BMP, ICO, TIFF, etc.).

    Args:
        file_path: Path to the image file on disk.
        fallback_mime: MIME type used when the file extension is unknown.

    Returns:
        A valid data URI string (e.g., data:image/webp;base64,...).

    Raises:
        FileNotFoundError: If the file does not exist.
        OSError: If reading the file fails.
    """
    if not os.path.isfile(file_path):
        raise FileNotFoundError(f"Image file not found: {file_path}")

    extension = os.path.splitext(file_path)[1].lower()
    mime_type = _IMAGE_MIME_TYPES.get(extension, fallback_mime)

    if mime_type == fallback_mime:
        print(f"Warning: Unknown extension '{extension}' for '{file_path}'. "
              f"Using fallback MIME type: {fallback_mime}")

    try:
        with open(file_path, "rb") as img_file:
            encoded_data = base64.b64encode(img_file.read()).decode("utf-8")
    except OSError as e:
        raise OSError(f"Failed to read image file '{file_path}': {e}") from e

    return f"data:{mime_type};base64,{encoded_data}"

###################### sample code ################################################
# Build one chat request carrying several images plus a final text prompt:
#   images_messages = [
#       {"type": "image_url", "image_url": {"url": image_to_base64_data_uri(p)}}
#       for p in image_paths
#   ] + [{"type": "text", "text": "Describes the images."}]
#   res = llm.create_chat_completion(messages=[
#       {"role": "system", "content": "You are a highly accurate vision-language assistant.
Provide detailed, precise, and well-structured image descriptions."}, # # The user's content is the list containing both images and text # {"role": "user", "content": images_messages} # ] # ) # # Print the assistant's response # print(res["choices"][0]["message"]["content"]) @spaces.GPU(duration=120) def respond( message, history: list[tuple[str, str]], model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty, ): # chat_template = MessagesFormatterType.GEMMA_2 chat_template = MessagesFormatterType.CHATML # chat_template = global llm global llm_model global llm_model_glm global llm_model_qwen if model == "Qwen3-VL-32B-Thinking-Q8_0.gguf" : if llm_model_qwen == None: llm_model_qwen = Llama( model_path=f"models/Qwen3-VL-32B-Thinking-Q8_0.gguf", flash_attn=True, n_gpu_layers=-1, n_batch=2048, # increase n_ctx= 8196, # reduce if you donβt need 8k n_threads=16, # set to your CPU cores use_mlock=True, verbose=True, chat_handler=Qwen3VLChatHandler( clip_model_path=f"models/mmproj-Qwen3VL-32B-Thinking-F16.gguf", force_reasoning=True, image_min_tokens=1024, # Note: Qwen-VL models require at minimum 1024 image tokens to function correctly on bbox grounding tasks ), ) x=llm_model_qwen.create_chat_completion( messages = [ {"role": "system", "content": "hi"}, { "role": "user", "content": str(message) } ] ) print(x) # delete_llama_model(llm_model_qwen) yield str(x) # provider = LlamaCppPythonProvider(llm) # agent = LlamaCppAgent( # provider, # system_prompt=f"{system_message}", # predefined_messages_formatter_type=chat_template, # debug_output=False # ) # settings = provider.get_provider_default_settings() # settings.temperature = temperature # settings.top_k = top_k # settings.top_p = top_p # settings.max_tokens = max_tokens # settings.repeat_penalty = repeat_penalty # # settings.stream = True # # settings.reasoning_effort ="low" # messages = BasicChatHistory() # for msn in history: # user = { # 'role': Roles.user, # 'content': msn[0] # } # assistant = { # 
'role': Roles.assistant, # 'content': msn[1] # } # messages.add_message(user) # messages.add_message(assistant) # stream = agent.get_chat_response( # message, # # llm_sampling_settings=settings, # chat_history=messages, # # returns_streaming_generator=True, # print_output=False # ) # outputs = "" # for output in stream: # outputs += output # yield outputs description = """
Defaults to 2B (you can switch to 9B or 27B from additional inputs)