import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
import time
from threading import Thread
import sys
import os

# CPU tuning knobs — must be set before the numeric libraries initialize.
# os.environ["BNB_CUDA_VERSION"] = "0"  # Forces bitsandbytes to recognize no GPU
os.environ["OMP_NUM_THREADS"] = "1"  # Prevents race conditions in custom CPU kernels
os.environ["VECLIB_MAXIMUM_ISA"] = "AVX2"
os.environ["MKL_DEBUG_CPU_TYPE"] = "5"  # Forces MKL to use AVX2

# `spaces` only exists on Hugging Face Spaces; fall back to a no-op shim elsewhere.
try:
    import spaces
except ImportError:
    spaces = None

if spaces is None or not torch.cuda.is_available():
    print("Using CPU-only mode (spaces.GPU disabled)")

    class SpacesShim:
        """No-op stand-in for the `spaces` module so @spaces.GPU still works off-Spaces."""

        def GPU(self, *args, **kwargs):
            # Helper to handle both @spaces.GPU and @spaces.GPU(duration=...) usage
            def decorator(func):
                return func
            # If called as @spaces.GPU (no parens), the first arg is the function
            if len(args) == 1 and callable(args[0]) and not kwargs:
                return args[0]
            # If called as @spaces.GPU(duration=30), it returns the decorator
            return decorator

    spaces = SpacesShim()

def gpu_decorator(func):
    """Apply the (possibly shimmed) spaces GPU decorator to `func`."""
    return spaces.GPU()(func)

# Model configuration: a local model path may be supplied as the first CLI argument.
if len(sys.argv) > 1 and os.path.exists(sys.argv[1]):
    MODEL_NAME = sys.argv[1]
    print(f"Using local model from: {MODEL_NAME}")
else:
    #MODEL_NAME = "TobDeBer/SmolLM3-3B-hirma-b80s-0.5"
    #MODEL_NAME = "TobDeBer/SmolLM3-3B-hirma-b60s-0.5"
    MODEL_NAME = "TobDeBer/SmolLM3-3B-hirma-b100-0.5"
    #MODEL_NAME = "TobDeBer/SmolLM2-135M-Instruct-hirma-b60s-0.5"
    #MODEL_NAME = "TobDeBer/SmolLM2-135M-Instruct-b100"
    ##MODEL_NAME = "TobDeBer/SmolLM3-3B-hirma-b60-bnb4"
    #MODEL_NAME = "TobDeBer/SmolLM3-3B-hirma-b60-0.5"
    ##MODEL_NAME = "TobDeBer/SmolLM3-3B-hirma-q20-bnb8"
    ##MODEL_NAME = "TobDeBer/SmolLM3-3B-hirma-q20"
    # MODEL_NAME = "TobDeBer/SmolLM3-3B-hirma-q80-bnb4"
    #MODEL_NAME = "TobDeBer/SmolLM2-135M-Instruct-q99-bnb4"
    #MODEL_NAME = "HuggingFaceTB/SmolLM2-135M-Instruct"

# Global variables holding the loaded model/tokenizer (populated by load_model()).
tokenizer = None
model = None

import platform
import subprocess
# FIX: `cpuinfo` (py-cpuinfo) is an optional third-party package — the original
# unguarded import crashed the whole app when it was not installed.
try:
    import cpuinfo  # Optional: 'pip install py-cpuinfo' is better if you can add it
except ImportError:
    cpuinfo = None
def load_model():
    """Load the Smol LLM model and tokenizer with hardware detection.

    Populates the module-level `tokenizer` and `model` globals and returns a
    human-readable status string (success or error message) — it never raises.
    """
    global tokenizer, model
    try:
        print("--- Hardware Audit ---")
        print(f"Processor: {platform.processor()}")
        print(f"Machine: {platform.machine()}")
        # Check for CPU Flags (Instruction Sets)
        try:
            # For Linux-based Cloud environments.
            # FIX: use the list form with shell=False instead of shell=True —
            # same output, no shell involved.
            cpu_flags = subprocess.check_output(["lscpu"]).decode()
            print("Instruction sets found:")
            for flag in ["avx512", "avx2", "avx", "fma", "amx"]:
                if flag in cpu_flags.lower():
                    print(f" ✅ {flag.upper()} supported")
                else:
                    print(f" ❌ {flag.upper()} NOT found")
        except Exception as e:
            # lscpu is Linux-only; the audit is best-effort diagnostics.
            print(f"Could not check CPU flags: {e}")
        print(f"PyTorch version: {torch.__version__}")
        print(f"Loading model: {MODEL_NAME}")
        print("----------------------")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
        tokenizer.padding_side = "left"
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        # Determine device and dtype based on hardware availability
        if torch.cuda.is_available():
            print(" ✅ CUDA detected. Loading model on GPU.")
            device_map = "auto"
            dtype = torch.bfloat16
        else:
            print(" ⚠️ No CUDA detected. Loading model on CPU.")
            device_map = {"": "cpu"}
            dtype = torch.float32
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_NAME,
            dtype=dtype,
            device_map=device_map,
            low_cpu_mem_usage=True
        )
        # FIX: removed the unconditional `model.to(torch.bfloat16)` that followed —
        # it silently overrode the float32 dtype deliberately chosen for the CPU
        # path (bfloat16 is slow/unsupported on many CPUs), and was a no-op on the
        # GPU path which already loads in bfloat16.
        return "✅ Model loaded successfully!"
    except Exception as e:
        return f"❌ Error loading model: {str(e)}"

@spaces.GPU(duration=30)
def chat_predict(message, history, max_length, temperature, top_p, repetition_penalty, system_prompt):
    """Generate text using the loaded model with streaming and history"""
    global model, tokenizer
    if model is None or tokenizer is None:
        yield "⚠️ Please wait for the model to finish loading..."
        return
    try:
        # Prepare messages for chat template
        messages = []
        if system_prompt:
            messages.append({"role": "system", "content": system_prompt})
        # Handle history which can be list of dicts with multimodal content
        for msg in history:
            role = msg.get("role", "user")
            content = msg.get("content", "")
            # Extract text if content is a list (multimodal format in Gradio 6)
            if isinstance(content, list):
                text_content = ""
                for part in content:
                    if isinstance(part, dict) and part.get("type") == "text":
                        text_content += part.get("text", "")
                content = text_content
            # Ensure content is string
            if not isinstance(content, str):
                content = str(content)
            # Clean up assistant stats: strip the "*Generated ...*" footer that
            # this function itself appends, so it isn't fed back into the prompt.
            if role == "assistant" and "\n\n---\n*Generated" in content:
                content = content.split("\n\n---\n*Generated")[0]
            messages.append({"role": role, "content": content})
        messages.append({"role": "user", "content": message})
        # Format the prompt using the model's chat template.
        formatted_prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        print("formatted_prompt: ", formatted_prompt)
        inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
        # Setup streamer (skips echoing the prompt and special tokens).
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
        # Generation arguments
        generation_kwargs = dict(
            **inputs,
            streamer=streamer,
            max_new_tokens=max_length,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )
        # Start generation in a separate thread so we can consume the stream here.
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()
        # Consume the stream
        generated_text = ""
        start_time = time.time()
        # NOTE(review): token_count counts streamer text chunks, which may not be
        # exactly one token each — the t/s figure is approximate.
        token_count = 0
        last_update_time = start_time
        current_stats = ""
        for new_text in streamer:
            generated_text += new_text
            token_count += 1
            # Update stats every 0.2 seconds
            current_time = time.time()
            if current_time - last_update_time > 0.2:
                elapsed = current_time - start_time
                if elapsed > 0:
                    tps = token_count / elapsed
elapsed current_stats = f"\n\n---\n*Generating... ({tps:.1f} t/s)*" last_update_time = current_time yield generated_text + current_stats # Final stats elapsed_time = time.time() - start_time if elapsed_time > 0: tps = token_count / elapsed_time stats = f"\n\n---\n*Generated {token_count} tokens in {elapsed_time:.2f}s ({tps:.2f} t/s)*" yield generated_text + stats except Exception as e: yield f"❌ Error during generation: {str(e)}" # Custom CSS to force full height and style chat css = """ .gradio-container { height: 100vh !important; max-height: 100vh !important; overflow: hidden !important; } #main-row { height: calc(100vh - 150px) !important; } #chat-col { height: 100% !important; } /* Thin box around prompt field - targeting specifically within chat column */ #chat-col textarea { border: 1px solid #64748b !important; border-radius: 8px !important; padding: 8px !important; } """ # Create custom theme with smaller base font custom_theme = gr.themes.Soft( primary_hue="blue", secondary_hue="indigo", neutral_hue="slate", font=gr.themes.GoogleFont("Inter"), text_size="md", spacing_size="sm", radius_size="md" ).set( button_primary_background_fill="*primary_600", button_primary_background_fill_hover="*primary_700", block_title_text_weight="600", ) # Build the Gradio interface with gr.Blocks(fill_height=True) as demo: gr.Markdown( """ # 🤖 Smol LLM Chat - Multi-turn chat with SmolLM3-3B. """ ) with gr.Row(elem_id="main-row"): with gr.Column(scale=1, min_width=200): with gr.Accordion("⚙️ Parameters", open=False): max_tokens = gr.Slider( minimum=50, maximum=1024, value=200, step=50, label="Max Tokens" ) temperature = gr.Slider( minimum=0.1, maximum=2.0, value=0.1, step=0.1, label="Temperature" ) top_p = gr.Slider( minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p" ) repetition_penalty = gr.Slider( minimum=1.0, maximum=2.0, value=1.1, step=0.1, label="Repetition Penalty" ) system_prompt = gr.Textbox( label="System Prompt", value="You are a helpful AI assistant. 
Provide clear and concise answers.", lines=2 ) with gr.Column(scale=4, elem_id="chat-col"): # Chat Interface chat_interface = gr.ChatInterface( fn=chat_predict, fill_height=True, additional_inputs=[ max_tokens, temperature, top_p, repetition_penalty, system_prompt ], ) # Auto-load the model at startup load_status = load_model() print(f"Startup load status: {load_status}") if __name__ == "__main__": # Launch the application demo.launch( theme=custom_theme, css=css, share=False, show_error=True )