#!/usr/bin/env python3
import os
import warnings

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr

# =================== CONFIGURATION ===================
MODEL_ID = "abdelac/tinyllama"  # TinyLlama is small enough to run on CPU
USE_CPU = True  # Force CPU mode

# =================== SUPPRESS WARNINGS ===================
warnings.filterwarnings("ignore")
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["TRANSFORMERS_VERBOSITY"] = "error"

# =================== SIMPLE MODEL CACHE ===================
_model_cache = {}


def load_model():
    """Load the tokenizer and model once, then serve them from a module-level cache."""
    if "model" in _model_cache:
        return _model_cache["tokenizer"], _model_cache["model"]

    print(f"🚀 Loading {MODEL_ID} on CPU...")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # Force CPU loading (no CUDA)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,  # float32 is the safe dtype on CPU
        device_map="cpu",           # Force CPU
        low_cpu_mem_usage=True,
    )

    # Cache for future calls
    _model_cache["tokenizer"] = tokenizer
    _model_cache["model"] = model

    print("✅ Model loaded successfully on CPU!")
    print(f"   Device: {model.device}")
    print(f"   Dtype: {model.dtype}")

    return tokenizer, model


# =================== GENERATION FUNCTION ===================
def generate_text(prompt, max_tokens=80, temperature=0.7):
    """Generate text with conservative limits to stay within CPU memory."""
    try:
        tokenizer, model = load_model()

        # Tokenize
        inputs = tokenizer(prompt, return_tensors="pt")

        # Generate with very conservative settings
        # (early_stopping is omitted: it only applies to beam search,
        # not to sampling, and would just trigger a warning here)
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=int(min(max_tokens, 100)),  # Hard cap at 100
                temperature=temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1,
                no_repeat_ngram_size=2,
            )

        # Decode
        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return result

    except Exception as e:
        return f"❌ Error: {e}"


# =================== SIMPLE INTERFACE ===================
def create_interface():
    """Create a minimal Gradio interface."""
    with gr.Blocks(
        title="🦙 TinyLlama Demo",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {max-width: 700px !important; margin: auto;}
        """,
    ) as demo:
        gr.Markdown("""
        # 🦙 TinyLlama Demo (CPU Mode)

        **Model:** [abdelac/tinyllama](https://huggingface.co/abdelac/tinyllama)
        **Hardware:** CPU only (no GPU required)

        ⚠️ **Note:** Running on CPU, so responses may be slow.
        """)

        # Input
        prompt = gr.Textbox(
            label="📝 Enter your prompt:",
            placeholder="Type here...",
            lines=3,
            value="Once upon a time",
        )

        # Controls
        with gr.Row():
            max_tokens = gr.Slider(
                30, 100,
                value=60,
                label="📏 Max Tokens",
                info="Keep ≤ 80 for best performance",
            )
            temperature = gr.Slider(
                0.1, 1.0,
                value=0.7,
                label="🌡️ Temperature",
            )

        # Buttons
        with gr.Row():
            generate_btn = gr.Button("✨ Generate", variant="primary")
            clear_btn = gr.Button("🗑️ Clear")

        # Output
        output = gr.Textbox(
            label="📄 Generated Text:",
            lines=6,
        )

        # Examples
        gr.Examples(
            examples=[
                ["The future of AI is"],
                ["Write a short story about a cat"],
                ["Explain machine learning simply:"],
                ["The benefits of exercise include"],
            ],
            inputs=prompt,
            label="💡 Try these examples",
        )

        # Actions
        generate_btn.click(
            fn=generate_text,
            inputs=[prompt, max_tokens, temperature],
            outputs=output,
        )
        clear_btn.click(
            fn=lambda: ("", ""),
            inputs=[],
            outputs=[prompt, output],
        )

        # Footer
        gr.Markdown("---")
        gr.Markdown("""