Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| import os | |
| import sys | |
| import warnings | |
| import torch | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| import gradio as gr | |
| # =================== CONFIGURATION =================== | |
| MODEL_ID = "abdelac/tinyllama" # Changed back to TinyLlama for CPU | |
| USE_CPU = True # Force CPU mode | |
| # =================== SUPPRESS WARNINGS =================== | |
| warnings.filterwarnings("ignore") | |
| os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" | |
| os.environ["TRANSFORMERS_VERBOSITY"] = "error" | |
| # =================== SIMPLE MODEL CACHE =================== | |
| _model_cache = {} | |
def load_model():
    """Load the tokenizer and model on CPU, caching them in ``_model_cache``.

    Returns:
        tuple: ``(tokenizer, model)`` loaded from ``MODEL_ID``. Subsequent
        calls return the cached pair without reloading.
    """
    # Cache hit: require BOTH keys so a partially populated cache
    # (e.g. after a failed earlier load) cannot raise KeyError here.
    if "tokenizer" in _model_cache and "model" in _model_cache:
        return _model_cache["tokenizer"], _model_cache["model"]

    print(f"π Loading {MODEL_ID} on CPU...")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # Force CPU loading (no CUDA)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,  # float32: CPUs have no fast fp16 path
        device_map="cpu",           # Force CPU
        low_cpu_mem_usage=True,
        offload_folder="./offload",  # Spill weights to disk if RAM is tight
    )
    model.eval()  # Inference only: disable dropout and training-mode layers

    # Cache for future use
    _model_cache["tokenizer"] = tokenizer
    _model_cache["model"] = model

    print("β Model loaded successfully on CPU!")
    print(f" Device: {model.device}")
    print(f" Dtype: {model.dtype}")
    return tokenizer, model
# =================== GENERATION FUNCTION ===================
def generate_text(prompt, max_tokens=80, temperature=0.7):
    """Generate a completion for ``prompt`` with conservative CPU limits.

    Args:
        prompt: Input text to continue.
        max_tokens: Requested new-token budget; hard-capped at 100. May be
            a float (Gradio sliders deliver floats) and is cast to int.
        temperature: Sampling temperature.

    Returns:
        str: The decoded generation (prompt included), or an error message
        string if anything fails — this function never raises, so the
        Gradio UI always gets text back.
    """
    try:
        tokenizer, model = load_model()

        # Tokenize
        inputs = tokenizer(prompt, return_tensors="pt")

        # Generate with very conservative settings
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                # int(): generate() requires an integer token count, but the
                # Gradio slider passes a float. Hard cap at 100 for CPU speed.
                max_new_tokens=int(min(max_tokens, 100)),
                temperature=temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1,
                no_repeat_ngram_size=2,
                # NOTE: early_stopping removed — it only applies to beam
                # search and merely triggers a warning with do_sample=True.
            )

        # Decode
        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return result

    except Exception as e:
        # UI boundary: surface the error as text instead of crashing the app.
        return f"β Error: {str(e)}"
# =================== SIMPLE INTERFACE ===================
def create_interface():
    """Build and return the Gradio Blocks UI for the demo."""
    custom_css = """
    .gradio-container {max-width: 700px !important; margin: auto;}
    """

    with gr.Blocks(
        title="π¦ TinyLlama Demo",
        theme=gr.themes.Soft(),
        css=custom_css,
    ) as demo:
        # Header text
        gr.Markdown("""
        # π¦ TinyLlama Demo (CPU Mode)
        **Model:** [abdelac/tinyllama](https://huggingface.co/abdelac/tinyllama)
        **Hardware:** CPU Only (No GPU required)
        β οΈ **Note:** Running on CPU - responses may be slower
        """)

        # Prompt input
        prompt_box = gr.Textbox(
            label="π Enter your prompt:",
            placeholder="Type here...",
            lines=3,
            value="Once upon a time",
        )

        # Generation controls
        with gr.Row():
            tokens_slider = gr.Slider(
                30, 100, value=60,
                label="π Max Tokens",
                info="Keep β€ 80 for best performance",
            )
            temp_slider = gr.Slider(
                0.1, 1.0, value=0.7,
                label="π‘οΈ Temperature",
            )

        # Action buttons
        with gr.Row():
            run_btn = gr.Button("β¨ Generate", variant="primary")
            reset_btn = gr.Button("ποΈ Clear")

        # Generated text output
        result_box = gr.Textbox(
            label="π Generated Text:",
            lines=6,
        )

        # Clickable example prompts
        gr.Examples(
            examples=[
                ["The future of AI is"],
                ["Write a short story about a cat"],
                ["Explain machine learning simply:"],
                ["The benefits of exercise include"],
            ],
            inputs=prompt_box,
            label="π‘ Try these examples",
        )

        # Wire up the buttons
        run_btn.click(
            fn=generate_text,
            inputs=[prompt_box, tokens_slider, temp_slider],
            outputs=result_box,
        )

        def _clear():
            # Blank out both the prompt and the output fields.
            return "", ""

        reset_btn.click(fn=_clear, inputs=[], outputs=[prompt_box, result_box])

        # Footer
        gr.Markdown("---")
        gr.Markdown("""
        <div style='text-align: center; color: #666; font-size: 0.9em;'>
        β Model loaded on CPU | β‘ Ready for text generation
        </div>
        """)

    return demo
# =================== MAIN ===================
if __name__ == "__main__":
    # Print basic environment info before the (slow) model load.
    print("Starting TinyLlama Demo...")
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")

    app = create_interface()
    # Bind on all interfaces so the Space/container can route traffic in.
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        quiet=False,  # Keep False to see startup messages
        debug=False,
        show_error=True,
    )