#!/usr/bin/env python3
import os
import sys
import warnings
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr
# =================== CONFIGURATION ===================
# Hub repo to load; kept small so the app stays usable without a GPU.
MODEL_ID = "abdelac/tinyllama"  # Changed back to TinyLlama for CPU
USE_CPU = True  # Force CPU mode (the model is also pinned to CPU in load_model)
# =================== SUPPRESS WARNINGS ===================
# Silence Python warnings plus TensorFlow / transformers log chatter so the
# console only shows this app's own startup messages.
warnings.filterwarnings("ignore")
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"  # TF log level 3 = errors only
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
# =================== SIMPLE MODEL CACHE ===================
# Module-level cache: {"tokenizer": ..., "model": ...}; filled once by load_model().
_model_cache = {}
def load_model():
    """Load the tokenizer and model once, caching them in-process.

    Returns:
        tuple: ``(tokenizer, model)`` loaded from ``MODEL_ID`` and pinned
        to CPU.

    Uses a plain module-level dict (``_model_cache``) instead of a
    framework cache so repeated generations don't reload the weights.
    """
    # Both entries are written together below, so one key check suffices.
    if "model" in _model_cache:
        return _model_cache["tokenizer"], _model_cache["model"]

    print(f"π Loading {MODEL_ID} on CPU...")

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # Force CPU loading (no CUDA)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,  # float32: CPUs have no fp16 acceleration
        device_map="cpu",           # never place weights on CUDA even if present
        low_cpu_mem_usage=True,
        offload_folder="./offload"  # spill layers to disk if RAM is tight
    )

    # Cache for future calls
    _model_cache["tokenizer"] = tokenizer
    _model_cache["model"] = model

    # FIX: this message was one string literal broken across two source
    # lines (an unterminated string -> SyntaxError); rejoined into one call.
    print("β Model loaded successfully on CPU!")
    print(f" Device: {model.device}")
    print(f" Dtype: {model.dtype}")
    return tokenizer, model
# =================== GENERATION FUNCTION ===================
def generate_text(prompt, max_tokens=80, temperature=0.7):
    """Generate a completion for *prompt* on CPU with conservative limits.

    Args:
        prompt: Input text to continue.
        max_tokens: Requested new-token budget (hard-capped at 100 below).
        temperature: Sampling temperature forwarded to ``generate()``.

    Returns:
        str: Decoded prompt + completion, or an error-message string —
        errors are returned (not raised) so the UI textbox can show them.
    """
    try:
        tokenizer, model = load_model()

        # Tokenize
        inputs = tokenizer(prompt, return_tensors="pt")

        # Inference only: skip autograd bookkeeping to save CPU memory.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=min(max_tokens, 100),  # Hard cap at 100
                temperature=temperature,
                do_sample=True,
                # Fall back to EOS as pad id (model may define no pad token).
                pad_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1,
                no_repeat_ngram_size=2,
            )
        # FIX: dropped early_stopping=True — it only applies to beam search
        # and raises a UserWarning when combined with do_sample=True.

        # Decode (includes the prompt; special tokens stripped)
        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return result
    except Exception as e:
        # Broad catch is deliberate: this is the UI boundary, and the
        # message is rendered in the output box instead of crashing the app.
        return f"β Error: {str(e)}"
# =================== SIMPLE INTERFACE ===================
def create_interface():
    """Build and return the Gradio Blocks UI (not yet launched).

    Components render in declaration order: header, prompt box, sliders,
    buttons, output box, examples, footer. Event wiring is at the bottom.
    """
    with gr.Blocks(
        title="π¦ TinyLlama Demo",
        theme=gr.themes.Soft(),
        # Narrow, centered layout for a single-column demo page.
        css="""
        .gradio-container {max-width: 700px !important; margin: auto;}
        """
    ) as demo:
        # Page header / model info banner
        gr.Markdown("""
        # π¦ TinyLlama Demo (CPU Mode)
        **Model:** [abdelac/tinyllama](https://huggingface.co/abdelac/tinyllama)
        **Hardware:** CPU Only (No GPU required)
        β οΈ **Note:** Running on CPU - responses may be slower
        """)
        # Input: free-form prompt, pre-filled so "Generate" works immediately
        prompt = gr.Textbox(
            label="π Enter your prompt:",
            placeholder="Type here...",
            lines=3,
            value="Once upon a time"
        )
        # Controls: generation parameters, side by side
        with gr.Row():
            # Range 30-100 matches the hard cap enforced in generate_text
            max_tokens = gr.Slider(
                30, 100, value=60,
                label="π Max Tokens",
                info="Keep β€ 80 for best performance"
            )
            temperature = gr.Slider(
                0.1, 1.0, value=0.7,
                label="π‘οΈ Temperature"
            )
        # Buttons
        with gr.Row():
            generate_btn = gr.Button(
                "β¨ Generate",
                variant="primary"
            )
            clear_btn = gr.Button("ποΈ Clear")
        # Output: generated text appears here
        output = gr.Textbox(
            label="π Generated Text:",
            lines=6
        )
        # Examples: clicking one fills the prompt textbox
        gr.Examples(
            examples=[
                ["The future of AI is"],
                ["Write a short story about a cat"],
                ["Explain machine learning simply:"],
                ["The benefits of exercise include"]
            ],
            inputs=prompt,
            label="π‘ Try these examples"
        )
        # Actions: wire buttons to handlers
        generate_btn.click(
            fn=generate_text,
            inputs=[prompt, max_tokens, temperature],
            outputs=output
        )
        # Clear resets both the prompt and the output (tuple maps to outputs)
        clear_btn.click(
            fn=lambda: ("", ""),
            inputs=[],
            outputs=[prompt, output]
        )
        # Footer
        gr.Markdown("---")
        gr.Markdown("""
        <div style='text-align: center; color: #666; font-size: 0.9em;'>
        β
        Model loaded on CPU | β‘ Ready for text generation
        </div>
        """)
    return demo
# =================== MAIN ===================
if __name__ == "__main__":
    # Startup diagnostics; the model itself loads lazily on first request.
    print("Starting TinyLlama Demo...")
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")

    app = create_interface()

    # Bind on all interfaces so a container/Space proxy can reach the server.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
        "quiet": False,  # keep False to see startup messages
        "debug": False,
        "show_error": True,
    }
    app.launch(**launch_options)