# Mistral_Test / app.py
# (Hugging Face page-header residue preserved as comments: "eesfeg's picture",
#  "requirements", commit c96e7ad)
#!/usr/bin/env python3
import os
import sys
import warnings
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr
# =================== CONFIGURATION ===================
# Hugging Face model repo id loaded by load_model() below.
MODEL_ID = "abdelac/tinyllama" # Changed back to TinyLlama for CPU
# NOTE(review): USE_CPU is never read in this file — CPU is instead forced via
# device_map="cpu" in load_model(); flag kept for documentation/compat.
USE_CPU = True # Force CPU mode
# =================== SUPPRESS WARNINGS ===================
warnings.filterwarnings("ignore")
# Silence TensorFlow C++ logging (transformers may import TF) and limit
# transformers' own logging to errors only.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["TRANSFORMERS_VERBOSITY"] = "error"
# =================== SIMPLE MODEL CACHE ===================
# Module-level cache populated by load_model() with keys "tokenizer"/"model"
# so the model is only loaded once per process.
_model_cache = {}
def load_model():
    """Load model with simple caching (no @gr.cache_resource)

    Returns the ``(tokenizer, model)`` pair for ``MODEL_ID``, loading them
    on first call and serving them from the module-level ``_model_cache``
    afterwards. The model is pinned to CPU in float32.
    """
    # Fast path: both entries are stored together, so one key check suffices.
    if "model" in _model_cache:
        return _model_cache["tokenizer"], _model_cache["model"]

    print(f"🚀 Loading {MODEL_ID} on CPU...")

    tok = AutoTokenizer.from_pretrained(MODEL_ID)

    # CPU-only load: float32 (CPU has no fast fp16 path), explicit CPU
    # device map, reduced peak RAM during load, and a spill directory in
    # case weights do not fit in memory.
    mdl = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,  # Use float32 for CPU
        device_map="cpu",           # Force CPU
        low_cpu_mem_usage=True,
        offload_folder="./offload", # Offload if needed
    )

    # Remember both objects for subsequent calls.
    _model_cache.update(tokenizer=tok, model=mdl)

    print("✅ Model loaded successfully on CPU!")
    print(f" Device: {mdl.device}")
    print(f" Dtype: {mdl.dtype}")
    return tok, mdl
# =================== GENERATION FUNCTION ===================
def generate_text(prompt, max_tokens=80, temperature=0.7):
    """Generate text with memory limits.

    Args:
        prompt: User prompt string (Gradio textbox value).
        max_tokens: Requested new-token budget; hard-capped at 100. Gradio
            sliders deliver floats, so this is cast to int before use.
        temperature: Sampling temperature (slider range is 0.1–1.0).

    Returns:
        The decoded generation (prompt included), or an "❌ Error: ..."
        string on failure — this function never raises, so the UI always
        gets displayable text.
    """
    # Guard: an empty/whitespace prompt would just produce unconditioned
    # sampling noise; tell the user instead.
    if not prompt or not prompt.strip():
        return "❌ Error: empty prompt"
    try:
        tokenizer, model = load_model()
        # Tokenize (includes attention_mask, forwarded via **inputs)
        inputs = tokenizer(prompt, return_tensors="pt")
        # Generate with very conservative settings
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                # int() — Gradio sliders yield floats, generate() needs int
                max_new_tokens=min(int(max_tokens), 100),  # Hard cap at 100
                temperature=float(temperature),
                do_sample=True,
                # LLaMA-family tokenizers have no pad token; reuse EOS
                pad_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1,
                no_repeat_ngram_size=2,
                # NOTE: removed early_stopping=True — it only applies to
                # beam search and emits a UserWarning under sampling.
            )
        # Decode
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        # Broad catch is intentional: surface any failure in the UI
        return f"❌ Error: {str(e)}"
# =================== SIMPLE INTERFACE ===================
def create_interface():
    """Create a minimal interface

    Builds the Gradio Blocks UI: one prompt textbox, two sliders
    (token budget, temperature), generate/clear buttons, an output
    textbox, canned examples, and a footer. Returns the Blocks object.
    """
    with gr.Blocks(
        title="🦙 TinyLlama Demo",
        theme=gr.themes.Soft(),
        css="""
.gradio-container {max-width: 700px !important; margin: auto;}
""",
    ) as demo:
        # ---- Header ----
        gr.Markdown("""
# 🦙 TinyLlama Demo (CPU Mode)
**Model:** [abdelac/tinyllama](https://huggingface.co/abdelac/tinyllama)
**Hardware:** CPU Only (No GPU required)
⚠️ **Note:** Running on CPU - responses may be slower
""")

        # ---- Prompt input ----
        prompt_box = gr.Textbox(
            label="📝 Enter your prompt:",
            placeholder="Type here...",
            lines=3,
            value="Once upon a time",
        )

        # ---- Generation controls ----
        with gr.Row():
            tokens_slider = gr.Slider(
                30,
                100,
                value=60,
                label="📏 Max Tokens",
                info="Keep ≤ 80 for best performance",
            )
            temp_slider = gr.Slider(
                0.1,
                1.0,
                value=0.7,
                label="🌡️ Temperature",
            )

        # ---- Action buttons ----
        with gr.Row():
            btn_generate = gr.Button("✨ Generate", variant="primary")
            btn_clear = gr.Button("🗑️ Clear")

        # ---- Output ----
        output_box = gr.Textbox(label="📄 Generated Text:", lines=6)

        # ---- Canned prompts ----
        gr.Examples(
            examples=[
                ["The future of AI is"],
                ["Write a short story about a cat"],
                ["Explain machine learning simply:"],
                ["The benefits of exercise include"],
            ],
            inputs=prompt_box,
            label="💡 Try these examples",
        )

        # ---- Event wiring ----
        btn_generate.click(
            fn=generate_text,
            inputs=[prompt_box, tokens_slider, temp_slider],
            outputs=output_box,
        )

        def _reset():
            # Blank out both the prompt and the output textboxes.
            return "", ""

        btn_clear.click(fn=_reset, inputs=[], outputs=[prompt_box, output_box])

        # ---- Footer ----
        gr.Markdown("---")
        gr.Markdown("""
<div style='text-align: center; color: #666; font-size: 0.9em;'>
✅ Model loaded on CPU | ⚡ Ready for text generation
</div>
""")

    return demo
# =================== MAIN ===================
if __name__ == "__main__":
    # Startup diagnostics: confirm the torch build and (absent) CUDA.
    print("Starting TinyLlama Demo...")
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")

    app = create_interface()
    # Bind on all interfaces for container/Space deployment; keep startup
    # logging visible (quiet=False) and surface errors in the UI.
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        quiet=False,  # Keep False to see startup messages
        debug=False,
        show_error=True,
    )