File size: 5,398 Bytes
9cb84f0
 
 
 
 
24b8860
664542f
9cb84f0
24b8860
2febca8
664542f
 
24b8860
664542f
 
 
 
 
 
 
24b8860
9cb84f0
664542f
 
 
 
 
9cb84f0
2febca8
9cb84f0
2febca8
664542f
9cb84f0
 
664542f
 
 
 
9cb84f0
 
664542f
 
 
 
 
 
 
2febca8
9cb84f0
24b8860
646620f
664542f
 
9cb84f0
2febca8
 
646620f
664542f
9cb84f0
664542f
9cb84f0
 
 
664542f
9cb84f0
 
2febca8
664542f
 
 
9cb84f0
 
664542f
 
 
9cb84f0
 
 
24b8860
646620f
9cb84f0
664542f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9cb84f0
664542f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9cb84f0
664542f
 
 
c96e7ad
664542f
9cb84f0
664542f
 
 
 
 
 
 
 
 
 
 
 
 
2febca8
 
 
 
 
664542f
 
 
 
 
 
 
 
 
 
 
 
 
 
9cb84f0
 
24b8860
2febca8
9cb84f0
664542f
 
 
 
2febca8
 
 
 
664542f
 
 
 
2febca8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/usr/bin/env python3
"""Minimal Gradio demo that serves the abdelac/tinyllama causal LM on CPU only."""

import os
import sys  # NOTE(review): unused in this file — verify nothing else needs it before removing
import warnings
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr

# =================== CONFIGURATION ===================
# Hub repo id of the model to serve; small enough for CPU-only inference.
MODEL_ID = "abdelac/tinyllama"  # Changed back to TinyLlama for CPU
# NOTE(review): USE_CPU is never read — CPU is actually forced via device_map="cpu" in load_model().
USE_CPU = True  # Force CPU mode

# =================== SUPPRESS WARNINGS ===================
# Silence Python warnings plus TensorFlow / transformers log noise for a clean demo console.
warnings.filterwarnings("ignore")
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["TRANSFORMERS_VERBOSITY"] = "error"

# =================== SIMPLE MODEL CACHE ===================
# Process-wide cache holding "tokenizer" and "model" keys, populated once by load_model().
_model_cache = {}

def load_model():
    """Load the tokenizer/model pair once and serve it from a plain dict cache.

    Returns:
        tuple: (tokenizer, model) — the model is forced onto CPU in float32.
    """
    # Both keys are stored together below, so checking one is sufficient.
    cached_model = _model_cache.get("model")
    if cached_model is not None:
        return _model_cache["tokenizer"], cached_model

    print(f"πŸš€ Loading {MODEL_ID} on CPU...")

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # CPU-only load: float32 (no meaningful fp16 on CPU), explicit cpu placement,
    # reduced peak RAM during load, and a spill directory should weights not fit.
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float32,
        device_map="cpu",
        low_cpu_mem_usage=True,
        offload_folder="./offload",
    )

    # Populate both cache slots atomically from the caller's point of view.
    _model_cache.update(tokenizer=tokenizer, model=model)

    print("βœ… Model loaded successfully on CPU!")
    print(f"   Device: {model.device}")
    print(f"   Dtype: {model.dtype}")

    return tokenizer, model

# =================== GENERATION FUNCTION ===================
def generate_text(prompt, max_tokens=80, temperature=0.7):
    """Generate a completion for *prompt* with conservative CPU-friendly limits.

    Args:
        prompt: Text to complete.
        max_tokens: Requested new-token budget; hard-capped at 100 below.
        temperature: Sampling temperature passed to ``model.generate``.

    Returns:
        str: The decoded completion (prompt included), or an ``"❌ Error: ..."``
        string if anything fails — the UI shows errors as text rather than crashing.
    """
    try:
        tokenizer, model = load_model()

        # Tokenize (input_ids + attention_mask forwarded via **inputs below)
        inputs = tokenizer(prompt, return_tensors="pt")

        # Generate with very conservative settings to keep CPU latency bounded.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=min(max_tokens, 100),  # Hard cap at 100
                temperature=temperature,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.1,
                no_repeat_ngram_size=2,
                # FIX: dropped early_stopping=True — it only applies to beam
                # search (num_beams > 1); with do_sample=True it was ignored
                # and emitted a UserWarning on every generation.
            )

        # Decode the full sequence; special tokens stripped.
        result = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return result

    except Exception as e:
        # Deliberate best-effort boundary: surface the failure in the output box.
        return f"❌ Error: {str(e)}"

# =================== SIMPLE INTERFACE ===================
def create_interface():
    """Build and return the Gradio Blocks UI for the TinyLlama demo.

    Component creation order defines the on-page layout, so statement order
    inside the ``with`` block is significant.
    """
    with gr.Blocks(
        title="πŸ¦™ TinyLlama Demo",
        theme=gr.themes.Soft(),
        css="""
        .gradio-container {max-width: 700px !important; margin: auto;}
        """
    ) as demo:
        
        # Header / model info banner
        gr.Markdown("""
        # πŸ¦™ TinyLlama Demo (CPU Mode)
        
        **Model:** [abdelac/tinyllama](https://huggingface.co/abdelac/tinyllama)
        **Hardware:** CPU Only (No GPU required)
        
        ⚠️ **Note:** Running on CPU - responses may be slower
        """)
        
        # Input — pre-filled so a first click works without typing
        prompt = gr.Textbox(
            label="πŸ“ Enter your prompt:",
            placeholder="Type here...",
            lines=3,
            value="Once upon a time"
        )
        
        # Controls — bounds mirror the hard caps in generate_text (≀100 tokens)
        with gr.Row():
            max_tokens = gr.Slider(
                30, 100, value=60,
                label="πŸ“ Max Tokens",
                info="Keep ≀ 80 for best performance"
            )
            temperature = gr.Slider(
                0.1, 1.0, value=0.7,
                label="🌑️ Temperature"
            )
        
        # Buttons
        with gr.Row():
            generate_btn = gr.Button(
                "✨ Generate",
                variant="primary"
            )
            clear_btn = gr.Button("πŸ—‘οΈ Clear")
        
        # Output
        output = gr.Textbox(
            label="πŸ“„ Generated Text:",
            lines=6
        )
        
        # Examples — clicking one fills the prompt box
        gr.Examples(
            examples=[
                ["The future of AI is"],
                ["Write a short story about a cat"],
                ["Explain machine learning simply:"],
                ["The benefits of exercise include"]
            ],
            inputs=prompt,
            label="πŸ’‘ Try these examples"
        )
        
        # Actions: wire the buttons to their callbacks
        generate_btn.click(
            fn=generate_text,
            inputs=[prompt, max_tokens, temperature],
            outputs=output
        )
        
        # Clear resets both the prompt and the output to empty strings
        clear_btn.click(
            fn=lambda: ("", ""),
            inputs=[],
            outputs=[prompt, output]
        )
        
        # Footer
        gr.Markdown("---")
        gr.Markdown("""
        <div style='text-align: center; color: #666; font-size: 0.9em;'>
        βœ… Model loaded on CPU | ⚑ Ready for text generation
        </div>
        """)
    
    return demo

# =================== MAIN ===================
if __name__ == "__main__":
    # Entry point: report environment, build the UI, and serve it.
    print("Starting TinyLlama Demo...")
    print(f"PyTorch version: {torch.__version__}")
    print(f"CUDA available: {torch.cuda.is_available()}")  # informational only; loading is pinned to CPU
    
    demo = create_interface()
    demo.launch(
        server_name="0.0.0.0",   # listen on all interfaces (needed for containers / HF Spaces)
        server_port=7860,        # Gradio's conventional port
        share=False,             # no public tunnel
        quiet=False,  # Keep False to see startup messages
        debug=False,
        show_error=True          # show Python errors in the browser UI
    )