eesfeg committed on
Commit
664542f
·
1 Parent(s): b16fade

requirements

Browse files
Files changed (2) hide show
  1. app.py +123 -58
  2. requirements.txt +5 -5
app.py CHANGED
@@ -3,117 +3,182 @@
3
  import os
4
  import sys
5
  import warnings
6
- import functools
7
  import torch
8
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
9
  import gradio as gr
10
 
11
  # =================== CONFIGURATION ===================
12
- MODEL_ID = "abdelac/Mistral_Test"
13
- USE_QUANTIZATION = True
14
 
15
- # =================== QUANTIZATION SETUP ===================
16
- if USE_QUANTIZATION:
17
- bnb_config = BitsAndBytesConfig(
18
- load_in_4bit=True,
19
- bnb_4bit_quant_type="nf4",
20
- bnb_4bit_compute_dtype=torch.float16,
21
- bnb_4bit_use_double_quant=True,
22
- )
23
- else:
24
- bnb_config = None
25
 
26
- # =================== MODEL LOADING ===================
27
- @functools.lru_cache(maxsize=1) # Cache the model loading
28
  def load_model():
29
- """Load Mistral model with quantization"""
30
- print(f"🚀 Loading {MODEL_ID}...")
 
 
 
31
 
32
  # Load tokenizer
33
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
34
 
35
- # Configure model loading
36
- load_kwargs = {
37
- "torch_dtype": torch.float16,
38
- "device_map": "auto",
39
- "low_cpu_mem_usage": True,
40
- }
41
-
42
- if USE_QUANTIZATION:
43
- load_kwargs["quantization_config"] = bnb_config
44
- print("✅ Using 4-bit quantization")
45
- else:
46
- load_kwargs["device_map"] = "cpu"
47
- print("⚠️ Using CPU only")
48
-
49
- # Load model
50
  model = AutoModelForCausalLM.from_pretrained(
51
  MODEL_ID,
52
- **load_kwargs
 
 
 
53
  )
54
 
55
- # Set padding token
56
- if tokenizer.pad_token is None:
57
- tokenizer.pad_token = tokenizer.eos_token
 
 
 
 
58
 
59
- print("✅ Model loaded!")
60
  return tokenizer, model
61
 
62
  # =================== GENERATION FUNCTION ===================
63
- def generate_text(prompt, max_tokens=100, temperature=0.7):
64
- """Generate text with memory constraints"""
65
  try:
66
  tokenizer, model = load_model()
67
 
68
  # Tokenize
69
- inputs = tokenizer(
70
- prompt,
71
- return_tensors="pt",
72
- truncation=True,
73
- max_length=512
74
- ).to(model.device)
75
 
76
- # Generate
77
  with torch.no_grad():
78
  outputs = model.generate(
79
  **inputs,
80
- max_new_tokens=min(max_tokens, 150),
81
  temperature=temperature,
82
  do_sample=True,
83
  pad_token_id=tokenizer.eos_token_id,
 
 
 
84
  )
85
 
86
- return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
 
87
 
88
  except Exception as e:
89
  return f"❌ Error: {str(e)}"
90
 
91
  # =================== SIMPLE INTERFACE ===================
92
  def create_interface():
93
- with gr.Blocks(title="Mistral Demo") as demo:
94
- gr.Markdown(f"# 🦅 {MODEL_ID}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
 
96
- prompt = gr.Textbox(label="Prompt", lines=3, value="Hello")
97
- max_tokens = gr.Slider(30, 150, value=80, label="Max Tokens")
98
- temperature = gr.Slider(0.1, 1.0, value=0.7, label="Temperature")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
 
100
- generate_btn = gr.Button("Generate", variant="primary")
101
- output = gr.Textbox(label="Output", lines=6)
 
 
 
 
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  generate_btn.click(
104
  fn=generate_text,
105
  inputs=[prompt, max_tokens, temperature],
106
  outputs=output
107
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
  return demo
110
 
111
  # =================== MAIN ===================
112
  if __name__ == "__main__":
113
- warnings.filterwarnings("ignore")
 
 
 
114
  demo = create_interface()
115
  demo.launch(
116
  server_name="0.0.0.0",
117
  server_port=7860,
118
- quiet=True
 
 
 
119
  )
 
3
  import os
4
  import sys
5
  import warnings
 
6
  import torch
7
+ from transformers import AutoTokenizer, AutoModelForCausalLM
8
  import gradio as gr
9
 
10
  # =================== CONFIGURATION ===================
11
+ MODEL_ID = "abdelac/tinyllama" # Changed back to TinyLlama for CPU
12
+ USE_CPU = True # Force CPU mode
13
 
14
+ # =================== SUPPRESS WARNINGS ===================
15
+ warnings.filterwarnings("ignore")
16
+ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
17
+ os.environ["TRANSFORMERS_VERBOSITY"] = "error"
18
+
19
+ # =================== SIMPLE MODEL CACHE ===================
20
+ _model_cache = {}
 
 
 
21
 
 
 
22
  def load_model():
23
+ """Load model with simple caching (no @gr.cache_resource)"""
24
+ if "model" in _model_cache:
25
+ return _model_cache["tokenizer"], _model_cache["model"]
26
+
27
+ print(f"🚀 Loading {MODEL_ID} on CPU...")
28
 
29
  # Load tokenizer
30
  tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
31
 
32
+ # Force CPU loading (no CUDA)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  model = AutoModelForCausalLM.from_pretrained(
34
  MODEL_ID,
35
+ torch_dtype=torch.float32, # Use float32 for CPU
36
+ device_map="cpu", # Force CPU
37
+ low_cpu_mem_usage=True,
38
+ offload_folder="./offload" # Offload if needed
39
  )
40
 
41
+ # Cache for future use
42
+ _model_cache["tokenizer"] = tokenizer
43
+ _model_cache["model"] = model
44
+
45
+ print("✅ Model loaded successfully on CPU!")
46
+ print(f" Device: {model.device}")
47
+ print(f" Dtype: {model.dtype}")
48
 
 
49
  return tokenizer, model
50
 
51
  # =================== GENERATION FUNCTION ===================
52
+ def generate_text(prompt, max_tokens=80, temperature=0.7):
53
+ """Generate text with memory limits"""
54
  try:
55
  tokenizer, model = load_model()
56
 
57
  # Tokenize
58
+ inputs = tokenizer(prompt, return_tensors="pt")
 
 
 
 
 
59
 
60
+ # Generate with very conservative settings
61
  with torch.no_grad():
62
  outputs = model.generate(
63
  **inputs,
64
+ max_new_tokens=min(max_tokens, 100), # Hard cap at 100
65
  temperature=temperature,
66
  do_sample=True,
67
  pad_token_id=tokenizer.eos_token_id,
68
+ repetition_penalty=1.1,
69
+ no_repeat_ngram_size=2,
70
+ early_stopping=True
71
  )
72
 
73
+ # Decode
74
+ result = tokenizer.decode(outputs[0], skip_special_tokens=True)
75
+ return result
76
 
77
  except Exception as e:
78
  return f"❌ Error: {str(e)}"
79
 
80
  # =================== SIMPLE INTERFACE ===================
81
  def create_interface():
82
+ """Create a minimal interface"""
83
+ with gr.Blocks(
84
+ title="🦙 TinyLlama Demo",
85
+ theme=gr.themes.Soft(),
86
+ css="""
87
+ .gradio-container {max-width: 700px !important; margin: auto;}
88
+ """
89
+ ) as demo:
90
+
91
+ gr.Markdown("""
92
+ # 🦙 TinyLlama Demo (CPU Mode)
93
+
94
+ **Model:** [abdelac/tinyllama](https://huggingface.co/abdelac/tinyllama)
95
+ **Hardware:** CPU Only (No GPU required)
96
+
97
+ ⚠️ **Note:** Running on CPU - responses may be slower
98
+ """)
99
+
100
+ # Input
101
+ prompt = gr.Textbox(
102
+ label="📝 Enter your prompt:",
103
+ placeholder="Type here...",
104
+ lines=3,
105
+ value="Once upon a time"
106
+ )
107
 
108
+ # Controls
109
+ with gr.Row():
110
+ max_tokens = gr.Slider(
111
+ 30, 100, value=60,
112
+ label="📏 Max Tokens",
113
+ info="Keep ≤ 80 for best performance"
114
+ )
115
+ temperature = gr.Slider(
116
+ 0.1, 1.0, value=0.7,
117
+ label="🌡️ Temperature"
118
+ )
119
+
120
+ # Buttons
121
+ with gr.Row():
122
+ generate_btn = gr.Button(
123
+ "✨ Generate",
124
+ variant="primary"
125
+ )
126
+ clear_btn = gr.Button("🗑️ Clear")
127
 
128
+ # Output
129
+ output = gr.Textbox(
130
+ label="📄 Generated Text:",
131
+ lines=6,
132
+ show_copy_button=True
133
+ )
134
 
135
+ # Examples
136
+ gr.Examples(
137
+ examples=[
138
+ ["The future of AI is"],
139
+ ["Write a short story about a cat"],
140
+ ["Explain machine learning simply:"],
141
+ ["The benefits of exercise include"]
142
+ ],
143
+ inputs=prompt,
144
+ label="💡 Try these examples"
145
+ )
146
+
147
+ # Actions
148
  generate_btn.click(
149
  fn=generate_text,
150
  inputs=[prompt, max_tokens, temperature],
151
  outputs=output
152
  )
153
+
154
+ clear_btn.click(
155
+ fn=lambda: ("", ""),
156
+ inputs=[],
157
+ outputs=[prompt, output]
158
+ )
159
+
160
+ # Footer
161
+ gr.Markdown("---")
162
+ gr.Markdown("""
163
+ <div style='text-align: center; color: #666; font-size: 0.9em;'>
164
+ ✅ Model loaded on CPU | ⚡ Ready for text generation
165
+ </div>
166
+ """)
167
 
168
  return demo
169
 
170
  # =================== MAIN ===================
171
  if __name__ == "__main__":
172
+ print("Starting TinyLlama Demo...")
173
+ print(f"PyTorch version: {torch.__version__}")
174
+ print(f"CUDA available: {torch.cuda.is_available()}")
175
+
176
  demo = create_interface()
177
  demo.launch(
178
  server_name="0.0.0.0",
179
  server_port=7860,
180
+ share=False,
181
+ quiet=False, # Keep False to see startup messages
182
+ debug=False,
183
+ show_error=True
184
  )
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
- gradio>=3.0.0 # Any version will work
2
- torch>=2.1.0
3
- transformers>=4.35.2
4
- accelerate>=0.25.0
5
- bitsandbytes==0.41.3 # For 4-bit quantization
 
1
+ gradio==4.0.0
2
+ torch==2.1.0
3
+ transformers==4.35.2
4
+ accelerate==0.25.0
5
+ # NO bitsandbytes - we're using CPU only