navidfalah commited on
Commit
4385b80
·
verified ·
1 Parent(s): e0b652f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +197 -62
app.py CHANGED
@@ -2,86 +2,153 @@ import gradio as gr
2
  import torch
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import os
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  # Configuration for CPU optimization
7
  class Config:
8
  MODEL_PATH = "navidfalah/3ai"
9
  BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.1"
10
- MAX_NEW_TOKENS = 150 # Much shorter for faster generation
11
  TEMPERATURE = 0.7
12
  TOP_P = 0.9
13
- MAX_INPUT_LENGTH = 256 # Shorter input for faster processing
 
14
 
15
  # Global variables
16
  model = None
17
  tokenizer = None
 
 
 
 
 
 
 
18
 
19
  def load_model_cpu_optimized():
20
- """Load model optimized for CPU inference."""
21
- global model, tokenizer
22
 
23
  if model is not None and tokenizer is not None:
 
24
  return model, tokenizer
25
 
 
 
26
  try:
27
- print("Loading tokenizer...")
28
- tokenizer = AutoTokenizer.from_pretrained(Config.BASE_MODEL)
29
- if tokenizer.pad_token is None:
 
 
 
 
 
 
 
 
 
30
  tokenizer.pad_token = tokenizer.eos_token
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
- print("Loading model for CPU...")
33
- # Load in float32 for CPU (no quantization)
34
- model = AutoModelForCausalLM.from_pretrained(
35
- Config.BASE_MODEL,
36
- torch_dtype=torch.float32, # Use float32 for CPU
37
- low_cpu_mem_usage=True,
38
- device_map="cpu" # Force CPU
39
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
- model.eval()
42
- print("✅ Model loaded on CPU")
43
  return model, tokenizer
44
 
45
  except Exception as e:
46
- print(f"Error loading model: {e}")
47
- # Try a smaller model as fallback
48
- try:
49
- print("Trying smaller model fallback...")
50
- model = AutoModelForCausalLM.from_pretrained(
51
- "gpt2", # Much smaller fallback model
52
- torch_dtype=torch.float32
53
- )
54
- tokenizer = AutoTokenizer.from_pretrained("gpt2")
55
- tokenizer.pad_token = tokenizer.eos_token
56
- model.eval()
57
- print("✅ Loaded fallback model (GPT-2)")
58
- return model, tokenizer
59
- except:
60
- return None, None
61
 
62
- def analyze_text(user_input):
63
- """Simple and fast text analysis."""
 
 
64
  if not user_input.strip():
65
- return "Please enter some text to analyze."
 
 
 
 
 
66
 
 
 
67
  model, tokenizer = load_model_cpu_optimized()
 
68
 
69
  if model is None or tokenizer is None:
70
- return "Error: Could not load model. Please try again."
 
 
71
 
72
  try:
73
- # Simple prompt - no complex formatting
74
- prompt = f"Analyze this life situation and provide brief advice: {user_input}\n\nAnalysis:"
 
75
 
76
- # Tokenize with minimal length
 
77
  inputs = tokenizer(
78
  prompt,
79
  return_tensors="pt",
80
  truncation=True,
81
  max_length=Config.MAX_INPUT_LENGTH
82
  )
 
 
 
 
 
 
 
 
 
 
 
83
 
84
- # Generate with aggressive settings for speed
85
  with torch.no_grad():
86
  outputs = model.generate(
87
  **inputs,
@@ -89,61 +156,129 @@ def analyze_text(user_input):
89
  temperature=Config.TEMPERATURE,
90
  do_sample=True,
91
  pad_token_id=tokenizer.eos_token_id,
92
- early_stopping=True, # Stop as soon as possible
93
- num_beams=1 # No beam search for speed
 
94
  )
95
 
 
 
 
 
 
 
 
96
  response = tokenizer.decode(outputs[0], skip_special_tokens=True)
 
97
 
98
- # Extract only the generated part
99
  result = response[len(prompt):].strip()
100
 
101
  if not result:
102
- result = "Analysis: Based on your input, I recommend focusing on balance and gradual improvements."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
- return result
 
 
105
 
106
  except Exception as e:
107
- return f"Error: {str(e)}"
 
 
 
108
 
109
- # Simple Gradio Interface
110
- with gr.Blocks(title="Quick Life Analysis", css="footer {display: none !important}") as demo:
111
- gr.Markdown("# Quick Life Satisfaction Analysis")
112
- gr.Markdown("Enter your situation and get instant AI advice (optimized for CPU)")
 
 
 
113
 
114
  with gr.Row():
115
  with gr.Column():
116
  input_text = gr.Textbox(
117
  label="Your Input",
118
- placeholder="Example: I'm stressed at work (3/10) but happy with family (8/10)...",
119
- lines=4
120
  )
121
- submit_btn = gr.Button("Analyze", variant="primary")
 
 
 
 
 
 
 
 
 
 
 
122
 
123
  with gr.Column():
124
  output_text = gr.Textbox(
125
  label="AI Analysis",
126
- lines=6,
127
  interactive=False
128
  )
 
 
 
 
129
 
130
- # Simple examples
131
  gr.Examples(
132
  examples=[
133
- "Work stress is high, health is okay, finances tight",
134
- "Happy with job but no work-life balance",
135
- "Good health and relationships but career is stagnant"
136
  ],
137
- inputs=input_text
 
138
  )
139
 
 
140
  submit_btn.click(
141
  fn=analyze_text,
142
  inputs=input_text,
143
- outputs=output_text
 
 
 
 
 
144
  )
145
 
146
  if __name__ == "__main__":
147
- print("Starting CPU-optimized app...")
148
- print("Note: First generation will be slow due to model loading")
 
 
 
 
 
 
 
 
 
 
 
 
149
  demo.launch()
 
2
  import torch
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
  import os
5
+ import time
6
+ import logging
7
+ from datetime import datetime
8
+
9
# Set up logging
# Configured once at import time so the model-loading and per-request timing
# messages below are visible on the console (Spaces log output).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%H:%M:%S'  # time-only timestamps; dates add noise for a demo app
)
# Module-level logger, stdlib convention (named after this module).
logger = logging.getLogger(__name__)
16
 
17
# Configuration for CPU optimization
class Config:
    """Tunable knobs for CPU-only inference; smaller values mean faster runs."""
    MODEL_PATH = "navidfalah/3ai"  # fine-tuned repo; NOTE(review): not referenced by the loader — confirm whether it should be
    BASE_MODEL = "mistralai/Mistral-7B-Instruct-v0.1"  # full model, used only when USE_SMALL_MODEL is False
    MAX_NEW_TOKENS = 50  # Very short for speed
    TEMPERATURE = 0.7  # sampling temperature for generate()
    TOP_P = 0.9  # NOTE(review): defined but never passed to generate() — confirm intent
    MAX_INPUT_LENGTH = 128  # Very short input
    USE_SMALL_MODEL = True  # Set to True to use tiny model (gpt2) instead of Mistral
26
 
27
# Global variables — lazy cache populated by load_model_cpu_optimized()
model = None
tokenizer = None
model_load_time = None  # seconds spent on the most recent full model load

def log_time(start_time, operation):
    """Log and return the wall-clock seconds elapsed since *start_time*.

    Args:
        start_time: a ``time.time()`` timestamp taken before the operation.
        operation: human-readable name used in the log line.

    Returns:
        Elapsed time in seconds (float).
    """
    elapsed = time.time() - start_time
    # Lazy %-style args: the message is only formatted if INFO is enabled.
    logger.info("%s took %.2f seconds", operation, elapsed)
    return elapsed
37
 
38
def load_model_cpu_optimized():
    """Load (and cache) the model/tokenizer for CPU inference, with timing logs.

    Uses the module-level ``model``/``tokenizer`` globals as a cache so repeated
    calls after the first are free. When ``Config.USE_SMALL_MODEL`` is True a
    tiny GPT-2 model is loaded for fast CPU inference; otherwise the full
    Mistral base model is used (very slow on CPU).

    Returns:
        ``(model, tokenizer)`` on success, ``(None, None)`` on failure.
    """
    global model, tokenizer, model_load_time

    # Fast path: reuse the pair cached by a previous call.
    if model is not None and tokenizer is not None:
        logger.info("Model already loaded, using cached version")
        return model, tokenizer

    total_start = time.time()

    try:
        if Config.USE_SMALL_MODEL:
            logger.info("Loading small model for fast CPU inference...")

            # Option 1: Use GPT-2 small (fastest) — only 124M parameters.
            model_name = "gpt2"
            # Option 2: Use DistilGPT-2 (even faster) — only 82M parameters.
            # model_name = "distilgpt2"

            start = time.time()
            tokenizer = AutoTokenizer.from_pretrained(model_name)
            # GPT-2 has no pad token; reuse EOS so padding in generate() works.
            tokenizer.pad_token = tokenizer.eos_token
            log_time(start, "Tokenizer loading")

            start = time.time()
            model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float32,  # float32: CPUs lack fast half-precision
                low_cpu_mem_usage=True
            )
            model.eval()
            log_time(start, "Model loading")

            logger.info(f"✅ Loaded {model_name} for fast inference")

        else:
            # Load full model (will be slow on CPU)
            logger.warning("Loading full Mistral model - this will be VERY slow on CPU!")
            logger.warning("Consider setting USE_SMALL_MODEL = True")

            start = time.time()
            tokenizer = AutoTokenizer.from_pretrained(Config.BASE_MODEL)
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
            log_time(start, "Tokenizer loading")

            start = time.time()
            model = AutoModelForCausalLM.from_pretrained(
                Config.BASE_MODEL,
                torch_dtype=torch.float32,
                low_cpu_mem_usage=True,
                device_map="cpu"
            )
            model.eval()
            log_time(start, "Model loading")

        model_load_time = log_time(total_start, "Total model loading")
        logger.info(f"Model size: ~{sum(p.numel() for p in model.parameters()) / 1e6:.0f}M parameters")

        return model, tokenizer

    except Exception as e:
        logger.error(f"Failed to load model: {e}")
        # Fix: reset BOTH globals so a half-completed load (e.g. tokenizer
        # downloaded but the model fetch failed) is not left in a partially
        # assigned state that a later retry could misinterpret.
        model = None
        tokenizer = None
        return None, None
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
def analyze_text(user_input, progress=gr.Progress()):
    """Run a short LLM analysis of *user_input* and report per-stage timings.

    Args:
        user_input: free-text description of the user's situation.
        progress: Gradio progress tracker (injected by Gradio at call time).

    Returns:
        Tuple of two strings: ``(analysis_text, timing_report_markdown)``.
    """
    start_time = time.time()

    if not user_input.strip():
        return "Please enter some text to analyze.", "No input provided"

    logger.info(f"Starting analysis for input: {user_input[:50]}...")

    # Update progress
    progress(0.1, desc="Loading model...")

    # Load model with timing (near-instant after the first call, cached).
    model_start = time.time()
    model, tokenizer = load_model_cpu_optimized()
    model_time = time.time() - model_start

    if model is None or tokenizer is None:
        return "Error: Could not load model.", f"Model loading failed after {model_time:.2f}s"

    progress(0.3, desc="Model loaded, preparing input...")

    try:
        # Simple prompt
        prompt = f"Life advice for: {user_input}\n\nAdvice:"
        logger.info(f"Prompt length: {len(prompt)} characters")

        # Tokenize with timing
        tokenize_start = time.time()
        inputs = tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=Config.MAX_INPUT_LENGTH
        )
        tokenize_time = log_time(tokenize_start, "Tokenization")

        progress(0.5, desc="Generating response...")

        # Log input details
        input_ids = inputs['input_ids']
        logger.info(f"Input tokens: {input_ids.shape[1]}")

        # Generate with timing
        gen_start = time.time()
        logger.info(f"Starting generation with max {Config.MAX_NEW_TOKENS} new tokens...")

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=Config.MAX_NEW_TOKENS,
                temperature=Config.TEMPERATURE,
                top_p=Config.TOP_P,  # fix: Config.TOP_P was defined but never used
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                # fix: dropped early_stopping=True — it only applies to beam
                # search and merely triggers a warning with num_beams=1.
                num_beams=1,
                use_cache=True  # Enable KV cache
            )

        gen_time = log_time(gen_start, "Generation")
        new_tokens = outputs.shape[1] - input_ids.shape[1]
        logger.info(f"Generated {new_tokens} new tokens")

        progress(0.8, desc="Decoding response...")

        # Decode with timing. Slice by token count rather than
        # response[len(prompt):] — character slicing breaks whenever decoding
        # does not reproduce the prompt byte-for-byte.
        decode_start = time.time()
        result = tokenizer.decode(
            outputs[0][input_ids.shape[1]:], skip_special_tokens=True
        ).strip()
        decode_time = log_time(decode_start, "Decoding")

        if not result:
            result = "Based on your input, I recommend focusing on balance and gradual improvements."

        # Total time
        total_time = time.time() - start_time
        logger.info(f"Total analysis time: {total_time:.2f}s")

        # Create timing report. Fix: '(cached)' now labels the FAST reload
        # (model_time < 1s); previously the condition was inverted and the
        # slow first load was the one marked as cached.
        timing_report = f"""### Timing Report
- Model Load: {model_time:.2f}s {'(cached)' if model_time < 1 else ''}
- Tokenization: {tokenize_time:.2f}s
- Generation: {gen_time:.2f}s ({Config.MAX_NEW_TOKENS} tokens)
- Decoding: {decode_time:.2f}s
- **Total: {total_time:.2f}s**

Model: {model.__class__.__name__}
Input tokens: {input_ids.shape[1]}
Output tokens: {new_tokens}
"""

        progress(1.0, desc="Complete!")

        return result, timing_report

    except Exception as e:
        error_msg = f"Error during analysis: {str(e)}"
        logger.error(error_msg)
        total_time = time.time() - start_time
        return error_msg, f"Failed after {total_time:.2f}s\nError: {str(e)}"
206
 
207
# Simple Gradio Interface with timing display.
# Component creation order inside the `with` blocks defines the page layout.
with gr.Blocks(title="Fast CPU Analysis", theme=gr.themes.Base()) as demo:
    gr.Markdown("""
    # Fast Life Analysis (CPU Optimized)

    ⚡ Using small model for fast CPU inference. First run will be slower due to model loading.
    """)

    with gr.Row():
        # Left column: user input, action buttons, and usage tips.
        with gr.Column():
            input_text = gr.Textbox(
                label="Your Input",
                placeholder="Describe your situation briefly...",
                lines=3
            )

            with gr.Row():
                submit_btn = gr.Button("🚀 Analyze (Fast)", variant="primary")
                clear_btn = gr.Button("Clear")

            # Model selection
            gr.Markdown("""
            **Tips for faster response:**
            - Keep input short (< 50 words)
            - First analysis is slowest (model loading)
            - Subsequent analyses are much faster
            """)

        # Right column: model output plus the per-stage timing report.
        with gr.Column():
            output_text = gr.Textbox(
                label="AI Analysis",
                lines=4,
                interactive=False
            )
            # NOTE(review): gr.Markdown may ignore `label=` on some Gradio
            # versions — confirm it renders as intended.
            timing_info = gr.Markdown(
                label="Performance Metrics",
                value="*Timing information will appear here*"
            )

    # Examples
    gr.Examples(
        examples=[
            "Stressed at work, need balance",
            "Happy but financially worried",
            "Good job, poor health"
        ],
        inputs=input_text,
        label="Quick Examples (short = faster)"
    )

    # Event handlers.
    # analyze_text returns (analysis, timing_report), matching the two outputs.
    submit_btn.click(
        fn=analyze_text,
        inputs=input_text,
        outputs=[output_text, timing_info]
    )

    # Clear resets input, output, and the timing panel back to its placeholder.
    clear_btn.click(
        fn=lambda: ("", "", "*Timing information will appear here*"),
        outputs=[input_text, output_text, timing_info]
    )
268
 
269
if __name__ == "__main__":
    # Startup banner logging the settings that matter for performance.
    logger.info("="*50)
    logger.info("Starting Fast CPU-Optimized App")
    logger.info(f"PyTorch version: {torch.__version__}")
    logger.info(f"Using small model: {Config.USE_SMALL_MODEL}")
    logger.info(f"Max new tokens: {Config.MAX_NEW_TOKENS}")
    logger.info("="*50)

    # Pre-load model for faster first inference (first user request then
    # skips the expensive download/initialization path).
    logger.info("Pre-loading model...")
    pre_load_start = time.time()
    load_model_cpu_optimized()
    logger.info(f"Model pre-loaded in {time.time() - pre_load_start:.2f}s")

    # Enable Gradio's request queue — presumably required for the
    # gr.Progress updates in analyze_text to stream; confirm against the
    # installed Gradio version.
    demo.queue()
    demo.launch()