hamxaameer committed on
Commit 06dde32 · verified · 1 Parent(s): 4b54bb9

Update app.py

Files changed (1)
  1. app.py +52 -53
app.py CHANGED
@@ -61,11 +61,11 @@ CONFIG = {
     "max_tokens": 600, # Allow natural length responses
 }
 
-# Local PHI model configuration for Hugging Face Spaces
-# PHI-2 is optimal for CPU deployment: 2.7B parameters, excellent quality
-# Can be swapped with Phi-3-mini-4k-instruct if more memory is available
-LOCAL_PHI_MODEL = os.environ.get("LOCAL_PHI_MODEL", "microsoft/phi-2")
-USE_8BIT_QUANTIZATION = False # DISABLED: causes hanging on CPU
+# Local LLM configuration for Hugging Face Spaces
+# TinyLlama: 1.1B parameters, fast on CPU, reliable generation
+# Alternative: google/flan-t5-base (smaller, faster)
+LOCAL_LLM_MODEL = os.environ.get("LOCAL_LLM_MODEL", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+USE_8BIT_QUANTIZATION = False # Not needed for TinyLlama
 USE_REMOTE_LLM = False
 
 # Natural flow mode: No word limits, let model decide length
@@ -95,16 +95,13 @@ if HF_INFERENCE_API_KEY:
 # ============================================================================
 
 def initialize_llm():
-    """Initialize PHI model locally with CPU optimizations for Hugging Face Spaces.
+    """Initialize TinyLlama model locally with CPU optimizations.
 
-    Uses efficient techniques:
-    - 8-bit quantization to reduce memory by ~50%
-    - CPU-optimized loading with device_map
-    - Lazy loading and minimal memory footprint
+    TinyLlama is fast, reliable, and works well on CPU without device issues.
     """
-    global LOCAL_PHI_MODEL, USE_8BIT_QUANTIZATION
+    global LOCAL_LLM_MODEL, USE_8BIT_QUANTIZATION
 
-    logger.info(f"🔄 Initializing local PHI model: {LOCAL_PHI_MODEL}")
+    logger.info(f"🔄 Initializing local LLM: {LOCAL_LLM_MODEL}")
     logger.info(" Using CPU-optimized configuration for Hugging Face Spaces")
 
     try:
@@ -114,37 +111,33 @@ def initialize_llm():
         device = "cuda" if torch.cuda.is_available() else "cpu"
         logger.info(f" Target device: {device}")
 
-        # Load tokenizer (lightweight)
+        # Load tokenizer
         logger.info(" Loading tokenizer...")
         tokenizer = AutoTokenizer.from_pretrained(
-            LOCAL_PHI_MODEL,
-            trust_remote_code=True,
-            use_fast=True
+            LOCAL_LLM_MODEL,
+            trust_remote_code=True
         )
 
-        # Configure tokenizer for PHI models
+        # Configure tokenizer
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
         if tokenizer.pad_token_id is None:
             tokenizer.pad_token_id = tokenizer.eos_token_id
 
-        logger.info(f" Tokenizer configured: vocab_size={len(tokenizer)}, eos_token={tokenizer.eos_token}")
-
-        # Configure model loading for CPU efficiency (NO quantization)
-        model_kwargs = {
-            "trust_remote_code": True,
-            "low_cpu_mem_usage": True,
-            "torch_dtype": torch.float32, # CPU works best with float32
-            "device_map": "auto", # Let transformers handle device placement
-        }
+        logger.info(f" Tokenizer ready: {len(tokenizer)} tokens")
 
-        # Load the model with optimization
-        logger.info(" Loading PHI model (this may take 30-60 seconds)...")
+        # Load model - simple CPU configuration
+        logger.info(" Loading model (20-40 seconds)...")
         model = AutoModelForCausalLM.from_pretrained(
-            LOCAL_PHI_MODEL,
-            **model_kwargs
+            LOCAL_LLM_MODEL,
+            trust_remote_code=True,
+            torch_dtype=torch.float32,
+            low_cpu_mem_usage=True
        )
 
+        # Move to CPU explicitly
+        model = model.to('cpu')
+
         # Apply advanced optimizations for faster inference
         if hasattr(model, 'config'):
             # Reduce attention heads computation for speed
@@ -163,64 +156,70 @@ def initialize_llm():
         logger.info(" Configuring direct model inference (faster than pipeline)...")
 
         # Create a simple wrapper that mimics pipeline interface
-        class FastPHIGenerator:
+        class FastLLMGenerator:
             def __init__(self, model, tokenizer):
                 self.model = model
                 self.tokenizer = tokenizer
 
             def __call__(self, prompt, max_new_tokens=150, temperature=0.7, top_p=0.9,
                          do_sample=True, repetition_penalty=1.1, **kwargs):
-                """Direct generation - faster than pipeline"""
+                """Direct generation - faster and more reliable"""
                 try:
                     # Tokenize
-                    inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
-                    input_ids = inputs["input_ids"]
+                    inputs = self.tokenizer(prompt, return_tensors="pt", truncation=True, max_length=400)
+                    input_ids = inputs["input_ids"].to('cpu')
+                    attention_mask = inputs.get("attention_mask", None)
+                    if attention_mask is not None:
+                        attention_mask = attention_mask.to('cpu')
 
                     # Generate
                     with torch.no_grad():
                         outputs = self.model.generate(
                             input_ids,
+                            attention_mask=attention_mask,
                             max_new_tokens=max_new_tokens,
-                            temperature=temperature,
-                            top_p=top_p,
+                            temperature=temperature if do_sample else 1.0,
+                            top_p=top_p if do_sample else 1.0,
                             do_sample=do_sample,
                             repetition_penalty=repetition_penalty,
-                            pad_token_id=self.tokenizer.eos_token_id,
-                            eos_token_id=self.tokenizer.eos_token_id,
-                            early_stopping=True
+                            pad_token_id=self.tokenizer.pad_token_id,
+                            eos_token_id=self.tokenizer.eos_token_id
                         )
 
                     # Decode only the new tokens
                     generated_ids = outputs[0][input_ids.shape[1]:]
                     generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
 
-                    return [{"generated_text": generated_text}]
+                    return [{"generated_text": generated_text.strip()}]
 
                 except Exception as e:
                     logger.error(f"Generation error: {e}")
+                    import traceback
+                    logger.error(traceback.format_exc())
                     return [{"generated_text": ""}]
 
-        llm_client = FastPHIGenerator(model, tokenizer)
+        llm_client = FastLLMGenerator(model, tokenizer)
         llm_client.tokenizer = tokenizer # Add tokenizer reference for compatibility
 
-        CONFIG["llm_model"] = LOCAL_PHI_MODEL
-        CONFIG["model_type"] = "phi_local"
+        CONFIG["llm_model"] = LOCAL_LLM_MODEL
+        CONFIG["model_type"] = "tinyllama_local"
 
-        logger.info(f"✅ PHI model initialized successfully: {LOCAL_PHI_MODEL}")
-        logger.info(f" Model size: ~2.7B parameters (PHI-2) or ~3.8B (PHI-3)")
-        logger.info(f" Memory optimization: {'8-bit quantization' if USE_8BIT_QUANTIZATION else 'float32'}")
+        logger.info(f"✅ LLM initialized successfully: {LOCAL_LLM_MODEL}")
+        logger.info(f" Model size: 1.1B parameters")
+        logger.info(f" Expected speed: 5-15 seconds per response on CPU")
 
         return llm_client
 
     except ImportError as ie:
         logger.error(f"❌ Missing required library: {ie}")
-        logger.info(" Install with: pip install transformers accelerate bitsandbytes")
+        logger.info(" Install with: pip install transformers torch")
         raise
     except Exception as e:
-        logger.error(f"❌ Failed to load PHI model: {str(e)}")
-        logger.info(" This may be due to insufficient memory on the Space")
-        logger.info(" Try using a smaller model or enabling 8-bit quantization")
-        raise Exception(f"Failed to initialize PHI LLM: {str(e)}")
+        logger.error(f"❌ Failed to load LLM: {str(e)}")
+        logger.info(" This may be due to insufficient memory")
+        import traceback
+        logger.error(traceback.format_exc())
+        raise Exception(f"Failed to initialize LLM: {str(e)}")
 
 
 def remote_generate(prompt: str, max_new_tokens: int = 512, temperature: float = 0.7, top_p: float = 0.9) -> str:
@@ -715,7 +714,7 @@ def generate_llm_answer(
     # Ultra-simple prompt
    formatted_prompt = f"{prompt}\n\nAnswer:"
 
-    logger.info(f" → PHI-2 generating (max_tokens={max_new_tokens})")
+    logger.info(f" → Generating with TinyLlama (max_tokens={max_new_tokens})")
 
     # MINIMAL settings - most restrictive for speed
    out = llm_client(
@@ -744,7 +743,7 @@
    gen_thread.join(timeout=45) # 45 second timeout
 
    if gen_thread.is_alive():
-        logger.error(" ✗ Generation TIMEOUT after 45s")
+        logger.error(" ✗ TIMEOUT after 45s - model may be too slow")
        return ''
 
    if result_container['error']:
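
For readers who want to try the generation path this commit switches to in isolation (tokenize, generate under torch.no_grad(), decode only the newly produced tokens), here is a minimal standalone sketch. The model ID matches the new default in the diff; the prompt and sampling settings are illustrative, not the app's exact values.

# Hedged sketch: exercising the TinyLlama generation path from this commit in
# isolation. Model ID is the diff's new default; prompt and sampling values
# are illustrative only.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # same fallback as in the diff

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,   # CPU-friendly dtype, as in the commit
    low_cpu_mem_usage=True,
).to("cpu")

prompt = "Question: What does this Space do?\n\nAnswer:"
inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=400)

with torch.no_grad():
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=150,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

# Decode only the tokens produced after the prompt, as the wrapper does.
new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
print(tokenizer.decode(new_tokens, skip_special_tokens=True).strip())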
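
The unchanged context lines around the last hunk show generation running in a worker thread that is joined with a 45-second timeout. Below is a sketch of that pattern; helper names (generate_with_timeout, run_generation, result_container) are assumptions for illustration, not necessarily the app's identifiers.

# Hedged sketch of the thread-plus-timeout guard visible in the context lines
# (gen_thread.join(timeout=45)). Helper names are assumptions.
import threading

def generate_with_timeout(llm_client, prompt, timeout_s=45):
    result_container = {"output": None, "error": None}

    def run_generation():
        try:
            # llm_client follows the wrapper's interface: it returns
            # [{"generated_text": "..."}]
            result_container["output"] = llm_client(prompt, max_new_tokens=150)
        except Exception as exc:
            result_container["error"] = exc

    gen_thread = threading.Thread(target=run_generation, daemon=True)
    gen_thread.start()
    gen_thread.join(timeout=timeout_s)

    if gen_thread.is_alive():
        # Still running past the budget: give up and return empty, like the app.
        return ""
    if result_container["error"] is not None:
        raise result_container["error"]
    return result_container["output"][0]["generated_text"]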
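
Finally, because the new configuration reads the model ID from the environment (LOCAL_LLM_MODEL in the first hunk), the backbone can be swapped without editing app.py. A minimal sketch, assuming the variable is set before the app reads its CONFIG; the value shown simply repeats the diff's default.

# Hedged sketch: override the default model via the environment before launch.
import os

os.environ.setdefault("LOCAL_LLM_MODEL", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")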