jdesiree committed on
Commit
05be05b
·
verified ·
1 Parent(s): 9a5ee2e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +84 -173
app.py CHANGED
@@ -235,215 +235,94 @@ logger = logging.getLogger(__name__)
235
  class Qwen25SmallLLM(LLM):
236
  model: Any = None
237
  tokenizer: Any = None
238
-
239
  def __init__(self, model_path: str = "Qwen/Qwen2.5-3B-Instruct", use_4bit: bool = True):
240
  super().__init__()
241
- logger.info(f"Loading model with BitsAndBytes quantization: {model_path}")
242
-
243
- # Configure BitsAndBytes quantization
244
- if use_4bit:
245
- quantization_config = BitsAndBytesConfig(
246
- load_in_4bit=True,
247
- bnb_4bit_compute_dtype=torch.bfloat16, # Use bfloat16 for better performance
248
- bnb_4bit_use_double_quant=True, # Double quantization for additional memory savings
249
- bnb_4bit_quant_type="nf4" # Normal Float 4-bit quantization
250
- )
251
- logger.info("Using 4-bit quantization with BitsAndBytes")
252
- else:
253
- quantization_config = BitsAndBytesConfig(
254
- load_in_8bit=True,
255
- llm_int8_enable_fp32_cpu_offload=True # Offload to CPU if needed
256
- )
257
- logger.info("Using 8-bit quantization with BitsAndBytes")
258
-
259
  try:
260
  # Load tokenizer
261
- self.tokenizer = AutoTokenizer.from_pretrained(
262
- model_path,
263
- trust_remote_code=True
264
- )
265
-
266
- # Load model with quantization
267
- self.model = AutoModelForCausalLM.from_pretrained(
268
- model_path,
269
- quantization_config=quantization_config,
270
- device_map="auto", # Automatically distribute across available devices
271
- torch_dtype=torch.bfloat16, # Use bfloat16 for memory efficiency
272
- trust_remote_code=True,
273
- low_cpu_mem_usage=True, # Reduce CPU memory usage during loading
274
- max_memory={0: "15GB"} if torch.cuda.is_available() else None # Limit GPU memory usage
275
- )
276
-
277
- # Ensure pad token is set
278
- if self.tokenizer.pad_token is None:
279
- self.tokenizer.pad_token = self.tokenizer.eos_token
280
-
281
- logger.info("Model loaded successfully with BitsAndBytes quantization")
282
-
283
- except Exception as e:
284
- logger.error(f"Failed to load model with quantization: {e}")
285
- logger.info("Falling back to standard loading...")
286
- # Fallback to standard loading if quantization fails
287
- self._load_fallback_model(model_path)
288
-
289
- def _load_fallback_model(self, model_path: str):
290
- """Fallback method to load model without quantization if needed."""
291
- try:
292
  self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
  self.model = AutoModelForCausalLM.from_pretrained(
294
  model_path,
295
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
296
- device_map="auto" if torch.cuda.is_available() else None,
 
297
  trust_remote_code=True,
298
  low_cpu_mem_usage=True
299
  )
300
- if self.tokenizer.pad_token is None:
301
- self.tokenizer.pad_token = self.tokenizer.eos_token
302
- logger.info("Model loaded with fallback method")
303
- except Exception as e:
304
- logger.error(f"Fallback model loading also failed: {e}")
305
- raise e
306
-
307
- def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
308
- """Generate text response using the quantized local model."""
309
- try:
310
- # Format the conversation
311
- messages = [
312
- {"role": "system", "content": SYSTEM_PROMPT},
313
- {"role": "user", "content": prompt}
314
- ]
315
-
316
- # Apply chat template
317
- text = self.tokenizer.apply_chat_template(
318
- messages,
319
- tokenize=False,
320
- add_generation_prompt=True
321
- )
322
-
323
- # Tokenize with proper padding
324
- model_inputs = self.tokenizer(
325
- [text],
326
- return_tensors="pt",
327
- padding=True,
328
- truncation=True,
329
- max_length=2048 # Limit input length to prevent memory issues
330
- )
331
-
332
- # Move to model device if available
333
- if torch.cuda.is_available():
334
- model_inputs = {k: v.to(self.model.device) for k, v in model_inputs.items()}
335
-
336
- # Generate with memory-efficient settings
337
- with torch.no_grad():
338
- generated_ids = self.model.generate(
339
- **model_inputs,
340
- max_new_tokens=800, # Reduced for memory efficiency
341
- do_sample=True,
342
- temperature=0.7,
343
- top_p=0.9,
344
- top_k=50,
345
- repetition_penalty=1.1,
346
- pad_token_id=self.tokenizer.eos_token_id,
347
- use_cache=True, # Enable KV cache for efficiency
348
- attention_mask=model_inputs.get('attention_mask', None)
349
- )
350
-
351
- # Decode response (only new tokens)
352
- generated_ids = [
353
- output_ids[len(input_ids):]
354
- for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
355
- ]
356
-
357
- response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
358
-
359
- # Clean up GPU memory
360
- if torch.cuda.is_available():
361
- torch.cuda.empty_cache()
362
-
363
- return response.strip()
364
-
365
- except torch.cuda.OutOfMemoryError:
366
- logger.error("GPU out of memory during generation")
367
- if torch.cuda.is_available():
368
- torch.cuda.empty_cache()
369
- return "I apologize, but I'm experiencing memory constraints. Please try a shorter message or restart the application."
370
-
371
  except Exception as e:
372
- logger.error(f"Error in model generation: {e}")
373
- if torch.cuda.is_available():
374
- torch.cuda.empty_cache()
375
- return f"I apologize, but I encountered an error while generating a response: {str(e)}"
376
 
377
- @property
378
- def _llm_type(self) -> str:
379
- return "qwen25_small_quantized"
380
- model: Any = None
381
- tokenizer: Any = None
382
-
383
- def __init__(self, model_path: str = "Qwen/Qwen2.5-3B-Instruct"):
384
- super().__init__()
385
- logger.info(f"Loading model: {model_path}")
386
-
387
- # Load tokenizer and model
388
- self.tokenizer = AutoTokenizer.from_pretrained(model_path)
389
  self.model = AutoModelForCausalLM.from_pretrained(
390
  model_path,
391
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
392
  device_map="auto" if torch.cuda.is_available() else None,
393
- trust_remote_code=True
 
394
  )
395
-
396
- logger.info("Model loaded successfully")
397
-
398
  def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
399
- """Generate text response using the local model."""
400
  try:
401
- # Format the conversation
402
  messages = [
403
  {"role": "system", "content": SYSTEM_PROMPT},
404
  {"role": "user", "content": prompt}
405
  ]
406
-
407
- # Apply chat template
408
- text = self.tokenizer.apply_chat_template(
409
- messages,
410
- tokenize=False,
411
- add_generation_prompt=True
412
- )
413
-
414
- # Tokenize
415
- model_inputs = self.tokenizer([text], return_tensors="pt")
416
  if torch.cuda.is_available():
417
- model_inputs = model_inputs.to(self.model.device)
418
-
419
- # Generate
420
  with torch.no_grad():
421
- generated_ids = self.model.generate(
422
- **model_inputs,
423
- max_new_tokens=1000,
424
  do_sample=True,
425
  temperature=0.7,
426
  top_p=0.9,
 
 
427
  pad_token_id=self.tokenizer.eos_token_id
428
  )
429
-
430
- # Decode response
431
- generated_ids = [
432
- output_ids[len(input_ids):]
433
- for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
434
- ]
435
-
436
- response = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
437
- return response.strip()
438
-
439
  except Exception as e:
440
- logger.error(f"Error in model generation: {e}")
441
- return f"I apologize, but I encountered an error while generating a response: {str(e)}"
442
 
443
  @property
444
  def _llm_type(self) -> str:
445
  return "qwen25_small"
446
 
 
447
  # Example of how the AI should use the tool
448
  def example_usage_for_ai():
449
  """
@@ -495,6 +374,38 @@ def get_agent():
495
  agent = create_langchain_agent()
496
  return agent
497
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
498
  # --- UI: MathJax Configuration ---
499
  mathjax_config = '''
500
  <script>
 
235
  class Qwen25SmallLLM(LLM):
236
  model: Any = None
237
  tokenizer: Any = None
238
+
239
  def __init__(self, model_path: str = "Qwen/Qwen2.5-3B-Instruct", use_4bit: bool = True):
240
  super().__init__()
241
+ logger.info(f"Loading model: {model_path} (use_4bit={use_4bit})")
242
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243
  try:
244
  # Load tokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
245
  self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
246
+
247
+ if use_4bit:
248
+ quant_config = BitsAndBytesConfig(
249
+ load_in_4bit=True,
250
+ bnb_4bit_compute_dtype=torch.bfloat16,
251
+ bnb_4bit_use_double_quant=True,
252
+ bnb_4bit_quant_type="nf4"
253
+ )
254
+ logger.info("Using 4-bit quantization with BitsAndBytes")
255
+ else:
256
+ quant_config = BitsAndBytesConfig(
257
+ load_in_8bit=True,
258
+ llm_int8_enable_fp32_cpu_offload=True
259
+ )
260
+ logger.info("Using 8-bit quantization with BitsAndBytes")
261
+
262
+ # Try quantized load
263
  self.model = AutoModelForCausalLM.from_pretrained(
264
  model_path,
265
+ quantization_config=quant_config,
266
+ device_map="auto",
267
+ torch_dtype=torch.bfloat16,
268
  trust_remote_code=True,
269
  low_cpu_mem_usage=True
270
  )
271
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  except Exception as e:
273
+ logger.warning(f"Quantized load failed, falling back: {e}")
274
+ self._load_fallback_model(model_path)
 
 
275
 
276
+ # Ensure pad token
277
+ if self.tokenizer.pad_token is None:
278
+ self.tokenizer.pad_token = self.tokenizer.eos_token
279
+
280
+ def _load_fallback_model(self, model_path: str):
281
+ """Fallback if quantization fails."""
 
 
 
 
 
 
282
  self.model = AutoModelForCausalLM.from_pretrained(
283
  model_path,
284
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
285
  device_map="auto" if torch.cuda.is_available() else None,
286
+ trust_remote_code=True,
287
+ low_cpu_mem_usage=True
288
  )
289
+
 
 
290
  def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
 
291
  try:
 
292
  messages = [
293
  {"role": "system", "content": SYSTEM_PROMPT},
294
  {"role": "user", "content": prompt}
295
  ]
296
+ text = self.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
297
+
298
+ inputs = self.tokenizer([text], return_tensors="pt", padding=True, truncation=True, max_length=2048)
 
 
 
 
 
 
 
299
  if torch.cuda.is_available():
300
+ inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
301
+
 
302
  with torch.no_grad():
303
+ outputs = self.model.generate(
304
+ **inputs,
305
+ max_new_tokens=800,
306
  do_sample=True,
307
  temperature=0.7,
308
  top_p=0.9,
309
+ top_k=50,
310
+ repetition_penalty=1.1,
311
  pad_token_id=self.tokenizer.eos_token_id
312
  )
313
+
314
+ new_tokens = [out[len(inp):] for inp, out in zip(inputs.input_ids, outputs)]
315
+ return self.tokenizer.batch_decode(new_tokens, skip_special_tokens=True)[0].strip()
316
+
 
 
 
 
 
 
317
  except Exception as e:
318
+ logger.error(f"Generation error: {e}")
319
+ return f"[Error generating response: {str(e)}]"
320
 
321
  @property
322
  def _llm_type(self) -> str:
323
  return "qwen25_small"
324
 
325
+
326
  # Example of how the AI should use the tool
327
  def example_usage_for_ai():
328
  """
 
374
  agent = create_langchain_agent()
375
  return agent
376
 
377
def create_langchain_agent():
    """Assemble the conversational ReAct agent used by the app.

    Creates the local Qwen LLM, a windowed chat memory and the educational
    graph tool, then hands all three to ``initialize_agent``.

    Returns:
        The object returned by ``initialize_agent``.

    Raises:
        Exception: Any error raised while building the parts is logged and
            re-raised unchanged.
    """
    try:
        # Model first: it is the step most likely to fail.
        qwen_llm = Qwen25SmallLLM(model_path="Qwen/Qwen2.5-3B-Instruct")

        # Window memory configured with k=5.
        chat_memory = ConversationBufferWindowMemory(
            memory_key="chat_history",
            return_messages=True,
            k=5,
        )

        # Tools exposed to the agent (currently the graph tool only).
        agent_tools = [create_educational_graph_tool()]

        return initialize_agent(
            tools=agent_tools,
            llm=qwen_llm,
            agent=AgentType.CONVERSATIONAL_REACT_DESCRIPTION,
            memory=chat_memory,
            verbose=True,
            handle_parsing_errors=True,
        )
    except Exception as e:
        logger.error(f"Error creating LangChain agent: {e}")
        raise
408
+
409
  # --- UI: MathJax Configuration ---
410
  mathjax_config = '''
411
  <script>