Keeby-smilyai committed
Commit c8aa814 · verified · 1 Parent(s): 8695198

Update app.py

Files changed (1): app.py (+16 -11)
app.py CHANGED
@@ -462,14 +462,14 @@ class ModelWrapper:
         print(f"✅ Model loaded: {self.d_model}d × {self.n_layers}L × {self.n_heads}H")
 
     def generate_stream(self, prompt: str, max_new_tokens: int = 200,
-                        temperature: float = 0.8, top_k: int = 50, top_p: float = 0.9):
+                        temperature: float = 0.8, top_k: int = 50, top_p: float = 0.9):
         """Generator that yields tokens one at a time for streaming"""
-        # Format prompt correctly (NO newline between User: and Sam:)
-        if not prompt.startswith("User:"):
-            prompt = f"User: {prompt} Sam:"
+        # Format prompt in ChatML format
+        if not prompt.startswith("<|im_start|>"):
+            prompt = f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
         else:
-            if " Sam:" not in prompt:
-                prompt = prompt + " Sam:"
+            if "<|im_start|>assistant" not in prompt:
+                prompt = prompt + "<|im_start|>assistant\n"
 
         # Tokenize
         encoding = self.tokenizer.encode(prompt)
@@ -509,8 +509,11 @@ class ModelWrapper:
            # Decode the new token
            token_id = int(next_token[0, 0])
 
-            # Stop on EOS
-            if token_id == self.tokenizer.token_to_id("<|endoftext|>"):
+            # Stop on EOS or end tokens
+            if token_id in [
+                self.tokenizer.token_to_id("<|endoftext|>"),
+                self.tokenizer.token_to_id("<|im_end|>")
+            ]:
                break
 
            # Decode and yield the token
@@ -518,6 +521,7 @@ class ModelWrapper:
            response_text += token_text
            yield response_text
 
+
     def generate(self, prompt: str, max_new_tokens: int = 200,
                  temperature: float = 0.8, top_k: int = 50, top_p: float = 0.9):
         """Non-streaming generation (returns full response)"""
@@ -544,14 +548,15 @@ print(f"✅ Model downloaded to: {model_path}")
 # Load model
 model = ModelWrapper(model_path)
 
+
 def chat_fn(message, history, temperature, top_k, top_p, max_tokens):
-    # Build conversation context with proper template
+    # Build conversation context in ChatML format
     conversation = ""
     for user_msg, bot_msg in history:
-        conversation += f"User: {user_msg} Sam: {bot_msg} "
+        conversation += f"<|im_start|>user\n{user_msg}<|im_end|>\n<|im_start|>assistant\n{bot_msg}<|im_end|>\n"
 
     # Add current message
-    conversation += f"User: {message} Sam:"
+    conversation += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
 
     # Stream response token by token
     partial_response = ""
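For reference, a minimal sketch of the ChatML assembly that chat_fn and generate_stream now perform, assuming history arrives as a list of (user_msg, bot_msg) tuples the way Gradio passes it here; build_chatml is an illustrative name, not a function in app.py:

def build_chatml(history, message):
    # Replay prior turns, each wrapped in <|im_start|>role ... <|im_end|> markers
    conversation = ""
    for user_msg, bot_msg in history:
        conversation += (
            f"<|im_start|>user\n{user_msg}<|im_end|>\n"
            f"<|im_start|>assistant\n{bot_msg}<|im_end|>\n"
        )
    # Append the new user turn and leave an assistant turn open for the model to complete
    conversation += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
    return conversation

print(build_chatml([("Hi", "Hello!")], "How are you?"))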
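The new stop check calls token_to_id for both end markers on every generated token. A hedged sketch of resolving the stop ids once up front, assuming self.tokenizer is a Hugging Face tokenizers.Tokenizer, whose token_to_id returns None for tokens missing from the vocabulary; stop_token_ids is an illustrative helper, not part of app.py:

from tokenizers import Tokenizer

def stop_token_ids(tokenizer: Tokenizer) -> set:
    # Resolve the stop tokens once; drop any that are not in the vocabulary
    candidates = ("<|endoftext|>", "<|im_end|>")
    return {tid for tid in map(tokenizer.token_to_id, candidates) if tid is not None}

# Inside the generation loop the check then becomes:
#     if token_id in stop_ids:
#         break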