Spaces:

jatingocodeo
/

SmolLM2

Runtime error

App Files Files Community

jatingocodeo committed on Jan 27, 2025

Commit

6fa5efe

verified ·

1 Parent(s): eccb044

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -13

app.py CHANGED Viewed

@@ -169,17 +169,31 @@ class SmolLM2ForCausalLM(PreTrainedModel):
         hidden_states = self.embed_tokens(input_ids)
-        # Create causal attention mask if none provided
-        if attention_mask is None:
-            attention_mask = torch.triu(
-                torch.ones((input_ids.size(1), input_ids.size(1)), dtype=torch.bool, device=input_ids.device),
-                diagonal=1
-            )
-            attention_mask = attention_mask.unsqueeze(0).unsqueeze(0)
-            attention_mask = attention_mask * -1e4
         for layer in self.layers:
-            hidden_states = layer(hidden_states, attention_mask)
         hidden_states = self.norm(hidden_states)
         logits = self.lm_head(hidden_states)
@@ -200,11 +214,13 @@ class SmolLM2ForCausalLM(PreTrainedModel):
             )
         return (loss, logits) if loss is not None else logits
-    def prepare_inputs_for_generation(self, input_ids, **kwargs):
-        return {
             "input_ids": input_ids,
-            "attention_mask": kwargs.get("attention_mask", None)
         }
 # Register the model architecture
 from transformers import AutoConfig
@@ -272,17 +288,22 @@ def generate_text(prompt, max_length=100, temperature=0.7, top_k=50):
         input_ids = TOKENIZER.encode(prompt, return_tensors="pt", truncation=True, max_length=2048)
         input_ids = input_ids.to(MODEL.device)
         # Generate
         with torch.no_grad():
             output_ids = MODEL.generate(
                 input_ids,
                 max_length=min(max_length + len(input_ids[0]), 2048),
                 temperature=temperature,
                 top_k=top_k,
                 do_sample=True,
                 pad_token_id=TOKENIZER.pad_token_id,
                 eos_token_id=TOKENIZER.eos_token_id,
-                num_return_sequences=1
             )
         # Decode and return
@@ -290,6 +311,9 @@ def generate_text(prompt, max_length=100, temperature=0.7, top_k=50):
         return generated_text.strip()
     except Exception as e:
         return f"Error generating text: {str(e)}"
 # Initialize on startup

         hidden_states = self.embed_tokens(input_ids)
+        # Create causal attention mask
+        batch_size, seq_length = input_ids.size()
+        device = input_ids.device
+        # Create causal mask
+        causal_mask = torch.triu(
+            torch.ones((seq_length, seq_length), dtype=torch.bool, device=device),
+            diagonal=1
+        )
+        causal_mask = causal_mask.unsqueeze(0).unsqueeze(0)  # [1, 1, seq_len, seq_len]
+        causal_mask = causal_mask.expand(batch_size, 1, seq_length, seq_length)
+        causal_mask = causal_mask.to(dtype=torch.float32) * -1e4
+        # Combine with attention mask if provided
+        if attention_mask is not None:
+            # Convert attention mask to float and unsqueeze
+            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)  # [batch, 1, 1, seq_len]
+            attention_mask = attention_mask.expand(batch_size, 1, seq_length, seq_length)
+            attention_mask = (1.0 - attention_mask) * -1e4
+            # Combine masks
+            causal_mask = causal_mask + attention_mask
+        # Process through layers
         for layer in self.layers:
+            hidden_states = layer(hidden_states, causal_mask)
         hidden_states = self.norm(hidden_states)
         logits = self.lm_head(hidden_states)
             )
         return (loss, logits) if loss is not None else logits
+    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **kwargs):
+        # Only return what we need
+        inputs = {
             "input_ids": input_ids,
+            "attention_mask": attention_mask
         }
+        return inputs
 # Register the model architecture
 from transformers import AutoConfig
         input_ids = TOKENIZER.encode(prompt, return_tensors="pt", truncation=True, max_length=2048)
         input_ids = input_ids.to(MODEL.device)
+        # Create attention mask
+        attention_mask = torch.ones_like(input_ids)
         # Generate
         with torch.no_grad():
             output_ids = MODEL.generate(
                 input_ids,
+                attention_mask=attention_mask,
                 max_length=min(max_length + len(input_ids[0]), 2048),
                 temperature=temperature,
                 top_k=top_k,
                 do_sample=True,
                 pad_token_id=TOKENIZER.pad_token_id,
                 eos_token_id=TOKENIZER.eos_token_id,
+                num_return_sequences=1,
+                use_cache=True
             )
         # Decode and return
         return generated_text.strip()
     except Exception as e:
+        print(f"Generation error details: {str(e)}")
+        import traceback
+        traceback.print_exc()
         return f"Error generating text: {str(e)}"
 # Initialize on startup