Treat 1.75 tokens as one word, and add a note telling users to check the discussion
app.py CHANGED
@@ -76,30 +76,38 @@ def generate_text(prompt, tone, max_length, temperature=0.7, top_p=0.9, repetiti
     # This turns our input text (with the tone instruction) into a format (tensors) that the model can process using the tokenizer.
     input_token_length = input_ids.shape[1] # Get the number of tokens in the input
     # Store the length of the input
-
+
+    # --- Step 1: Estimate the tokens needed (increase the buffer) ---
+    # Estimate slightly more tokens than words (e.g., a 1.5x or 2x buffer)
+    # Use a factor of 1.75 for a larger buffer to increase the chance of reaching the requested word count
+    estimated_max_tokens = int(max_length * 1.75)
+    # Add a minimum token count to avoid tiny generation requests
+    estimated_max_tokens = max(estimated_max_tokens, 30) # Ensure we generate at least some tokens
+    # --- Step 2: Generate with the higher token limit ---
     outputs = model.generate(
         inputs["input_ids"],
         # max_length=max_length + len(input_text.split()),
         # This sets how long the generated text can be. We add the number of words in our input text (len(input_text.split())) to the max_length the user picked, so the model knows how many total words to create.
         # CHANGE: Use max_new_tokens for clarity instead of calculating the total length
-        max_new_tokens=
+        max_new_tokens = estimated_max_tokens, # Use the higher estimate, i.e. 1.75 tokens per requested word
         # Generate THIS many NEW tokens
-        temperature=temperature,
+        temperature = temperature,
         # This controls how creative the model gets. A lower temperature (e.g., 0.7) keeps things more predictable, while a higher one makes it wilder and more random. Think of it like adjusting the spice level!
         top_p=top_p,
         # This is like a filter for word choices. It picks from the top percentage of likely words (e.g., 0.9 means 90% of the best options), making the output diverse but not too crazy.
-        repetition_penalty=repetition_penalty,
+        repetition_penalty = repetition_penalty,
         # This stops the model from repeating the same words too much. A higher value (e.g., 1.5) pushes it to try new words, like telling it to mix up its vocabulary!
-        num_return_sequences=1,
+        num_return_sequences = 1,
         # This tells the model to give us just one version of the text. If we wanted more options, we could raise this number.
-        do_sample=True,
-        pad_token_id=tokenizer.eos_token_id # Good practice for generation
+        do_sample = True,
+        pad_token_id = tokenizer.eos_token_id # Good practice for generation
     )
-    # --- Decode ONLY the generated part ---
+    # --- Step 3: Decode ONLY the generated part ---
     # Slice the output tensor to get only the tokens AFTER the input tokens
     # This tells the model to generate text: it uses the input IDs, sets a max length, and adjusts creativity with temperature, top_p, and repetition_penalty.
     generated_token_ids = outputs[0, input_token_length:]
-    generated_text = tokenizer.decode(generated_token_ids, skip_special_tokens=True)
+    generated_text = tokenizer.decode(generated_token_ids, skip_special_tokens=True).strip()
+
     return generated_text # Return only the newly generated text
     # This turns the model's output back into readable form, skipping any special tokens we don't need.
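For anyone reading this hunk without the rest of app.py, here is a minimal standalone sketch of the pattern the commit lands on: budget roughly 1.75 tokens per requested word, generate, then decode only the tokens after the prompt. The model name `gpt2` is a placeholder chosen only because it is small enough for a quick local test; the Space itself loads a Gemma model. Everything else mirrors the diff above.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder model for a quick local test; the Space loads Gemma instead.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

def generate_about_n_words(prompt: str, n_words: int) -> str:
    inputs = tokenizer(prompt, return_tensors="pt")
    input_token_length = inputs["input_ids"].shape[1]  # Number of prompt tokens
    # Budget ~1.75 tokens per requested word, with a floor of 30 tokens.
    max_new = max(int(n_words * 1.75), 30)
    outputs = model.generate(
        inputs["input_ids"],
        max_new_tokens=max_new,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
    )
    # Slice off the prompt so only the continuation is decoded.
    new_token_ids = outputs[0, input_token_length:]
    return tokenizer.decode(new_token_ids, skip_special_tokens=True).strip()

print(generate_about_n_words("The cat sat on", 50))
```

Note that 1.75 is a deliberate over-estimate: rule-of-thumb figures for English text under BPE-style tokenizers are often closer to 1.3 tokens per word, so the budget usually leaves headroom to reach the slider's word count.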
@@ -325,6 +333,8 @@ st.markdown("""
 # This part generally explains how things work
 st.markdown("""
 <div class="instructions">
+<b><a href="https://huggingface.co/spaces/Kakaarot/Gemma-HuggingFace_TextCompletion_Demo/discussions/1">Please check the discussion</a></b>, where I explain why your first response will take a little more time.
+Thanks for understanding. Now enjoy! 😁 <br><br>
 Enter a prompt below to generate text using the Gemma model from DeepMind. Customize the tone and length to see different outputs!<br><br>
 <b>Example:</b> Prompt: "The cat sat on" | Tone: "Funny" | Length: 50 → "The cat sat on my homework and laughed as I cried over my grades."
 </div>
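One thing this hunk does not show: for the raw `<div>`, `<b>`, and `<a>` tags above to render, the enclosing `st.markdown` call must pass `unsafe_allow_html=True`, since Streamlit otherwise escapes HTML. A sketch of the presumed shape of the full call (its closing arguments sit outside the hunk):

```python
import streamlit as st

# Presumed shape of the call; the closing arguments are outside this hunk.
st.markdown(
    """
    <div class="instructions">
    <b><a href="https://huggingface.co/spaces/Kakaarot/Gemma-HuggingFace_TextCompletion_Demo/discussions/1">Please check the discussion</a></b>,
    where I explain why your first response will take a little more time.<br><br>
    Enter a prompt below to generate text using the Gemma model from DeepMind.
    </div>
    """,
    unsafe_allow_html=True,  # Required, or Streamlit escapes the HTML tags
)
```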
@@ -387,7 +397,7 @@ with st.form(key="input_form"):
     with col1:
         tone = st.selectbox("Tone", ["Funny", "Serious", "Poetic"], index=["Funny", "Serious", "Poetic"].index(st.session_state.get("tone", "Funny")))
     with col2:
-        max_length = st.slider("Word count", 20, 100, 50)
+        max_length = st.slider("Word count", 20, 100, 50, help="Tries to generate text close to this word count. Output might be shorter if the model finishes early, or slightly different due to word splitting. I treat 1.75 tokens as one word.")
     # This adds a slider for users to set how many words they want in the output, ranging from 20 to 100 with a default of 50.
     # And similarly, every other slider here works the same way.
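Since the 1.75x token buffer deliberately overshoots, a natural follow-up (not part of this commit; `generated_text` and `max_length` are the app's own names) would be to trim the decoded output back to the slider's word count:

```python
def trim_to_word_count(text: str, max_words: int) -> str:
    # Keep at most max_words whitespace-separated words.
    return " ".join(text.split()[:max_words])

print(trim_to_word_count("one two three four five", 3))  # -> "one two three"
# In the app this would be: trim_to_word_count(generated_text, max_length)
```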
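Finally, to sanity-check the 1.75 factor against the tokenizer actually in use, the tokens-per-word ratio can be measured directly (a sketch; `gpt2` is again a stand-in for whichever model the Space loads):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # Stand-in for the Space's tokenizer

sample = "Enter a prompt below to generate text using the Gemma model from DeepMind."
n_tokens = len(tokenizer(sample)["input_ids"])
n_words = len(sample.split())
print(f"{n_tokens / n_words:.2f} tokens per word")  # Compare against the 1.75 buffer
```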