david167 committed on
Commit
14f445d
·
1 Parent(s): b394386

MAXIMUM TOKEN SETTINGS: Use 131k context, 16k max_new_tokens, 2k min_tokens for CoT - eliminate all truncation

Browse files
Files changed (1) hide show
  1. gradio_app.py +34 -49
gradio_app.py CHANGED
@@ -84,34 +84,33 @@ def generate_response(prompt, temperature=0.8):
84
 
85
  """
86
 
87
- # Determine context window and allocate space for input vs. generation
88
  try:
89
- max_ctx = getattr(model_manager.model.config, "max_position_embeddings", 8192)
90
  except Exception:
91
- max_ctx = 8192
92
-
93
- # Reserve room for generation; cap to half the context as a safety default
94
- safe_max_new = min(8192, max(max_ctx // 2, 256))
95
- # If caller requested temperature, keep; we control new tokens internally
96
- gen_max_new_tokens = min(safe_max_new, 8192)
97
-
98
- # Allowed input tokens is context minus generation budget and a small buffer
99
- allowed_input_tokens = max(512, max_ctx - gen_max_new_tokens - 64)
100
 
101
- # Detect if this is a Chain of Thinking request and adjust min_new_tokens
 
 
102
  is_cot_request = ("chain-of-thinking" in prompt.lower() or
103
  "chain of thinking" in prompt.lower() or
104
  "Return exactly this JSON array" in prompt or
105
  ("verbatim" in prompt.lower() and "json array" in prompt.lower()))
106
 
107
- # Set minimum tokens based on request type
108
  if is_cot_request:
109
- min_tokens = 1500 # Even higher minimum for CoT to ensure complete responses
110
- # Also reduce max_new_tokens to ensure we don't hit context limits
111
- gen_max_new_tokens = min(gen_max_new_tokens, 2048) # Cap at 2048 for CoT
112
- logger.info(f"Detected Chain of Thinking request - using min_new_tokens={min_tokens}, max_new_tokens={gen_max_new_tokens}")
 
 
113
  else:
114
- min_tokens = 200 # Standard minimum
 
 
 
115
 
116
  # Tokenize the input with safe truncation
117
  inputs = model_manager.tokenizer(
@@ -126,39 +125,25 @@ def generate_response(prompt, temperature=0.8):
126
  model_device = next(model_manager.model.parameters()).device
127
  inputs = {k: v.to(model_device) for k, v in inputs.items()}
128
 
129
- # Generate response
130
  with torch.no_grad():
131
- # For CoT requests, be more aggressive about preventing early stopping
132
- if is_cot_request:
133
- # Suppress EOS token for CoT to prevent early termination
134
- eos_token_id = None
135
- suppress_tokens = [model_manager.tokenizer.eos_token_id] if model_manager.tokenizer.eos_token_id is not None else None
136
- else:
137
- eos_token_id = model_manager.tokenizer.eos_token_id
138
- suppress_tokens = None
139
-
140
- generation_kwargs = {
141
  **inputs,
142
- "max_new_tokens": gen_max_new_tokens,
143
- "temperature": temperature,
144
- "top_p": 0.95,
145
- "do_sample": True,
146
- "num_beams": 1,
147
- "pad_token_id": model_manager.tokenizer.eos_token_id,
148
- "early_stopping": False,
149
- "repetition_penalty": 1.05,
150
- "no_repeat_ngram_size": 0,
151
- "length_penalty": 1.0,
152
- "min_new_tokens": min_tokens
153
- }
154
-
155
- # Add EOS suppression for CoT
156
- if eos_token_id is not None:
157
- generation_kwargs["eos_token_id"] = eos_token_id
158
- if suppress_tokens is not None:
159
- generation_kwargs["suppress_tokens"] = suppress_tokens
160
-
161
- outputs = model_manager.model.generate(**generation_kwargs)
162
 
163
  # Decode the response
164
  generated_text = model_manager.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
84
 
85
  """
86
 
87
+ # Determine context window and USE ABSOLUTE MAXIMUM
88
  try:
89
+ max_ctx = getattr(model_manager.model.config, "max_position_embeddings", 131072) # Llama 3.1 supports up to 131k
90
  except Exception:
91
+ max_ctx = 131072 # Use maximum possible
 
 
 
 
 
 
 
 
92
 
93
+ logger.info(f"Model max context: {max_ctx} tokens")
94
+
95
+ # Detect if this is a Chain of Thinking request
96
  is_cot_request = ("chain-of-thinking" in prompt.lower() or
97
  "chain of thinking" in prompt.lower() or
98
  "Return exactly this JSON array" in prompt or
99
  ("verbatim" in prompt.lower() and "json array" in prompt.lower()))
100
 
101
+ # MAXIMIZE GENERATION TOKENS - use most of context for generation
102
  if is_cot_request:
103
+ # For CoT, use MAXIMUM possible generation tokens
104
+ gen_max_new_tokens = 16384 # Very high limit for complete responses
105
+ min_tokens = 2000 # High minimum to force complete generation
106
+ # Allow most of context for input
107
+ allowed_input_tokens = max_ctx - gen_max_new_tokens - 100 # Small safety buffer
108
+ logger.info(f"CoT REQUEST - MAXIMIZED: min_tokens={min_tokens}, max_new_tokens={gen_max_new_tokens}, input_limit={allowed_input_tokens}")
109
  else:
110
+ # Standard requests
111
+ gen_max_new_tokens = 8192
112
+ min_tokens = 200
113
+ allowed_input_tokens = max_ctx - gen_max_new_tokens - 100
114
 
115
  # Tokenize the input with safe truncation
116
  inputs = model_manager.tokenizer(
 
125
  model_device = next(model_manager.model.parameters()).device
126
  inputs = {k: v.to(model_device) for k, v in inputs.items()}
127
 
128
+ # Generate response with MAXIMUM settings
129
  with torch.no_grad():
130
+ outputs = model_manager.model.generate(
 
 
 
 
 
 
 
 
 
131
  **inputs,
132
+ max_new_tokens=gen_max_new_tokens,
133
+ min_new_tokens=min_tokens,
134
+ temperature=temperature,
135
+ top_p=0.95,
136
+ do_sample=True,
137
+ num_beams=1,
138
+ pad_token_id=model_manager.tokenizer.eos_token_id,
139
+ eos_token_id=model_manager.tokenizer.eos_token_id,
140
+ early_stopping=False, # Never stop early
141
+ repetition_penalty=1.05,
142
+ no_repeat_ngram_size=0,
143
+ length_penalty=1.0,
144
+ # Force generation to continue
145
+ use_cache=True
146
+ )
 
 
 
 
 
147
 
148
  # Decode the response
149
  generated_text = model_manager.tokenizer.decode(outputs[0], skip_special_tokens=True)