Improve Chain of Thinking support: increase min_new_tokens to 500 for CoT requests, improve JSON bracket tracking for nested objects
gradio_app.py CHANGED (+87 -7)
@@ -84,12 +84,39 @@ def generate_response(prompt, temperature=0.8):
 
     """
 
-    #
+    # Determine context window and allocate space for input vs. generation
+    try:
+        max_ctx = getattr(model_manager.model.config, "max_position_embeddings", 8192)
+    except Exception:
+        max_ctx = 8192
+
+    # Reserve room for generation; cap to half the context as a safety default
+    safe_max_new = min(8192, max(max_ctx // 2, 256))
+    # If caller requested temperature, keep; we control new tokens internally
+    gen_max_new_tokens = min(safe_max_new, 8192)
+
+    # Allowed input tokens is context minus generation budget and a small buffer
+    allowed_input_tokens = max(512, max_ctx - gen_max_new_tokens - 64)
+
+    # Detect if this is a Chain of Thinking request and adjust min_new_tokens
+    is_cot_request = ("chain-of-thinking" in prompt.lower() or
+                      "chain of thinking" in prompt.lower() or
+                      "Return exactly this JSON array" in prompt or
+                      ("verbatim" in prompt.lower() and "json array" in prompt.lower()))
+
+    # Set minimum tokens based on request type
+    if is_cot_request:
+        min_tokens = 500  # Higher minimum for CoT to ensure complete responses
+        logger.info("Detected Chain of Thinking request - using min_new_tokens=500")
+    else:
+        min_tokens = 200  # Standard minimum
+
+    # Tokenize the input with safe truncation
     inputs = model_manager.tokenizer(
-        formatted_prompt,
-        return_tensors="pt",
-        truncation=True,
-        max_length=
+        formatted_prompt,
+        return_tensors="pt",
+        truncation=True,
+        max_length=allowed_input_tokens
     )
 
     # Move inputs to the same device as the model
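The first hunk derives both token budgets from the model's context window before tokenizing, so long prompts are truncated instead of overflowing. As a quick illustration of the arithmetic, here is a minimal standalone sketch (the split_budget helper is hypothetical and not part of the commit; the 8192 fallback and the clamps mirror the hunk above):

# Hypothetical helper illustrating the budget split from the hunk above.
def split_budget(max_ctx: int = 8192):
    # Generation budget: half the context, clamped to the range [256, 8192]
    gen_max_new_tokens = min(8192, max(max_ctx // 2, 256))
    # Input budget: whatever remains minus a 64-token buffer, never below 512
    allowed_input_tokens = max(512, max_ctx - gen_max_new_tokens - 64)
    return gen_max_new_tokens, allowed_input_tokens

print(split_budget(8192))    # (4096, 4032)
print(split_budget(131072))  # (8192, 122816)
print(split_budget(1024))    # (512, 512)

Note that on very small contexts (the last case) the 512-token input floor consumes the 64-token buffer, so the clamps are really tuned for contexts of a few thousand tokens and up.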
@@ -101,23 +128,76 @@ def generate_response(prompt, temperature=0.8):
     with torch.no_grad():
         outputs = model_manager.model.generate(
             **inputs,
-            max_new_tokens=
+            max_new_tokens=gen_max_new_tokens,
             temperature=temperature,
             top_p=0.95,
             do_sample=True,
             num_beams=1,
             pad_token_id=model_manager.tokenizer.eos_token_id,
+            # Keep EOS but rely primarily on post-decode stop to capture full JSON
             eos_token_id=model_manager.tokenizer.eos_token_id,
             early_stopping=False,
             repetition_penalty=1.05,
             no_repeat_ngram_size=0,
             length_penalty=1.0,
-
+            # Dynamic minimum based on request type
+            min_new_tokens=min_tokens
         )
 
     # Decode the response
     generated_text = model_manager.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
+    # Post-decode guard: if a top-level JSON array closes, trim to the first full array.
+    # This helps prevent trailing prose like 'assistant' or 'Message'.
+    try:
+        # Track both bracket and brace depth to find first complete JSON structure
+        bracket_depth = 0  # [ ]
+        brace_depth = 0    # { }
+        in_string = False
+        escape_next = False
+        start_idx = None
+        end_idx = None
+
+        for i, ch in enumerate(generated_text):
+            # Handle string escaping
+            if escape_next:
+                escape_next = False
+                continue
+
+            if ch == '\\':
+                escape_next = True
+                continue
+
+            # Track if we're inside a string
+            if ch == '"' and not escape_next:
+                in_string = not in_string
+                continue
+
+            # Only count brackets/braces outside of strings
+            if not in_string:
+                if ch == '[':
+                    if bracket_depth == 0 and brace_depth == 0 and start_idx is None:
+                        start_idx = i
+                    bracket_depth += 1
+                elif ch == ']':
+                    bracket_depth = max(0, bracket_depth - 1)
+                    if bracket_depth == 0 and brace_depth == 0 and start_idx is not None:
+                        end_idx = i
+                        break
+                elif ch == '{':
+                    brace_depth += 1
+                elif ch == '}':
+                    brace_depth = max(0, brace_depth - 1)
+
+        if start_idx is not None and end_idx is not None and end_idx > start_idx:
+            # Extract just the complete JSON array
+            json_text = generated_text[start_idx:end_idx+1]
+            logger.info(f"Extracted complete JSON array of length {len(json_text)}")
+            generated_text = json_text
+    except Exception as e:
+        logger.warning(f"Error in JSON extraction: {e}")
+        pass
+
     # Extract just the assistant's response
     if "<|start_header_id|>assistant<|end_header_id|>" in generated_text:
         response = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
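The post-decode guard in this hunk is the "improve JSON bracket tracking for nested objects" part of the commit message: brace depth is tracked alongside bracket depth, so a closing } inside the array no longer looks like the end of the structure, and the string/escape state keeps brackets inside JSON strings from being counted at all. Here is a standalone sketch of the same scan, wrapped in a hypothetical extract_first_json_array helper (the name is illustrative) so it can be tested outside the app:

# Hypothetical wrapper around the commit's scan logic, for standalone testing.
def extract_first_json_array(text):
    bracket_depth = 0   # depth of [ ]
    brace_depth = 0     # depth of { }
    in_string = False
    escape_next = False
    start_idx = None
    for i, ch in enumerate(text):
        if escape_next:            # skip the character after a backslash
            escape_next = False
            continue
        if ch == '\\':
            escape_next = True
            continue
        if ch == '"':              # toggle string state on unescaped quotes
            in_string = not in_string
            continue
        if in_string:              # ignore brackets inside strings
            continue
        if ch == '[':
            if bracket_depth == 0 and brace_depth == 0 and start_idx is None:
                start_idx = i      # first top-level array opens here
            bracket_depth += 1
        elif ch == ']':
            bracket_depth = max(0, bracket_depth - 1)
            if bracket_depth == 0 and brace_depth == 0 and start_idx is not None:
                return text[start_idx:i + 1]   # array closed at top level
        elif ch == '{':
            brace_depth += 1
        elif ch == '}':
            brace_depth = max(0, brace_depth - 1)
    return None                    # no complete top-level array found

raw = 'Sure! [{"step": 1, "detail": {"nested": [1, 2]}}] assistant'
print(extract_first_json_array(raw))
# [{"step": 1, "detail": {"nested": [1, 2]}}]

When the output never closes the array (for example, a truncated generation), the helper returns None, which corresponds to the commit's behavior of leaving generated_text untouched.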