Spaces:

david167
/

question-generation-api

Sleeping

App Files Files Community

david167 commited on Aug 13, 2025

Commit

7822d6f

1 Parent(s): d82dc35

COMPLETE API REBUILD: ZERO TRUNCATION PRINCIPLE - Intelligent extraction, generous tokens, never cut content

Browse files

Files changed (2) hide show

gradio_app.py +83 -87
test_api.py +0 -115

gradio_app.py CHANGED Viewed

@@ -68,7 +68,7 @@ class ModelManager:
             self.model_loaded = False
 def generate_response(prompt, temperature=0.8):
-    """BULLETPROOF GENERATION - NO MORE ERRORS!"""
     global model_manager
     if not model_manager or not model_manager.model_loaded:
@@ -82,9 +82,11 @@ def generate_response(prompt, temperature=0.8):
             "verbatim"
         ])
         # Simple system message
         if is_cot:
-            system = "Generate the requested JSON training data."
         else:
             system = "You are a helpful AI assistant."
@@ -101,12 +103,17 @@ def generate_response(prompt, temperature=0.8):
 """
-        # Token limits
-        max_new = 2048 if is_cot else 1024
-        min_new = 300 if is_cot else 50
-        max_input = 6144  # Safe limit
-        logger.info(f"Generating: {min_new}-{max_new} tokens, CoT={is_cot}")
         # Tokenize
         inputs = model_manager.tokenizer(
@@ -120,7 +127,9 @@ def generate_response(prompt, temperature=0.8):
         if model_manager.device == "cuda:0":
             inputs = {k: v.to(next(model_manager.model.parameters()).device) for k, v in inputs.items()}
-        # Generate
         with torch.no_grad():
             outputs = model_manager.model.generate(
                 **inputs,
@@ -131,116 +140,103 @@ def generate_response(prompt, temperature=0.8):
                 do_sample=True,
                 pad_token_id=model_manager.tokenizer.eos_token_id,
                 early_stopping=False,
-                repetition_penalty=1.1
             )
-        # Decode
-        full = model_manager.tokenizer.decode(outputs[0], skip_special_tokens=True)
-        # ULTRA CONSERVATIVE EXTRACTION - STOP ALL TRUNCATION!
-        logger.info(f"Full generated text length: {len(full)} chars")
-        # For debugging - log the full text boundaries
-        logger.info(f"Full text starts: {full[:200]}...")
-        logger.info(f"Full text ends: ...{full[-200:]}")
-        # Find the JSON array in the response - look for the actual content
-        # The model should generate the JSON array directly
-        if is_cot and '[' in full and ']' in full:
-            # Find the JSON array boundaries
-            start_idx = full.find('[')
-            end_idx = full.rfind(']')
-            if start_idx != -1 and end_idx != -1 and end_idx > start_idx:
-                # Extract the complete JSON array
-                json_content = full[start_idx:end_idx+1]
-                logger.info(f"Found JSON array: {len(json_content)} chars")
-                logger.info(f"JSON starts: {json_content[:100]}...")
-                logger.info(f"JSON ends: ...{json_content[-100:]}")
-                # Validate it looks like proper JSON
-                if '"user"' in json_content and '"assistant"' in json_content:
-                    response = json_content
-                    logger.info("✅ Using extracted JSON array")
                 else:
-                    logger.warning("❌ JSON validation failed, using full response")
-                    response = full.strip()
             else:
-                logger.warning("❌ Could not find JSON boundaries, using full response")
-                response = full.strip()
         else:
-            # For non-CoT or if no JSON found, try to extract assistant response
-            if "<|start_header_id|>assistant<|end_header_id|>" in full:
-                parts = full.split("<|start_header_id|>assistant<|end_header_id|>")
-                if len(parts) > 1:
-                    response = parts[-1].strip()
-                    logger.info(f"Extracted after assistant header: {len(response)} chars")
                 else:
-                    response = full.strip()
-            else:
-                # ABSOLUTELY NO CUTTING - use everything
-                response = full.strip()
-                logger.info("Using complete full response - no cutting")
-        logger.info(f"FINAL response length: {len(response)} chars")
-        logger.info(f"FINAL starts with: {response[:150]}...")
-        logger.info(f"FINAL ends with: ...{response[-150:]}")
-        logger.info(f"Response generated: {len(response)} chars")
-        return response.strip()
     except Exception as e:
-        logger.error(f"Generation error: {e}")
         return f"Error: {e}"
 # Initialize model ONCE
 model_manager = ModelManager()
-def api_respond(message, history, temperature, json_mode=None, template=None):
-    """API function - EXACTLY what the client expects"""
     try:
         response = generate_response(message, temperature)
-        # Return EXACT format the client expects
-        return [[
-            {"role": "user", "metadata": None, "content": message, "options": None},
-            {"role": "assistant", "metadata": None, "content": response, "options": None}
-        ], ""]
-    except Exception as e:
-        logger.error(f"API Error: {e}")
-        return [[
-            {"role": "user", "metadata": None, "content": message, "options": None},
-            {"role": "assistant", "metadata": None, "content": f"Error: {e}", "options": None}
-        ], ""]
-# ABSOLUTE SIMPLEST INTERFACE - NO JSON, NO STATE, NOTHING FANCY
-def simple_api(message, history_str, temperature, json_mode, template):
-    """Ultra-simple wrapper - returns JUST the content as a single string"""
-    try:
-        # Generate the response directly
-        response_content = generate_response(message, temperature)
-        # Return ONLY the content - single string, no tuple
-        logger.info(f"Returning direct content: {len(response_content)} chars")
-        return response_content
     except Exception as e:
-        logger.error(f"Simple API Error: {e}")
         return f"Error: {e}"
 demo = gr.Interface(
-    fn=simple_api,
     inputs=[
-        gr.Textbox(label="Message", lines=5, placeholder="Enter your prompt here..."),
-        gr.Textbox(label="History", value="[]", visible=False),  # String, not JSON
         gr.Slider(minimum=0.1, maximum=1.0, value=0.8, step=0.1, label="Temperature"),
         gr.Textbox(label="JSON Mode", value="", visible=False),
         gr.Textbox(label="Template", value="", visible=False)
     ],
-    outputs=gr.Textbox(label="API Response", lines=10),  # Single output
-    title="🚀 Question Generation API - DIRECT CONTENT",
-    description="Returns content directly - no wrappers, no complications. Perfect for client integration.",
     api_name="respond"
 )

             self.model_loaded = False
 def generate_response(prompt, temperature=0.8):
+    """ZERO TRUNCATION GENERATION - Never cut anything!"""
     global model_manager
     if not model_manager or not model_manager.model_loaded:
             "verbatim"
         ])
+        logger.info(f"🎯 Request type: {'CoT' if is_cot else 'Standard'}")
         # Simple system message
         if is_cot:
+            system = "You are an expert at generating JSON training data exactly as requested."
         else:
             system = "You are a helpful AI assistant."
 """
+        # Generous token limits for complete responses
+        if is_cot:
+            max_new = 3000  # Generous for complete JSON
+            min_new = 800   # Ensure completion
+        else:
+            max_new = 2000
+            min_new = 100
+        max_input = 6000  # Safe input limit
+        logger.info(f"🔢 Token allocation: Input≤{max_input}, Output={min_new}-{max_new}")
         # Tokenize
         inputs = model_manager.tokenizer(
         if model_manager.device == "cuda:0":
             inputs = {k: v.to(next(model_manager.model.parameters()).device) for k, v in inputs.items()}
+        logger.info("🚀 Starting generation...")
+        # Generate with generous parameters
         with torch.no_grad():
             outputs = model_manager.model.generate(
                 **inputs,
                 do_sample=True,
                 pad_token_id=model_manager.tokenizer.eos_token_id,
                 early_stopping=False,
+                repetition_penalty=1.1,
+                use_cache=True
             )
+        # Decode the COMPLETE response
+        full_response = model_manager.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        logger.info(f"📏 Full response length: {len(full_response)} chars")
+        logger.info(f"📝 Response preview: {full_response[:200]}...")
+        # ZERO TRUNCATION EXTRACTION - Find content intelligently but never cut
+        response = full_response
+        # Look for the assistant response marker
+        assistant_marker = "<|start_header_id|>assistant<|end_header_id|>"
+        if assistant_marker in full_response:
+            # Find the position after the marker
+            marker_pos = full_response.find(assistant_marker)
+            if marker_pos != -1:
+                # Start after the marker + some whitespace
+                start_pos = marker_pos + len(assistant_marker)
+                # Skip any immediate whitespace/newlines
+                while start_pos < len(full_response) and full_response[start_pos] in ' \n\r\t':
+                    start_pos += 1
+                if start_pos < len(full_response):
+                    response = full_response[start_pos:]
+                    logger.info(f"✂️ Extracted after assistant marker: {len(response)} chars")
                 else:
+                    logger.info("🔄 Marker found but no content after, using full response")
             else:
+                logger.info("🔄 Marker search failed, using full response")
         else:
+            logger.info("🔄 No assistant marker found, using full response")
+        # For CoT, if we have a JSON array, extract it cleanly
+        if is_cot and '[' in response and ']' in response:
+            # Find the outermost JSON array
+            first_bracket = response.find('[')
+            last_bracket = response.rfind(']')
+            if first_bracket != -1 and last_bracket != -1 and last_bracket > first_bracket:
+                json_candidate = response[first_bracket:last_bracket+1]
+                # Validate it contains the expected structure
+                if '"user"' in json_candidate and '"assistant"' in json_candidate:
+                    # Count the objects to make sure we have multiple items
+                    user_count = json_candidate.count('"user"')
+                    if user_count >= 2:  # Should have at least 2 user/assistant pairs
+                        response = json_candidate
+                        logger.info(f"🎯 Extracted JSON array with {user_count} items: {len(response)} chars")
+                    else:
+                        logger.info(f"⚠️ JSON array has only {user_count} items, using full response")
                 else:
+                    logger.info("⚠️ JSON candidate failed validation, using full response")
+        # Final response
+        response = response.strip()
+        logger.info(f"✅ FINAL response: {len(response)} chars")
+        logger.info(f"🎬 Starts with: {response[:150]}...")
+        logger.info(f"🎭 Ends with: ...{response[-150:]}")
+        return response
     except Exception as e:
+        logger.error(f"💥 Generation error: {e}")
         return f"Error: {e}"
 # Initialize model ONCE
 model_manager = ModelManager()
+def api_respond(message, history_str, temperature, json_mode, template):
+    """ZERO TRUNCATION API - Pure content, no wrappers"""
     try:
+        logger.info(f"📨 API Request: {len(message)} chars, temp={temperature}")
         response = generate_response(message, temperature)
+        logger.info(f"📤 API Response: {len(response)} chars")
+        return response
     except Exception as e:
+        logger.error(f"💥 API Error: {e}")
         return f"Error: {e}"
+# BULLETPROOF GRADIO INTERFACE
 demo = gr.Interface(
+    fn=api_respond,
     inputs=[
+        gr.Textbox(label="Message", lines=8, placeholder="Enter your prompt here..."),
+        gr.Textbox(label="History", value="[]", visible=False),
         gr.Slider(minimum=0.1, maximum=1.0, value=0.8, step=0.1, label="Temperature"),
         gr.Textbox(label="JSON Mode", value="", visible=False),
         gr.Textbox(label="Template", value="", visible=False)
     ],
+    outputs=gr.Textbox(label="Response", lines=20, max_lines=50),
+    title="🎯 Question Generation API - ZERO TRUNCATION",
+    description="Rebuilt from scratch with ZERO text cutting. Generates complete responses every time.",
     api_name="respond"
 )

test_api.py DELETED Viewed

@@ -1,115 +0,0 @@
-#!/usr/bin/env python3
-"""Test the API with the EXACT request from the logs to diagnose truncation"""
-import requests
-import json
-from gradio_client import Client
-def test_api():
-    """Test the API with the exact CoT request from the logs"""
-    # EXACT request from the logs
-    test_prompt = """Return exactly this JSON array (no other text):
-[
-  {"user": "who is going to win the ravens commanders game", "assistant": "thought: to make a prediction, i'm going to need to think through the following:\nRelative strength or power of the competing teams, which establishes strength differential on a neutral field. These relative power ratings will change over the season and from end of season to the beginning of the next season.\nEach team's previous performances.\nHome field advantage, which could vary by each home and away team.\nIndividual values for each player on each team.\nPresence of injuries or illnesses that affect a team's relative power. Note that injuries/illnesses will subtract from a team's relative power. Not only do you need to adjust based on players out for the week, but you also must adjust based on active players who are playing with an injury.\nGame factors such as weather, each team's previous schedule (e.g., byes, multiple away games in a row, etc.), travel distance/difficulty, stadium quirks, and turf types.\nMotivational factors such as revenge, rivalries, coaching changes, etc.\nSteps\nEvaluate the available information based on my thoughts.\nList out all the information we think is relevant for both teams.\nDiscuss any motivational factors - players or coaches who have a history as an example\nTalk about any weaknesses on either defense who the other team might take advantage of\nLastly make a prediction on the result and score of the game."},
-  {"user": "[new question based on: You are a broadcaster and an NFL expert.  You have years of experience coaching and playing in the N...]", "assistant": "[detailed answer consistent with system context]"},
-  {"user": "[another question based on the topic]", "assistant": "[another detailed answer consistent with system context]"}
-]
-Context for new questions:
-SYSTEM: You are a broadcaster and an NFL expert.  You have years of experience coaching and playing in the NFL.  When someone asks you how you think or to make a prediction about a game or a player, you are thoughtful and detailed thinking through each element of information you would need and judging how much each element will matter
-TOPIC: Based on the user/assistant exchange above
-Requirements:
-- First item MUST use the exact user and assistant prompts provided above
-- Items 2-3 should be NEW, diverse questions with informative responses
-- All responses should be consistent with the system context
-- Return ONLY the JSON array, no additional text"""
-    print("🧪 TESTING API WITH EXACT COT REQUEST")
-    print("=" * 60)
-    print(f"Request length: {len(test_prompt)} characters")
-    print(f"Request preview: {test_prompt[:200]}...")
-    print("=" * 60)
-    try:
-        # Use Gradio Client like the actual application
-        print("📡 Connecting to Gradio API...")
-        client = Client("https://david167-question-generation-api.hf.space/")
-        print("📡 Sending request via Gradio Client...")
-        result = client.predict(
-            test_prompt,     # message
-            "[]",           # history_str
-            0.8,            # temperature
-            "",             # json_mode
-            "",             # template
-            api_name="/respond"
-        )
-        print("✅ API Response received!")
-        print(f"Result type: {type(result)}")
-        print(f"Result: {result}")
-        # Extract content based on result type
-        if isinstance(result, tuple):
-            content = result[0] if len(result) > 0 else ""
-            print("📦 Extracted from tuple")
-        elif isinstance(result, str):
-            content = result
-            print("📦 Direct string result")
-        else:
-            content = str(result)
-            print("📦 Converted to string")
-        print(f"Response length: {len(content)} characters")
-        print("=" * 60)
-        print("RESPONSE CONTENT:")
-        print(content)
-        print("=" * 60)
-        # Check for truncation indicators
-        truncation_indicators = [
-            content.endswith('", \''),  # Incomplete tuple
-            'e following:' in content[:50],  # Truncated start
-            not content.strip().endswith(']'),  # Missing JSON close
-            len(content) < 500,  # Too short for complete CoT
-        ]
-        if any(truncation_indicators):
-            print("❌ TRUNCATION DETECTED!")
-            print("Issues found:")
-            if content.endswith('", \''):
-                print("  - Response ends with incomplete tuple")
-            if 'e following:' in content[:50]:
-                print("  - Response starts mid-sentence (truncated beginning)")
-            if not content.strip().endswith(']'):
-                print("  - JSON array not properly closed")
-            if len(content) < 500:
-                print("  - Response too short for complete CoT")
-        else:
-            print("✅ NO TRUNCATION DETECTED!")
-            # Try to parse as JSON
-            try:
-                if content.strip().startswith('[') and content.strip().endswith(']'):
-                    parsed = json.loads(content.strip())
-                    print(f"✅ VALID JSON: {len(parsed)} items")
-                    # Check first item for verbatim match
-                    if len(parsed) > 0 and isinstance(parsed[0], dict):
-                        first_user = parsed[0].get('user', '')
-                        if 'who is going to win the ravens commanders game' in first_user:
-                            print("✅ FIRST ITEM VERBATIM MATCH!")
-                        else:
-                            print("❌ First item not verbatim")
-                else:
-                    print("❌ Response not valid JSON array format")
-            except json.JSONDecodeError as e:
-                print(f"❌ JSON PARSE ERROR: {e}")
-    except Exception as e:
-        print(f"❌ Test failed: {e}")
-if __name__ == "__main__":
-    test_api()