david167 committed on
Commit
7822d6f
·
1 Parent(s): d82dc35

COMPLETE API REBUILD: ZERO TRUNCATION PRINCIPLE - Intelligent extraction, generous tokens, never cut content

Browse files
Files changed (2) hide show
  1. gradio_app.py +83 -87
  2. test_api.py +0 -115
gradio_app.py CHANGED
@@ -68,7 +68,7 @@ class ModelManager:
68
  self.model_loaded = False
69
 
70
  def generate_response(prompt, temperature=0.8):
71
- """BULLETPROOF GENERATION - NO MORE ERRORS!"""
72
  global model_manager
73
 
74
  if not model_manager or not model_manager.model_loaded:
@@ -82,9 +82,11 @@ def generate_response(prompt, temperature=0.8):
82
  "verbatim"
83
  ])
84
 
 
 
85
  # Simple system message
86
  if is_cot:
87
- system = "Generate the requested JSON training data."
88
  else:
89
  system = "You are a helpful AI assistant."
90
 
@@ -101,12 +103,17 @@ def generate_response(prompt, temperature=0.8):
101
 
102
  """
103
 
104
- # Token limits
105
- max_new = 2048 if is_cot else 1024
106
- min_new = 300 if is_cot else 50
107
- max_input = 6144 # Safe limit
 
 
 
 
 
108
 
109
- logger.info(f"Generating: {min_new}-{max_new} tokens, CoT={is_cot}")
110
 
111
  # Tokenize
112
  inputs = model_manager.tokenizer(
@@ -120,7 +127,9 @@ def generate_response(prompt, temperature=0.8):
120
  if model_manager.device == "cuda:0":
121
  inputs = {k: v.to(next(model_manager.model.parameters()).device) for k, v in inputs.items()}
122
 
123
- # Generate
 
 
124
  with torch.no_grad():
125
  outputs = model_manager.model.generate(
126
  **inputs,
@@ -131,116 +140,103 @@ def generate_response(prompt, temperature=0.8):
131
  do_sample=True,
132
  pad_token_id=model_manager.tokenizer.eos_token_id,
133
  early_stopping=False,
134
- repetition_penalty=1.1
 
135
  )
136
 
137
- # Decode
138
- full = model_manager.tokenizer.decode(outputs[0], skip_special_tokens=True)
139
 
140
- # ULTRA CONSERVATIVE EXTRACTION - STOP ALL TRUNCATION!
141
- logger.info(f"Full generated text length: {len(full)} chars")
142
 
143
- # For debugging - log the full text boundaries
144
- logger.info(f"Full text starts: {full[:200]}...")
145
- logger.info(f"Full text ends: ...{full[-200:]}")
146
 
147
- # Find the JSON array in the response - look for the actual content
148
- # The model should generate the JSON array directly
149
- if is_cot and '[' in full and ']' in full:
150
- # Find the JSON array boundaries
151
- start_idx = full.find('[')
152
- end_idx = full.rfind(']')
153
-
154
- if start_idx != -1 and end_idx != -1 and end_idx > start_idx:
155
- # Extract the complete JSON array
156
- json_content = full[start_idx:end_idx+1]
157
- logger.info(f"Found JSON array: {len(json_content)} chars")
158
- logger.info(f"JSON starts: {json_content[:100]}...")
159
- logger.info(f"JSON ends: ...{json_content[-100:]}")
160
 
161
- # Validate it looks like proper JSON
162
- if '"user"' in json_content and '"assistant"' in json_content:
163
- response = json_content
164
- logger.info("✅ Using extracted JSON array")
165
  else:
166
- logger.warning("❌ JSON validation failed, using full response")
167
- response = full.strip()
168
  else:
169
- logger.warning("❌ Could not find JSON boundaries, using full response")
170
- response = full.strip()
171
  else:
172
- # For non-CoT or if no JSON found, try to extract assistant response
173
- if "<|start_header_id|>assistant<|end_header_id|>" in full:
174
- parts = full.split("<|start_header_id|>assistant<|end_header_id|>")
175
- if len(parts) > 1:
176
- response = parts[-1].strip()
177
- logger.info(f"Extracted after assistant header: {len(response)} chars")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  else:
179
- response = full.strip()
180
- else:
181
- # ABSOLUTELY NO CUTTING - use everything
182
- response = full.strip()
183
- logger.info("Using complete full response - no cutting")
184
 
185
- logger.info(f"FINAL response length: {len(response)} chars")
186
- logger.info(f"FINAL starts with: {response[:150]}...")
187
- logger.info(f"FINAL ends with: ...{response[-150:]}")
188
 
189
- logger.info(f"Response generated: {len(response)} chars")
190
- return response.strip()
 
 
 
191
 
192
  except Exception as e:
193
- logger.error(f"Generation error: {e}")
194
  return f"Error: {e}"
195
 
196
  # Initialize model ONCE
197
  model_manager = ModelManager()
198
 
199
- def api_respond(message, history, temperature, json_mode=None, template=None):
200
- """API function - EXACTLY what the client expects"""
201
  try:
 
202
  response = generate_response(message, temperature)
203
-
204
- # Return EXACT format the client expects
205
- return [[
206
- {"role": "user", "metadata": None, "content": message, "options": None},
207
- {"role": "assistant", "metadata": None, "content": response, "options": None}
208
- ], ""]
209
-
210
- except Exception as e:
211
- logger.error(f"API Error: {e}")
212
- return [[
213
- {"role": "user", "metadata": None, "content": message, "options": None},
214
- {"role": "assistant", "metadata": None, "content": f"Error: {e}", "options": None}
215
- ], ""]
216
-
217
- # ABSOLUTE SIMPLEST INTERFACE - NO JSON, NO STATE, NOTHING FANCY
218
- def simple_api(message, history_str, temperature, json_mode, template):
219
- """Ultra-simple wrapper - returns JUST the content as a single string"""
220
- try:
221
- # Generate the response directly
222
- response_content = generate_response(message, temperature)
223
-
224
- # Return ONLY the content - single string, no tuple
225
- logger.info(f"Returning direct content: {len(response_content)} chars")
226
- return response_content
227
 
228
  except Exception as e:
229
- logger.error(f"Simple API Error: {e}")
230
  return f"Error: {e}"
231
 
 
232
  demo = gr.Interface(
233
- fn=simple_api,
234
  inputs=[
235
- gr.Textbox(label="Message", lines=5, placeholder="Enter your prompt here..."),
236
- gr.Textbox(label="History", value="[]", visible=False), # String, not JSON
237
  gr.Slider(minimum=0.1, maximum=1.0, value=0.8, step=0.1, label="Temperature"),
238
  gr.Textbox(label="JSON Mode", value="", visible=False),
239
  gr.Textbox(label="Template", value="", visible=False)
240
  ],
241
- outputs=gr.Textbox(label="API Response", lines=10), # Single output
242
- title="🚀 Question Generation API - DIRECT CONTENT",
243
- description="Returns content directly - no wrappers, no complications. Perfect for client integration.",
244
  api_name="respond"
245
  )
246
 
 
68
  self.model_loaded = False
69
 
70
  def generate_response(prompt, temperature=0.8):
71
+ """ZERO TRUNCATION GENERATION - Never cut anything!"""
72
  global model_manager
73
 
74
  if not model_manager or not model_manager.model_loaded:
 
82
  "verbatim"
83
  ])
84
 
85
+ logger.info(f"🎯 Request type: {'CoT' if is_cot else 'Standard'}")
86
+
87
  # Simple system message
88
  if is_cot:
89
+ system = "You are an expert at generating JSON training data exactly as requested."
90
  else:
91
  system = "You are a helpful AI assistant."
92
 
 
103
 
104
  """
105
 
106
+ # Generous token limits for complete responses
107
+ if is_cot:
108
+ max_new = 3000 # Generous for complete JSON
109
+ min_new = 800 # Ensure completion
110
+ else:
111
+ max_new = 2000
112
+ min_new = 100
113
+
114
+ max_input = 6000 # Safe input limit
115
 
116
+ logger.info(f"🔢 Token allocation: Input≤{max_input}, Output={min_new}-{max_new}")
117
 
118
  # Tokenize
119
  inputs = model_manager.tokenizer(
 
127
  if model_manager.device == "cuda:0":
128
  inputs = {k: v.to(next(model_manager.model.parameters()).device) for k, v in inputs.items()}
129
 
130
+ logger.info("🚀 Starting generation...")
131
+
132
+ # Generate with generous parameters
133
  with torch.no_grad():
134
  outputs = model_manager.model.generate(
135
  **inputs,
 
140
  do_sample=True,
141
  pad_token_id=model_manager.tokenizer.eos_token_id,
142
  early_stopping=False,
143
+ repetition_penalty=1.1,
144
+ use_cache=True
145
  )
146
 
147
+ # Decode the COMPLETE response
148
+ full_response = model_manager.tokenizer.decode(outputs[0], skip_special_tokens=True)
149
 
150
+ logger.info(f"πŸ“ Full response length: {len(full_response)} chars")
151
+ logger.info(f"πŸ“ Response preview: {full_response[:200]}...")
152
 
153
+ # ZERO TRUNCATION EXTRACTION - Find content intelligently but never cut
154
+ response = full_response
 
155
 
156
+ # Look for the assistant response marker
157
+ assistant_marker = "<|start_header_id|>assistant<|end_header_id|>"
158
+ if assistant_marker in full_response:
159
+ # Find the position after the marker
160
+ marker_pos = full_response.find(assistant_marker)
161
+ if marker_pos != -1:
162
+ # Start after the marker + some whitespace
163
+ start_pos = marker_pos + len(assistant_marker)
164
+ # Skip any immediate whitespace/newlines
165
+ while start_pos < len(full_response) and full_response[start_pos] in ' \n\r\t':
166
+ start_pos += 1
 
 
167
 
168
+ if start_pos < len(full_response):
169
+ response = full_response[start_pos:]
170
+ logger.info(f"✂️ Extracted after assistant marker: {len(response)} chars")
 
171
  else:
172
+ logger.info("🔄 Marker found but no content after, using full response")
 
173
  else:
174
+ logger.info("🔄 Marker search failed, using full response")
 
175
  else:
176
+ logger.info("🔄 No assistant marker found, using full response")
177
+
178
+ # For CoT, if we have a JSON array, extract it cleanly
179
+ if is_cot and '[' in response and ']' in response:
180
+ # Find the outermost JSON array
181
+ first_bracket = response.find('[')
182
+ last_bracket = response.rfind(']')
183
+
184
+ if first_bracket != -1 and last_bracket != -1 and last_bracket > first_bracket:
185
+ json_candidate = response[first_bracket:last_bracket+1]
186
+
187
+ # Validate it contains the expected structure
188
+ if '"user"' in json_candidate and '"assistant"' in json_candidate:
189
+ # Count the objects to make sure we have multiple items
190
+ user_count = json_candidate.count('"user"')
191
+ if user_count >= 2: # Should have at least 2 user/assistant pairs
192
+ response = json_candidate
193
+ logger.info(f"🎯 Extracted JSON array with {user_count} items: {len(response)} chars")
194
+ else:
195
+ logger.info(f"⚠️ JSON array has only {user_count} items, using full response")
196
  else:
197
+ logger.info("⚠️ JSON candidate failed validation, using full response")
 
 
 
 
198
 
199
+ # Final response
200
+ response = response.strip()
 
201
 
202
+ logger.info(f"✅ FINAL response: {len(response)} chars")
203
+ logger.info(f"🎬 Starts with: {response[:150]}...")
204
+ logger.info(f"🎭 Ends with: ...{response[-150:]}")
205
+
206
+ return response
207
 
208
  except Exception as e:
209
+ logger.error(f"💥 Generation error: {e}")
210
  return f"Error: {e}"
211
 
212
  # Initialize model ONCE
213
  model_manager = ModelManager()
214
 
215
+ def api_respond(message, history_str, temperature, json_mode, template):
216
+ """ZERO TRUNCATION API - Pure content, no wrappers"""
217
  try:
218
+ logger.info(f"📨 API Request: {len(message)} chars, temp={temperature}")
219
  response = generate_response(message, temperature)
220
+ logger.info(f"📤 API Response: {len(response)} chars")
221
+ return response
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
  except Exception as e:
224
+ logger.error(f"💥 API Error: {e}")
225
  return f"Error: {e}"
226
 
227
+ # BULLETPROOF GRADIO INTERFACE
228
  demo = gr.Interface(
229
+ fn=api_respond,
230
  inputs=[
231
+ gr.Textbox(label="Message", lines=8, placeholder="Enter your prompt here..."),
232
+ gr.Textbox(label="History", value="[]", visible=False),
233
  gr.Slider(minimum=0.1, maximum=1.0, value=0.8, step=0.1, label="Temperature"),
234
  gr.Textbox(label="JSON Mode", value="", visible=False),
235
  gr.Textbox(label="Template", value="", visible=False)
236
  ],
237
+ outputs=gr.Textbox(label="Response", lines=20, max_lines=50),
238
+ title="🎯 Question Generation API - ZERO TRUNCATION",
239
+ description="Rebuilt from scratch with ZERO text cutting. Generates complete responses every time.",
240
  api_name="respond"
241
  )
242
 
test_api.py DELETED
@@ -1,115 +0,0 @@
1
- #!/usr/bin/env python3
2
- """Test the API with the EXACT request from the logs to diagnose truncation"""
3
-
4
- import requests
5
- import json
6
- from gradio_client import Client
7
-
8
- def test_api():
9
- """Test the API with the exact CoT request from the logs"""
10
-
11
- # EXACT request from the logs
12
- test_prompt = """Return exactly this JSON array (no other text):
13
- [
14
- {"user": "who is going to win the ravens commanders game", "assistant": "thought: to make a prediction, i'm going to need to think through the following:\nRelative strength or power of the competing teams, which establishes strength differential on a neutral field. These relative power ratings will change over the season and from end of season to the beginning of the next season.\nEach team's previous performances.\nHome field advantage, which could vary by each home and away team.\nIndividual values for each player on each team.\nPresence of injuries or illnesses that affect a team's relative power. Note that injuries/illnesses will subtract from a team's relative power. Not only do you need to adjust based on players out for the week, but you also must adjust based on active players who are playing with an injury.\nGame factors such as weather, each team's previous schedule (e.g., byes, multiple away games in a row, etc.), travel distance/difficulty, stadium quirks, and turf types.\nMotivational factors such as revenge, rivalries, coaching changes, etc.\nSteps\nEvaluate the available information based on my thoughts.\nList out all the information we think is relevant for both teams.\nDiscuss any motivational factors - players or coaches who have a history as an example\nTalk about any weaknesses on either defense who the other team might take advantage of\nLastly make a prediction on the result and score of the game."},
15
- {"user": "[new question based on: You are a broadcaster and an NFL expert. You have years of experience coaching and playing in the N...]", "assistant": "[detailed answer consistent with system context]"},
16
- {"user": "[another question based on the topic]", "assistant": "[another detailed answer consistent with system context]"}
17
- ]
18
-
19
- Context for new questions:
20
- SYSTEM: You are a broadcaster and an NFL expert. You have years of experience coaching and playing in the NFL. When someone asks you how you think or to make a prediction about a game or a player, you are thoughtful and detailed thinking through each element of information you would need and judging how much each element will matter
21
- TOPIC: Based on the user/assistant exchange above
22
-
23
- Requirements:
24
- - First item MUST use the exact user and assistant prompts provided above
25
- - Items 2-3 should be NEW, diverse questions with informative responses
26
- - All responses should be consistent with the system context
27
- - Return ONLY the JSON array, no additional text"""
28
-
29
- print("🧪 TESTING API WITH EXACT COT REQUEST")
30
- print("=" * 60)
31
- print(f"Request length: {len(test_prompt)} characters")
32
- print(f"Request preview: {test_prompt[:200]}...")
33
- print("=" * 60)
34
-
35
- try:
36
- # Use Gradio Client like the actual application
37
- print("📡 Connecting to Gradio API...")
38
- client = Client("https://david167-question-generation-api.hf.space/")
39
-
40
- print("📡 Sending request via Gradio Client...")
41
- result = client.predict(
42
- test_prompt, # message
43
- "[]", # history_str
44
- 0.8, # temperature
45
- "", # json_mode
46
- "", # template
47
- api_name="/respond"
48
- )
49
-
50
- print("✅ API Response received!")
51
- print(f"Result type: {type(result)}")
52
- print(f"Result: {result}")
53
-
54
- # Extract content based on result type
55
- if isinstance(result, tuple):
56
- content = result[0] if len(result) > 0 else ""
57
- print("📦 Extracted from tuple")
58
- elif isinstance(result, str):
59
- content = result
60
- print("📦 Direct string result")
61
- else:
62
- content = str(result)
63
- print("📦 Converted to string")
64
-
65
- print(f"Response length: {len(content)} characters")
66
- print("=" * 60)
67
- print("RESPONSE CONTENT:")
68
- print(content)
69
- print("=" * 60)
70
-
71
- # Check for truncation indicators
72
- truncation_indicators = [
73
- content.endswith('", \''), # Incomplete tuple
74
- 'e following:' in content[:50], # Truncated start
75
- not content.strip().endswith(']'), # Missing JSON close
76
- len(content) < 500, # Too short for complete CoT
77
- ]
78
-
79
- if any(truncation_indicators):
80
- print("❌ TRUNCATION DETECTED!")
81
- print("Issues found:")
82
- if content.endswith('", \''):
83
- print(" - Response ends with incomplete tuple")
84
- if 'e following:' in content[:50]:
85
- print(" - Response starts mid-sentence (truncated beginning)")
86
- if not content.strip().endswith(']'):
87
- print(" - JSON array not properly closed")
88
- if len(content) < 500:
89
- print(" - Response too short for complete CoT")
90
- else:
91
- print("✅ NO TRUNCATION DETECTED!")
92
-
93
- # Try to parse as JSON
94
- try:
95
- if content.strip().startswith('[') and content.strip().endswith(']'):
96
- parsed = json.loads(content.strip())
97
- print(f"✅ VALID JSON: {len(parsed)} items")
98
-
99
- # Check first item for verbatim match
100
- if len(parsed) > 0 and isinstance(parsed[0], dict):
101
- first_user = parsed[0].get('user', '')
102
- if 'who is going to win the ravens commanders game' in first_user:
103
- print("✅ FIRST ITEM VERBATIM MATCH!")
104
- else:
105
- print("❌ First item not verbatim")
106
- else:
107
- print("❌ Response not valid JSON array format")
108
- except json.JSONDecodeError as e:
109
- print(f"❌ JSON PARSE ERROR: {e}")
110
-
111
- except Exception as e:
112
- print(f"❌ Test failed: {e}")
113
-
114
- if __name__ == "__main__":
115
- test_api()