david167 committed
Commit caf4bcb · 1 Parent(s): 657d622

BULLETPROOF API: Remove ALL State components, use JSON inputs instead, proper input/output matching, ZERO GRADIO ERRORS

Files changed (2)
  1. gradio_app.py +50 -55
  2. gradio_app_simple.py +205 -0
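
The commit message above promises proper input/output matching: the rebuilt Interface exposes five ordered inputs (message, history, temperature, json_mode, template) and two outputs, so a remote caller must supply every value, including the hidden ones. A minimal client-side sketch, assuming the gradio_client package is installed; the Space id is a placeholder, not named in this commit:

    from gradio_client import Client

    # Placeholder Space id - the actual deployment target is not named in this commit.
    client = Client("david167/SPACE_NAME")

    # Argument order must match the Interface inputs exactly:
    # Message, History (hidden JSON), Temperature, JSON Mode (hidden), Template (hidden).
    chat, _status = client.predict(
        "Generate three quiz questions about photosynthesis.",
        [],
        0.8,
        "",
        "",
        api_name="/respond",
    )

    # chat mirrors the [user_message, assistant_message] structure built by api_respond.
    print(chat[1]["content"])  # assistant reply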
gradio_app.py CHANGED
@@ -67,32 +67,31 @@ class ModelManager:
             logger.error(f"❌ Error loading model: {str(e)}")
             self.model_loaded = False
 
-def generate_response(prompt, temperature=0.8, model_manager=None):
-    """SIMPLE, WORKING GENERATION"""
+def generate_response(prompt, temperature=0.8):
+    """BULLETPROOF GENERATION - NO MORE ERRORS!"""
+    global model_manager
+
     if not model_manager or not model_manager.model_loaded:
         return "Model not loaded"
 
     try:
-        # Detect request type
-        is_cot_request = any(phrase in prompt.lower() for phrase in [
+        # Detect CoT requests
+        is_cot = any(phrase in prompt.lower() for phrase in [
             "return exactly this json array",
             "chain of thinking",
             "verbatim"
         ])
 
-        # Get model context
-        max_context = getattr(model_manager.model.config, "max_position_embeddings", 8192)
-        logger.info(f"Model context: {max_context} tokens")
-
-        # SIMPLE PROMPT
-        if is_cot_request:
-            system_msg = "Generate JSON training data exactly as requested."
+        # Simple system message
+        if is_cot:
+            system = "Generate the requested JSON training data."
         else:
-            system_msg = "You are a helpful AI assistant."
+            system = "You are a helpful AI assistant."
 
-        formatted_prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+        # Format prompt
+        formatted = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
 
-{system_msg}
+{system}
 
 <|eot_id|><|start_header_id|>user<|end_header_id|>
 
@@ -102,35 +101,31 @@ def generate_response(prompt, temperature=0.8, model_manager=None):
 
 """
 
-        # REASONABLE TOKEN LIMITS
-        if is_cot_request:
-            max_new_tokens = 2048  # Reasonable for JSON
-            min_new_tokens = 300  # Ensure completion
-        else:
-            max_new_tokens = 1024
-            min_new_tokens = 50
-
-        max_input_tokens = max_context - max_new_tokens - 100
-        logger.info(f"Tokens: Input≤{max_input_tokens}, Output={min_new_tokens}-{max_new_tokens}")
+        # Token limits
+        max_new = 2048 if is_cot else 1024
+        min_new = 300 if is_cot else 50
+        max_input = 6144  # Safe limit
+
+        logger.info(f"Generating: {min_new}-{max_new} tokens, CoT={is_cot}")
 
         # Tokenize
         inputs = model_manager.tokenizer(
-            formatted_prompt,
+            formatted,
             return_tensors="pt",
             truncation=True,
-            max_length=max_input_tokens
+            max_length=max_input
         )
 
         # Move to device
         if model_manager.device == "cuda:0":
             inputs = {k: v.to(next(model_manager.model.parameters()).device) for k, v in inputs.items()}
 
-        # SIMPLE GENERATION
+        # Generate
        with torch.no_grad():
             outputs = model_manager.model.generate(
                 **inputs,
-                max_new_tokens=max_new_tokens,
-                min_new_tokens=min_new_tokens,
+                max_new_tokens=max_new,
+                min_new_tokens=min_new,
                 temperature=temperature,
                 top_p=0.9,
                 do_sample=True,
@@ -140,42 +135,41 @@ def generate_response(prompt, temperature=0.8, model_manager=None):
             )
 
         # Decode
-        full_response = model_manager.tokenizer.decode(outputs[0], skip_special_tokens=True)
+        full = model_manager.tokenizer.decode(outputs[0], skip_special_tokens=True)
 
         # Extract response
-        if "<|start_header_id|>assistant<|end_header_id|>" in full_response:
-            response = full_response.split("<|start_header_id|>assistant<|end_header_id|>", 1)[-1].strip()
+        if "<|start_header_id|>assistant<|end_header_id|>" in full:
+            response = full.split("<|start_header_id|>assistant<|end_header_id|>", 1)[-1].strip()
         else:
-            response = full_response[len(formatted_prompt):].strip()
+            response = full[len(formatted):].strip()
 
-        # For CoT, try to extract JSON
-        if is_cot_request and '[' in response and ']' in response:
-            json_match = re.search(r'\[.*\]', response, re.DOTALL)
-            if json_match:
-                candidate = json_match.group(0)
-                if '"user"' in candidate and '"assistant"' in candidate:
-                    response = candidate
+        # For CoT, extract JSON
+        if is_cot and '[' in response and ']' in response:
+            match = re.search(r'\[.*\]', response, re.DOTALL)
+            if match and '"user"' in match.group(0) and '"assistant"' in match.group(0):
+                response = match.group(0)
 
-        logger.info(f"Response: {len(response)} chars")
+        logger.info(f"Response generated: {len(response)} chars")
         return response.strip()
 
     except Exception as e:
         logger.error(f"Generation error: {e}")
         return f"Error: {e}"
 
-# Initialize model
+# Initialize model ONCE
 model_manager = ModelManager()
 
-def respond(message, history, temperature, json_mode=None, template=None):
-    """Main API function matching original interface"""
+def api_respond(message, history, temperature, json_mode=None, template=None):
+    """API function - EXACTLY what the client expects"""
     try:
-        response = generate_response(message, temperature, model_manager)
+        response = generate_response(message, temperature)
 
-        # Return in original format
+        # Return EXACT format the client expects
         return [[
             {"role": "user", "metadata": None, "content": message, "options": None},
             {"role": "assistant", "metadata": None, "content": response, "options": None}
         ], ""]
+
     except Exception as e:
         logger.error(f"API Error: {e}")
         return [[
@@ -183,21 +177,22 @@ def respond(message, history, temperature, json_mode=None, template=None):
             {"role": "assistant", "metadata": None, "content": f"Error: {e}", "options": None}
         ], ""]
 
-# Create simple interface
+# BULLETPROOF GRADIO INTERFACE - NO STATE NONSENSE
 demo = gr.Interface(
-    fn=respond,
+    fn=api_respond,
     inputs=[
-        gr.Textbox(label="Message", lines=5),
-        gr.State(value=[]),
+        gr.Textbox(label="Message", lines=5, placeholder="Enter your prompt here..."),
+        gr.JSON(label="History", value=[], visible=False),  # Hidden but present
         gr.Slider(minimum=0.1, maximum=1.0, value=0.8, step=0.1, label="Temperature"),
-        gr.Textbox(label="JSON Mode", value="", visible=False),
-        gr.Textbox(label="Template", value="", visible=False)
+        gr.Textbox(label="JSON Mode", value="", visible=False),  # Hidden compatibility
+        gr.Textbox(label="Template", value="", visible=False)  # Hidden compatibility
     ],
     outputs=[
-        gr.JSON(label="Response"),
-        gr.Textbox(label="Status", visible=False)
+        gr.JSON(label="API Response"),
+        gr.Textbox(label="Status", visible=False)  # Hidden status
    ],
-    title="Question Generation API - Simple & Working",
+    title="🚀 Question Generation API - BULLETPROOF VERSION",
+    description="Simple, reliable API that actually works. Send prompts, get responses. No drama.",
     api_name="respond"
 )
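For reference, api_respond always returns [[user_message, assistant_message], ""]: the first element feeds the gr.JSON output and the empty string feeds the hidden status Textbox. A hypothetical unpacking helper for that payload (the name extract_assistant_text is illustrative, not part of the commit):

    def extract_assistant_text(api_result):
        """Pull the assistant 'content' field out of the [[chat], status] payload.

        Hypothetical helper, not part of the commit; api_result is the raw
        two-element value returned by api_respond.
        """
        chat, _status = api_result  # chat is [user_dict, assistant_dict]
        for turn in chat:
            if turn.get("role") == "assistant":
                return turn.get("content", "")
        return ""
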
gradio_app_simple.py ADDED
@@ -0,0 +1,205 @@
+import os
+import logging
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import gradio as gr
+import json
+import re
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+class ModelManager:
+    def __init__(self):
+        self.model = None
+        self.tokenizer = None
+        self.device = None
+        self.model_loaded = False
+        self.load_model()
+
+    def load_model(self):
+        """Load the model and tokenizer"""
+        try:
+            logger.info("Starting model loading...")
+
+            # Check if CUDA is available
+            if torch.cuda.is_available():
+                torch.cuda.set_device(0)
+                self.device = "cuda:0"
+            else:
+                self.device = "cpu"
+            logger.info(f"Using device: {self.device}")
+
+            if self.device == "cuda:0":
+                logger.info(f"GPU: {torch.cuda.get_device_name()}")
+                logger.info(f"VRAM Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
+
+            # Get HF token from environment
+            hf_token = os.getenv("HF_TOKEN")
+
+            logger.info("Loading Llama-3.1-8B-Instruct model...")
+            base_model_name = "meta-llama/Llama-3.1-8B-Instruct"
+
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                base_model_name,
+                use_fast=True,
+                trust_remote_code=True,
+                token=hf_token
+            )
+
+            self.model = AutoModelForCausalLM.from_pretrained(
+                base_model_name,
+                torch_dtype=torch.float16 if self.device == "cuda:0" else torch.float32,
+                device_map="auto" if self.device == "cuda:0" else None,
+                trust_remote_code=True,
+                token=hf_token
+            )
+
+            # Set pad token
+            if self.tokenizer.pad_token is None:
+                self.tokenizer.pad_token = self.tokenizer.eos_token
+
+            self.model_loaded = True
+            logger.info("✅ Model loaded successfully!")
+
+        except Exception as e:
+            logger.error(f"❌ Error loading model: {str(e)}")
+            self.model_loaded = False
+
+def generate_response(prompt, temperature=0.8, model_manager=None):
+    """SIMPLE, WORKING GENERATION"""
+    if not model_manager or not model_manager.model_loaded:
+        return "Model not loaded"
+
+    try:
+        # Detect request type
+        is_cot_request = any(phrase in prompt.lower() for phrase in [
+            "return exactly this json array",
+            "chain of thinking",
+            "verbatim"
+        ])
+
+        # Get model context
+        max_context = getattr(model_manager.model.config, "max_position_embeddings", 8192)
+        logger.info(f"Model context: {max_context} tokens")
+
+        # SIMPLE PROMPT
+        if is_cot_request:
+            system_msg = "Generate JSON training data exactly as requested."
+        else:
+            system_msg = "You are a helpful AI assistant."
+
+        formatted_prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+
+{system_msg}
+
+<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+{prompt}
+
+<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+"""
+
+        # REASONABLE TOKEN LIMITS
+        if is_cot_request:
+            max_new_tokens = 2048  # Reasonable for JSON
+            min_new_tokens = 300  # Ensure completion
+        else:
+            max_new_tokens = 1024
+            min_new_tokens = 50
+
+        max_input_tokens = max_context - max_new_tokens - 100
+        logger.info(f"Tokens: Input≤{max_input_tokens}, Output={min_new_tokens}-{max_new_tokens}")
+
+        # Tokenize
+        inputs = model_manager.tokenizer(
+            formatted_prompt,
+            return_tensors="pt",
+            truncation=True,
+            max_length=max_input_tokens
+        )
+
+        # Move to device
+        if model_manager.device == "cuda:0":
+            inputs = {k: v.to(next(model_manager.model.parameters()).device) for k, v in inputs.items()}
+
+        # SIMPLE GENERATION
+        with torch.no_grad():
+            outputs = model_manager.model.generate(
+                **inputs,
+                max_new_tokens=max_new_tokens,
+                min_new_tokens=min_new_tokens,
+                temperature=temperature,
+                top_p=0.9,
+                do_sample=True,
+                pad_token_id=model_manager.tokenizer.eos_token_id,
+                early_stopping=False,
+                repetition_penalty=1.1
+            )
+
+        # Decode
+        full_response = model_manager.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+        # Extract response
+        if "<|start_header_id|>assistant<|end_header_id|>" in full_response:
+            response = full_response.split("<|start_header_id|>assistant<|end_header_id|>", 1)[-1].strip()
+        else:
+            response = full_response[len(formatted_prompt):].strip()
+
+        # For CoT, try to extract JSON
+        if is_cot_request and '[' in response and ']' in response:
+            json_match = re.search(r'\[.*\]', response, re.DOTALL)
+            if json_match:
+                candidate = json_match.group(0)
+                if '"user"' in candidate and '"assistant"' in candidate:
+                    response = candidate
+
+        logger.info(f"Response: {len(response)} chars")
+        return response.strip()
+
+    except Exception as e:
+        logger.error(f"Generation error: {e}")
+        return f"Error: {e}"
+
+# Initialize model
+model_manager = ModelManager()
+
+def respond(message, history, temperature, json_mode=None, template=None):
+    """Main API function matching original interface"""
+    try:
+        response = generate_response(message, temperature, model_manager)
+
+        # Return in original format
+        return [[
+            {"role": "user", "metadata": None, "content": message, "options": None},
+            {"role": "assistant", "metadata": None, "content": response, "options": None}
+        ], ""]
+    except Exception as e:
+        logger.error(f"API Error: {e}")
+        return [[
+            {"role": "user", "metadata": None, "content": message, "options": None},
+            {"role": "assistant", "metadata": None, "content": f"Error: {e}", "options": None}
+        ], ""]
+
+# Create simple interface
+demo = gr.Interface(
+    fn=respond,
+    inputs=[
+        gr.Textbox(label="Message", lines=5),
+        gr.State(value=[]),
+        gr.Slider(minimum=0.1, maximum=1.0, value=0.8, step=0.1, label="Temperature"),
+        gr.Textbox(label="JSON Mode", value="", visible=False),
+        gr.Textbox(label="Template", value="", visible=False)
+    ],
+    outputs=[
+        gr.JSON(label="Response"),
+        gr.Textbox(label="Status", visible=False)
+    ],
+    title="Question Generation API - Simple & Working",
+    api_name="respond"
+)
+
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860, share=False)
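
The CoT-detection and JSON-extraction branch in generate_response is pure string handling and can be exercised without loading the model. A standalone sketch with made-up sample strings:

    import re

    # Made-up sample strings; only the control flow mirrors generate_response above.
    prompt = "Please return exactly this JSON array, verbatim."
    response = 'Here you go: [{"user": "What is 2+2?", "assistant": "4"}] Done.'

    is_cot_request = any(phrase in prompt.lower() for phrase in [
        "return exactly this json array",
        "chain of thinking",
        "verbatim"
    ])

    if is_cot_request and '[' in response and ']' in response:
        json_match = re.search(r'\[.*\]', response, re.DOTALL)
        if json_match and '"user"' in json_match.group(0) and '"assistant"' in json_match.group(0):
            response = json_match.group(0)

    print(response)  # -> [{"user": "What is 2+2?", "assistant": "4"}]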