import os
import logging
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr
import json
import re
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class ModelManager:
    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.device = None
        self.model_loaded = False
        self.load_model()

    def load_model(self):
        """Load the model and tokenizer"""
        try:
            logger.info("Starting model loading...")
            # Check if CUDA is available
            if torch.cuda.is_available():
                torch.cuda.set_device(0)
                self.device = "cuda:0"
            else:
                self.device = "cpu"
            logger.info(f"Using device: {self.device}")
            if self.device == "cuda:0":
                logger.info(f"GPU: {torch.cuda.get_device_name()}")
                logger.info(f"VRAM available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
            # Get HF token from environment
            hf_token = os.getenv("HF_TOKEN")
            logger.info("Loading Llama-3.1-8B-Instruct model...")
            base_model_name = "meta-llama/Llama-3.1-8B-Instruct"
            self.tokenizer = AutoTokenizer.from_pretrained(
                base_model_name,
                use_fast=True,
                trust_remote_code=True,
                token=hf_token
            )
            self.model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                torch_dtype=torch.float16 if self.device == "cuda:0" else torch.float32,
                device_map="auto" if self.device == "cuda:0" else None,
                trust_remote_code=True,
                token=hf_token,
                attn_implementation="eager"  # Eager attention for broad compatibility
            )
            # Set pad token
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            self.model_loaded = True
            logger.info("Model loaded successfully!")
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            self.model_loaded = False

def generate_response(prompt, temperature=0.8):
    """ZERO TRUNCATION GENERATION - never cut anything!"""
    global model_manager
    if not model_manager or not model_manager.model_loaded:
        return "Model not loaded"
    try:
        # Detect chain-of-thinking (CoT) requests by keyword
        is_cot = any(phrase in prompt.lower() for phrase in [
            "return exactly this json array",
            "chain of thinking",
            "verbatim"
        ])
        logger.info(f"Request type: {'CoT' if is_cot else 'Standard'}")
        # Simple system message
        if is_cot:
            system = "You are an expert at generating JSON training data exactly as requested."
        else:
            system = "You are a helpful AI assistant."
        # Format the prompt with the Llama 3.1 chat special tokens
        formatted = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
{system}
<|eot_id|><|start_header_id|>user<|end_header_id|>
{prompt}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
        # Optimized token limits for speed
        if is_cot:
            max_new = 1500  # Reduced for speed
            min_new = 400   # Reduced minimum
        else:
            max_new = 800   # Significantly reduced for speed
            min_new = 50    # Lower minimum
        max_input = 6000  # Safe input limit
        logger.info(f"Token allocation: input<={max_input}, output={min_new}-{max_new}")
        # Tokenize
        inputs = model_manager.tokenizer(
            formatted,
            return_tensors="pt",
            truncation=True,
            max_length=max_input
        )
        # Move inputs to the model's device
        if model_manager.device == "cuda:0":
            inputs = {k: v.to(next(model_manager.model.parameters()).device) for k, v in inputs.items()}
        logger.info("Starting generation...")
        # Generate with generous parameters
        with torch.no_grad():
            outputs = model_manager.model.generate(
                **inputs,
                max_new_tokens=max_new,
                min_new_tokens=min_new,
                temperature=temperature,
                top_p=0.9,
                do_sample=True,
                num_beams=1,  # Single beam (sampling, not beam search) for speed
                pad_token_id=model_manager.tokenizer.eos_token_id,
                early_stopping=True,  # Only affects beam search; harmless with num_beams=1
                repetition_penalty=1.1,
                use_cache=True
            )
        # Decode the COMPLETE response. Special tokens are kept so the assistant
        # header can be located; any that remain are stripped again further down.
        full_response = model_manager.tokenizer.decode(outputs[0], skip_special_tokens=False)
        logger.info(f"Full response length: {len(full_response)} chars")
        logger.info(f"Response preview: {full_response[:200]}...")
        # ZERO TRUNCATION EXTRACTION - find the content intelligently, but never cut it
        response = full_response
        # Look for the assistant response marker
        assistant_marker = "<|start_header_id|>assistant<|end_header_id|>"
        if assistant_marker in full_response:
            # Find the position after the marker
            marker_pos = full_response.find(assistant_marker)
            if marker_pos != -1:
                # Start after the marker
                start_pos = marker_pos + len(assistant_marker)
                # Skip any immediate whitespace/newlines
                while start_pos < len(full_response) and full_response[start_pos] in ' \n\r\t':
                    start_pos += 1
                if start_pos < len(full_response):
                    response = full_response[start_pos:]
                    logger.info(f"Extracted after assistant marker: {len(response)} chars")
                else:
                    logger.info("Marker found but no content after it, using full response")
            else:
                logger.info("Marker search failed, using full response")
        else:
            logger.info("No assistant marker found, using full response")
        # Remove any special tokens that remain in the extracted text
        for special in ("<|eot_id|>", "<|end_of_text|>", "<|begin_of_text|>"):
            response = response.replace(special, "")
        # For CoT, if we have a JSON array, extract it cleanly
        if is_cot and '[' in response and ']' in response:
            # Find the outermost JSON array
            first_bracket = response.find('[')
            last_bracket = response.rfind(']')
            if first_bracket != -1 and last_bracket != -1 and last_bracket > first_bracket:
                json_candidate = response[first_bracket:last_bracket + 1]
                # Validate that it contains the expected structure
                if '"user"' in json_candidate and '"assistant"' in json_candidate:
                    # Count the objects to make sure we have multiple items
                    user_count = json_candidate.count('"user"')
                    if user_count >= 2:  # Expect at least 2 user/assistant pairs
                        response = json_candidate
                        logger.info(f"Extracted JSON array with {user_count} items: {len(response)} chars")
                    else:
                        logger.info(f"JSON array has only {user_count} items, using full response")
                else:
                    logger.info("JSON candidate failed validation, using full response")
        # Final response
        response = response.strip()
        logger.info(f"FINAL response: {len(response)} chars")
        logger.info(f"Starts with: {response[:150]}...")
        logger.info(f"Ends with: ...{response[-150:]}")
        return response
    except Exception as e:
        logger.error(f"Generation error: {e}")
        return f"Error: {e}"
# Initialize model ONCE
model_manager = ModelManager()

def api_respond(message, history_str, temperature, json_mode, template):
    """ZERO TRUNCATION API - pure content, no wrappers"""
    try:
        logger.info(f"API request: {len(message)} chars, temp={temperature}")
        response = generate_response(message, temperature)
        logger.info(f"API response: {len(response)} chars")
        return response
    except Exception as e:
        logger.error(f"API error: {e}")
        return f"Error: {e}"

# BULLETPROOF GRADIO INTERFACE
demo = gr.Interface(
    fn=api_respond,
    inputs=[
        gr.Textbox(label="Message", lines=8, placeholder="Enter your prompt here..."),
        gr.Textbox(label="History", value="[]", visible=False),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.8, step=0.1, label="Temperature"),
        gr.Textbox(label="JSON Mode", value="", visible=False),
        gr.Textbox(label="Template", value="", visible=False)
    ],
    outputs=gr.Textbox(label="Response", lines=20, max_lines=50),
    title="Question Generation API - ZERO TRUNCATION",
    description="Rebuilt from scratch with ZERO text cutting. Generates complete responses every time.",
    api_name="respond"
)

if __name__ == "__main__":
    # Enable the request queue with a concurrency limit of 10
    demo.queue(
        default_concurrency_limit=10,  # Handle 10 concurrent requests
        max_size=100  # Allow up to 100 requests in the queue
    ).launch(server_name="0.0.0.0", server_port=7860, share=False)
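
# --- Illustrative client call (sketch, not part of the app) ---
# Assuming the Space is reachable at http://localhost:7860, the /respond endpoint can be
# called with gradio_client; the positional arguments mirror the Interface inputs above
# (message, history, temperature, json_mode, template):
#
#   from gradio_client import Client
#   client = Client("http://localhost:7860")
#   result = client.predict(
#       "Generate three trivia questions about astronomy.",  # message
#       "[]",   # history (unused by api_respond)
#       0.8,    # temperature
#       "",     # json_mode (unused)
#       "",     # template (unused)
#       api_name="/respond",
#   )
#   print(result)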