# question-generation-api / gradio_app_old.py
import os
import logging
import re

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import gradio as gr

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class ModelManager:
    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.device = None
        self.model_loaded = False
        self.load_model()

    def load_model(self):
        """Load the model and tokenizer"""
        try:
            logger.info("Starting model loading...")

            # Check if CUDA is available
            if torch.cuda.is_available():
                torch.cuda.set_device(0)
                self.device = "cuda:0"
            else:
                self.device = "cpu"

            logger.info(f"Using device: {self.device}")
            if self.device == "cuda:0":
                logger.info(f"GPU: {torch.cuda.get_device_name()}")
                logger.info(f"VRAM Available: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")

            # Get HF token from environment (required for the gated Llama models)
            hf_token = os.getenv("HF_TOKEN")

            logger.info("Loading Llama-3.1-8B-Instruct model...")
            base_model_name = "meta-llama/Llama-3.1-8B-Instruct"

            self.tokenizer = AutoTokenizer.from_pretrained(
                base_model_name,
                use_fast=True,
                trust_remote_code=True,
                token=hf_token
            )

            self.model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                torch_dtype=torch.float16 if self.device == "cuda:0" else torch.float32,
                device_map={"": 0} if self.device == "cuda:0" else None,
                trust_remote_code=True,
                low_cpu_mem_usage=True,
                use_safetensors=True,
                token=hf_token
            )
            # device_map={"": 0} already places the weights on GPU 0, so no extra .to() call is needed

            self.model_loaded = True
            logger.info("Model loaded successfully!")
        except Exception as e:
            logger.error(f"Error loading model: {str(e)}")
            self.model_loaded = False
# Initialize model manager
model_manager = ModelManager()

def generate_response(prompt, temperature=0.8):
    """Simple function to generate a response from a prompt"""
    if not model_manager.model_loaded:
        return "Model not loaded yet. Please wait..."

    try:
        # Create the Llama-3.1 chat format
        formatted_prompt = f"""<|begin_of_text|><|start_header_id|>user<|end_header_id|>
{prompt}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""
        # Determine context window and USE ABSOLUTE MAXIMUM
        try:
            max_ctx = getattr(model_manager.model.config, "max_position_embeddings", 131072)  # Llama 3.1 supports up to 131k
        except Exception:
            max_ctx = 131072  # Use maximum possible
        logger.info(f"Model max context: {max_ctx} tokens")

        # Detect if this is a Chain of Thinking request
        is_cot_request = ("chain-of-thinking" in prompt.lower() or
                          "chain of thinking" in prompt.lower() or
                          "Return exactly this JSON array" in prompt or
                          ("verbatim" in prompt.lower() and "json array" in prompt.lower()))

        # MAXIMIZE GENERATION TOKENS - use most of context for generation
        if is_cot_request:
            # For CoT, use MAXIMUM possible generation tokens
            gen_max_new_tokens = 16384  # Very high limit for complete responses
            min_tokens = 2000  # High minimum to force complete generation
            # Allow most of context for input
            allowed_input_tokens = max_ctx - gen_max_new_tokens - 100  # Small safety buffer
            logger.info(f"CoT REQUEST - MAXIMIZED: min_tokens={min_tokens}, max_new_tokens={gen_max_new_tokens}, input_limit={allowed_input_tokens}")
        else:
            # Standard requests
            gen_max_new_tokens = 8192
            min_tokens = 200
            allowed_input_tokens = max_ctx - gen_max_new_tokens - 100
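        # Worked example of the token budget above (assuming the default 131072-token context):
        #   CoT request:      131072 - 16384 - 100 = 114588 tokens left for the prompt
        #   standard request: 131072 -  8192 - 100 = 122780 tokens left for the prompt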
        # Tokenize the input with safe truncation
        inputs = model_manager.tokenizer(
            formatted_prompt,
            return_tensors="pt",
            truncation=True,
            max_length=allowed_input_tokens
        )

        # Move inputs to the same device as the model
        if model_manager.device == "cuda:0":
            model_device = next(model_manager.model.parameters()).device
            inputs = {k: v.to(model_device) for k, v in inputs.items()}

        # Generate response with MAXIMUM settings
        with torch.no_grad():
            outputs = model_manager.model.generate(
                **inputs,
                max_new_tokens=gen_max_new_tokens,
                min_new_tokens=min_tokens,
                temperature=temperature,
                top_p=0.95,
                do_sample=True,
                num_beams=1,
                pad_token_id=model_manager.tokenizer.eos_token_id,
                eos_token_id=model_manager.tokenizer.eos_token_id,
                early_stopping=False,  # only relevant for beam search; a no-op with num_beams=1
                repetition_penalty=1.05,
                no_repeat_ngram_size=0,
                length_penalty=1.0,
                use_cache=True  # enable KV caching for faster decoding
            )

        # Decode the full sequence (prompt echo + new tokens); special tokens are stripped
        generated_text = model_manager.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Log generation details for debugging
        input_length = inputs['input_ids'].shape[1]
        output_length = outputs[0].shape[0]
        generated_length = output_length - input_length
        logger.info(f"Generation stats - Input: {input_length} tokens, Generated: {generated_length} tokens, Min required: {min_tokens}")
        if generated_length < min_tokens:
            logger.warning(f"Generated {generated_length} tokens but minimum was {min_tokens} - response may be truncated")
        # Post-decode guard: if a top-level JSON array closes, trim to the first full array.
        # This helps prevent trailing prose like 'assistant' or 'Message'.
        # Note: the scan runs over the full decoded text (prompt echo included), so a complete
        # JSON array embedded in the prompt itself would be matched first.
        try:
            # Track both bracket and brace depth to find the first complete JSON structure
            bracket_depth = 0  # [ ]
            brace_depth = 0    # { }
            in_string = False
            escape_next = False
            start_idx = None
            end_idx = None

            for i, ch in enumerate(generated_text):
                # Handle string escaping
                if escape_next:
                    escape_next = False
                    continue
                if ch == '\\':
                    escape_next = True
                    continue
                # Track if we're inside a string
                if ch == '"' and not escape_next:
                    in_string = not in_string
                    continue
                # Only count brackets/braces outside of strings
                if not in_string:
                    if ch == '[':
                        if bracket_depth == 0 and brace_depth == 0 and start_idx is None:
                            start_idx = i
                        bracket_depth += 1
                    elif ch == ']':
                        bracket_depth = max(0, bracket_depth - 1)
                        if bracket_depth == 0 and brace_depth == 0 and start_idx is not None:
                            end_idx = i
                            break
                    elif ch == '{':
                        brace_depth += 1
                    elif ch == '}':
                        brace_depth = max(0, brace_depth - 1)

            if start_idx is not None and end_idx is not None and end_idx > start_idx:
                # Extract just the complete JSON array
                json_text = generated_text[start_idx:end_idx + 1]
                logger.info(f"Extracted complete JSON array of length {len(json_text)}")
                generated_text = json_text
            elif start_idx is not None:
                # Found start but no end - response was truncated
                logger.warning("JSON array started but never closed - response truncated")
                # Try to extract what we have and let the client handle it
                generated_text = generated_text[start_idx:]
        except Exception as e:
            logger.warning(f"Error in JSON extraction: {e}")
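
        # A small sketch of how a caller might validate the trimmed output (not done here;
        # `json` would need to be imported if this were enabled):
        #   import json
        #   try:
        #       questions = json.loads(generated_text)
        #   except json.JSONDecodeError:
        #       questions = None  # fall back to handling the raw text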
        # Extract just the assistant's response.
        # Note: skip_special_tokens=True usually strips the header markers from the decode,
        # so this first branch rarely matches and the fallbacks below do most of the work.
        if "<|start_header_id|>assistant<|end_header_id|>" in generated_text:
            response = generated_text.split("<|start_header_id|>assistant<|end_header_id|>")[-1].strip()
        else:
            # Better fallback: look for the start of actual content (JSON or text)
            json_match = re.search(r'(\[|\{)', generated_text)
            if json_match and json_match.start() > len(formatted_prompt) // 2:
                response = generated_text[json_match.start():].strip()
            else:
                # Look for the end of the prompt pattern
                prompt_end_patterns = [
                    "<|end_header_id|>",
                    "<|eot_id|>",
                    "assistant",
                    "\n\n"
                ]
                response = generated_text
                for pattern in prompt_end_patterns:
                    if pattern in generated_text:
                        parts = generated_text.split(pattern)
                        if len(parts) > 1:
                            # Take the last substantial part
                            candidate = parts[-1].strip()
                            if len(candidate) > 20:  # Ensure it's not too short
                                response = candidate
                                break
                # Ultimate fallback - just return everything after a reasonable point
                if response == generated_text:
                    # Skip approximately the prompt length but be conservative
                    skip_chars = min(len(formatted_prompt) // 2, len(generated_text) // 3)
                    response = generated_text[skip_chars:].strip()

        logger.info(f"Generated response length: {len(response)} characters")
        return response

    except Exception as e:
        logger.error(f"Error generating response: {str(e)}")
        return f"Error: {str(e)}"

def respond(message, history, temperature):
    """Gradio interface function for chat"""
    response = generate_response(message, temperature)
    # Update history
    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": response})
    return history, ""

# Create the Gradio interface
with gr.Blocks(title="Question Generation API") as demo:
    gr.Markdown("# Simple LLM API")
    gr.Markdown("Send a prompt and get a response. No templates, just direct model interaction.")

    with gr.Row():
        with gr.Column(scale=4):
            chatbot = gr.Chatbot(
                label="Chat",
                type="messages",
                height=400
            )
            msg = gr.Textbox(
                label="Message",
                placeholder="Enter your prompt here...",
                lines=3
            )
            with gr.Row():
                submit = gr.Button("Send", variant="primary")
                clear = gr.Button("Clear")
        with gr.Column(scale=1):
            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.8,
                step=0.1,
                label="Temperature",
                info="Higher = more creative"
            )
            gr.Markdown("""
### API Usage
This model accepts any prompt and returns a response.
For JSON responses, include instructions in your prompt like:
- "Return as a JSON array"
- "Format as JSON"
- "List as JSON"
The model will follow your instructions. An illustrative example prompt is shown below.
""")

    # Set up event handlers
    submit.click(respond, [msg, chatbot, temperature], [chatbot, msg])
    msg.submit(respond, [msg, chatbot, temperature], [chatbot, msg])
    clear.click(lambda: ([], ""), outputs=[chatbot, msg])

if __name__ == "__main__":
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )
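
# A rough sketch of calling this Space programmatically with gradio_client. The Space id and the
# event index are assumptions about the deployment, so treat this as an illustration rather than
# a verified client:
#   from gradio_client import Client
#   client = Client("david167/question-generation-api")  # assumed Space id
#   history, _ = client.predict(
#       "Return 3 quiz questions as a JSON array", [], 0.8, fn_index=0
#   )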