Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| from transformers import AutoModelForCausalLM, AutoTokenizer | |
| import torch | |
# --- Model and Tokenizer Loading ---
# The model and tokenizer are created once, at import time, and shared by
# every call to the prediction function; nothing is reloaded per request.
MODEL_NAME = "Vinnnf/Thinkless-1.5B-Warmup"

try:
    print(f"Loading model: {MODEL_NAME}...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype="auto",  # let transformers choose the dtype
        device_map="auto",   # let transformers choose device placement
    )
    print("Model loaded successfully.")

    print(f"Loading tokenizer for: {MODEL_NAME}...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    print("Tokenizer loaded successfully.")
except Exception as load_error:
    # The app cannot work without both objects, so the failure is logged
    # and then re-raised unchanged instead of being swallowed.
    print(f"Error loading model or tokenizer: {load_error}")
    raise
| # --- Prediction Function --- | |
def _error_outputs(message, model_input=""):
    """Build the four UI outputs for a request that produced no generation.

    The tuple uses the same order as a successful result:
    (actual model input, model response, token count, original prompt).
    The message goes in the response slot so it shows up in the
    "Model Response" box rather than in the box that displays the prompt.
    """
    return model_input, message, "N/A", "N/A"


def generate_response(instruction_text, prompt_question, think_mode_active, max_tokens):
    """Generate a response from the language model for the given inputs.

    Args:
        instruction_text: Overall instruction placed before the question.
        prompt_question: The question or task itself.
        think_mode_active: If truthy, "<think>" is appended after the chat
            template; otherwise "<short>" is appended.
        max_tokens: Upper bound on newly generated tokens; anything that
            ``int()`` accepts and that converts to a positive integer.

    Returns:
        A 4-tuple in the order of the interface's output components:
        (text actually fed to the model, decoded response, number of
        generated tokens, combined instruction + question). When the request
        cannot be served, the response slot carries an error message, the
        last two slots are "N/A", and the first slot holds the templated
        input if it had already been built (otherwise "").
    """
    if not instruction_text or not prompt_question:
        return _error_outputs("Error: Instruction and Prompt Question cannot be empty.")

    try:
        # Instruction and question are sent as a single user turn.
        full_prompt_content = f"{instruction_text}\n{prompt_question}"
        messages = [{"role": "user", "content": full_prompt_content}]

        # tokenize=False: the mode tag is appended to the rendered text
        # before tokenization.
        text_from_template = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )
        mode_tag = "<think>" if think_mode_active else "<short>"
        final_input_text = f"{text_from_template}{mode_tag}"

        # Validate the token budget before tokenizing or touching the model,
        # so bad input is rejected without doing device work.
        try:
            max_new_tokens_int = int(max_tokens)
        except (TypeError, ValueError):
            return _error_outputs(
                "Error: Max new tokens must be an integer.", final_input_text
            )
        if max_new_tokens_int <= 0:
            return _error_outputs(
                "Error: Max new tokens must be a positive integer.", final_input_text
            )

        # Inputs are moved to the device the model reports.
        model_inputs = tokenizer([final_input_text], return_tensors="pt").to(model.device)

        print(f"Generating with max_new_tokens: {max_new_tokens_int}")
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=max_new_tokens_int,
        )

        # The generated ids start with the prompt ids; slice them off so
        # only the newly generated part is decoded and counted.
        input_ids_length = model_inputs.input_ids.shape[1]
        output_only_ids = generated_ids[:, input_ids_length:]
        num_generated_tokens = len(output_only_ids[0])

        response_text = tokenizer.batch_decode(output_only_ids, skip_special_tokens=True)[0]
        return final_input_text, response_text, num_generated_tokens, full_prompt_content
    except Exception as e:
        # UI boundary: log the failure and show it in the response box
        # instead of letting the request crash.
        print(f"Error during generation: {e}")
        return _error_outputs(f"An error occurred: {str(e)}")
# --- Gradio Interface Definition ---
# Initial values shown in the input widgets when the page first loads.
DEFAULT_INSTRUCTION = "Please reason step by step, and put your final answer within \\boxed{}."
DEFAULT_PROMPT_QUESTION = "The arithmetic mean of 7, 2, $x$ and 10 is 9. What is the value of $x$?"
DEFAULT_THINK_MODE = True
DEFAULT_MAX_TOKENS = 512  # initial position of the "Max New Tokens" slider

# Input components, created in the same order as the parameters of
# `generate_response`.
instruction_input = gr.Textbox(
    label="Instruction",
    info="The overall instruction for the model (e.g., reasoning style).",
    value=DEFAULT_INSTRUCTION,
    lines=3,
)
prompt_question_input = gr.Textbox(
    label="Prompt Question",
    info="The specific question or task for the model.",
    value=DEFAULT_PROMPT_QUESTION,
    lines=3,
)
think_mode_checkbox = gr.Checkbox(
    label="Enable Think Mode (<think> tag)",
    info="If checked, adds '<think>' for detailed reasoning. If unchecked, adds '<short>' for concise answers.",
    value=DEFAULT_THINK_MODE,
)
max_tokens_slider = gr.Slider(
    label="Max New Tokens",
    info="Maximum number of tokens to generate for the response.",
    minimum=32,
    maximum=4096,
    step=32,
    value=DEFAULT_MAX_TOKENS,
)
# Output components. All four are read-only text boxes; the multi-line ones
# also get a copy button.
full_prompt_output = gr.Textbox(
    label="Actual Input to Model (with template and tag)",
    interactive=False,
    show_copy_button=True,
    lines=5,
)
response_output = gr.Textbox(
    label="Model Response",
    interactive=False,
    show_copy_button=True,
    lines=10,
)
num_tokens_output = gr.Textbox(
    label="Number of Generated Tokens",
    interactive=False,
)
original_prompt_output = gr.Textbox(
    label="Original User Prompt (Instruction + Question)",
    interactive=False,
    show_copy_button=True,
    lines=3,
)
# Assemble the Gradio interface.
# `inputs` is listed in the order of `generate_response`'s parameters and
# `outputs` in the order of the values it returns. Each row of `examples`
# supplies one value per input, in that same order.
app_interface = gr.Interface(
    title="Thinkless Model Interface",
    description=(
        "Interact with the Vinnnf/Thinkless-1.5B-Warmup model. "
        "Provide an instruction and a prompt, choose a thinking mode, and set max tokens. "
        "The model will generate a response based on your input. "
        "Note: Model loading might take a few moments when the app starts."
    ),
    fn=generate_response,
    inputs=[
        instruction_input,
        prompt_question_input,
        think_mode_checkbox,
        max_tokens_slider,
    ],
    outputs=[
        full_prompt_output,
        response_output,
        num_tokens_output,
        original_prompt_output,  # the combined instruction + question
    ],
    examples=[
        [
            "Please reason step by step, and put your final answer within \\boxed{}.",
            "Sarah has 5 apples. She gives 2 apples to John and then buys 3 more apples. How many apples does Sarah have now?",
            True,
            256,
        ],
        [
            "Provide a concise answer.",
            "What is the capital of France?",
            False,
            64,
        ],
        [
            "Explain the concept of photosynthesis in simple terms.",
            "What is photosynthesis?",
            True,
            512,
        ],
    ],
    allow_flagging="never",  # flagging is turned off for this demo
)
# --- Launch the App ---
def main():
    """Announce start-up on stdout and launch the Gradio interface."""
    print("Starting Gradio app...")
    # With no arguments, launch() uses Gradio's default server settings.
    app_interface.launch()


if __name__ == "__main__":
    main()

# Deployment note: to host this on Hugging Face Spaces, save the file as
# app.py and list these packages in requirements.txt:
#   gradio
#   transformers
#   torch
#   sentencepiece  (often a dependency for tokenizers)
#   accelerate     (presumably required because the model is loaded with
#                   device_map="auto" - confirm against the transformers docs)