import gradio as gr
from openai import OpenAI
import os

# Retrieve the access token from the environment variable
ACCESS_TOKEN = os.getenv("HF_TOKEN")
print("Access token loaded.")

# Initialize the OpenAI client with the Hugging Face Inference API endpoint
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1/",
    api_key=ACCESS_TOKEN,
)
print("OpenAI client initialized.")

def respond(
    user_message,
    chat_history,
    system_msg,
    max_tokens,
    temperature,
    top_p,
    frequency_penalty,
    seed,
    featured_model,
    custom_model
):
    """
    This function handles the chatbot response. It takes in:
    - user_message: the user's newly typed message
    - chat_history: the list of (user, assistant) message pairs
    - system_msg: the system instruction or system-level context
    - max_tokens: the maximum number of tokens to generate
    - temperature: sampling temperature
    - top_p: top-p (nucleus) sampling
    - frequency_penalty: penalize repeated tokens in the output
    - seed: a fixed seed for reproducibility; -1 means 'random'
    - featured_model: the model chosen from the 'Featured Models' radio
    - custom_model: an optional custom model that overrides the featured one if provided
    """
    print(f"Received user message: {user_message}")
    print(f"System message: {system_msg}")
    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}, Freq-Penalty: {frequency_penalty}, Seed: {seed}")
    print(f"Featured model: {featured_model}")
    print(f"Custom model: {custom_model}")

    # Convert the seed to None if the user set it to -1 (meaning random)
    if seed == -1:
        seed = None

    # Decide which model to actually use:
    # if custom_model is non-empty, it wins; otherwise use the chosen featured_model
    model_to_use = custom_model.strip() if custom_model.strip() != "" else featured_model

    # Provide a default fallback if for some reason both are empty
    if model_to_use.strip() == "":
        model_to_use = "meta-llama/Llama-3.3-70B-Instruct"
    print(f"Model selected for inference: {model_to_use}")

    # Construct the conversation history in the format required by HF's Inference API
    messages = []
    if system_msg.strip():
        messages.append({"role": "system", "content": system_msg.strip()})

    # Add the conversation history
    for user_text, assistant_text in chat_history:
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})

    # Add the new user message to the conversation
    messages.append({"role": "user", "content": user_message})
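
    # At this point `messages` follows the standard OpenAI chat format, e.g.:
    #   [{"role": "system", "content": "You are helpful."},
    #    {"role": "user", "content": "Hi"},
    #    {"role": "assistant", "content": "Hello!"},
    #    {"role": "user", "content": "<the new message>"}]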

    # We'll build the response token-by-token in a streaming loop
    response_so_far = ""
    print("Sending request to the Hugging Face Inference API...")

    # Make the streaming request to the HF Inference API
    try:
        for resp_chunk in client.chat.completions.create(
            model=model_to_use,
            max_tokens=max_tokens,
            stream=True,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=frequency_penalty,
            seed=seed,
            messages=messages,
        ):
            # Some chunks (e.g. the final one) carry no content, so guard against None
            token_text = resp_chunk.choices[0].delta.content
            if token_text is None:
                continue
            response_so_far += token_text
            # Yield the updated message back to display partial progress in the chatbot
            yield response_so_far
    except Exception as e:
        # If there's an error, at least surface it in the chat
        error_text = f"[ERROR] {str(e)}"
        print(error_text)
        yield response_so_far + "\n\n" + error_text

    print("Completed response generation.")

#
# BUILDING THE GRADIO INTERFACE BELOW
#

# List of featured models; adjust or replace these placeholders with real text-generation models
models_list = [
    "meta-llama/Llama-3.3-70B-Instruct",
    "meta-llama/Llama-2-13b-chat-hf",
    "bigscience/bloom",
    "openlm-research/open_llama_7b",
    "facebook/opt-6.7b",
    "google/flan-t5-xxl",
]

def filter_models(search_term):
    """Filters the models_list by the given search_term and returns an update for the Radio component."""
    filtered = [m for m in models_list if search_term.lower() in m.lower()]
    return gr.update(choices=filtered)
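
# For example, filter_models("llama") narrows the radio to the three entries
# whose ids contain "llama" (the match is a case-insensitive substring test).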

with gr.Blocks(theme="Nymbo/Nymbo_Theme_5") as demo:
    gr.Markdown("# Serverless-TextGen-Hub (Enhanced)")
    gr.Markdown("**A comprehensive UI for text generation with a featured-models selector and a custom override.**")

    # We keep track of the conversation in a Gradio state variable (list of tuples)
    chat_history = gr.State([])

    # Tabs for organization
    with gr.Tab("Basic Settings"):
        with gr.Row():
            with gr.Column(elem_id="prompt-container"):
                # System Message
                system_msg = gr.Textbox(
                    label="System message",
                    placeholder="Enter system-level instructions or context here.",
                    lines=2
                )

                # Accordion for featured models
                with gr.Accordion("Featured Models", open=True):
                    model_search = gr.Textbox(
                        label="Filter Models",
                        placeholder="Search for a featured model...",
                        lines=1
                    )
                    # The radio that lists our featured models
                    model_radio = gr.Radio(
                        label="Select a featured model below",
                        choices=models_list,
                        value=models_list[0],  # default
                        interactive=True
                    )
                    # Link the search box to update the model_radio choices
                    model_search.change(filter_models, inputs=model_search, outputs=model_radio)

                # Custom Model
                custom_model_box = gr.Textbox(
                    label="Custom Model (Optional)",
                    info="If provided, overrides the featured model above. e.g. 'meta-llama/Llama-3.3-70B-Instruct'",
                    placeholder="Your huggingface.co/username/model_name path"
                )
| with gr.Tab("Advanced Settings"): | |
| with gr.Row(): | |
| max_tokens_slider = gr.Slider( | |
| minimum=1, | |
| maximum=4096, | |
| value=512, | |
| step=1, | |
| label="Max new tokens" | |
| ) | |
| temperature_slider = gr.Slider( | |
| minimum=0.1, | |
| maximum=4.0, | |
| value=0.7, | |
| step=0.1, | |
| label="Temperature" | |
| ) | |
| top_p_slider = gr.Slider( | |
| minimum=0.1, | |
| maximum=1.0, | |
| value=0.95, | |
| step=0.05, | |
| label="Top-P" | |
| ) | |
| with gr.Row(): | |
| freq_penalty_slider = gr.Slider( | |
| minimum=-2.0, | |
| maximum=2.0, | |
| value=0.0, | |
| step=0.1, | |
| label="Frequency Penalty" | |
| ) | |
| seed_slider = gr.Slider( | |
| minimum=-1, | |
| maximum=65535, | |
| value=-1, | |
| step=1, | |
| label="Seed (-1 for random)" | |
| ) | |

    # Chat interface area: user input -> assistant output
    with gr.Row():
        chatbot = gr.Chatbot(
            label="TextGen Chat",
            height=500
        )

    # The user types a message here
    user_input = gr.Textbox(
        label="Your message",
        placeholder="Type your text prompt here..."
    )

    # The "Send" button triggers our respond() function and updates the chatbot
    send_button = gr.Button("Send")

    # A Clear Chat button to reset the conversation
    clear_button = gr.Button("Clear Chat")

    # Define how the Send button updates the state and chatbot
    def user_submission(user_text, history):
        """
        Called first to add the user's message to the chat.
        Returns the updated chat_history with the user's message appended,
        plus an empty string to clear the user input box.
        """
        if user_text.strip() == "":
            return history, ""
        # Append the user message to the chat
        history = history + [(user_text, None)]
        return history, ""

    # Keep a handle on the click event so the bot response below can be
    # chained with .then(), guaranteeing it runs after the history update
    send_event = send_button.click(
        fn=user_submission,
        inputs=[user_input, chat_history],
        outputs=[chat_history, user_input]
    )
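
    # The send flow is two-phase: user_submission commits the user's message to
    # state, then bot_response streams the assistant's reply into the last slot.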

    # Then we run the respond function (streaming) to generate the assistant message
    def bot_response(
        history,
        system_msg,
        max_tokens,
        temperature,
        top_p,
        freq_penalty,
        seed,
        featured_model,
        custom_model
    ):
        """
        Generates the assistant's response based on the conversation so far,
        the system message, and the sampling settings. Streaming happens here.
        """
        if not history:
            # Nothing to respond to yet
            yield history, history
            return

        # The last user message is in history[-1][0]
        user_message = history[-1][0]

        # Pass everything to the respond() generator
        bot_stream = respond(
            user_message=user_message,
            chat_history=history[:-1],  # all except the newly appended user message
            system_msg=system_msg,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            frequency_penalty=freq_penalty,
            seed=seed,
            featured_model=featured_model,
            custom_model=custom_model
        )

        for partial_text in bot_stream:
            # Keep updating the last message in the conversation with partial_text;
            # yield to both the chatbot and the state so the assistant's reply
            # survives into the next turn
            updated_history = history[:-1] + [(history[-1][0], partial_text)]
            yield updated_history, updated_history
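
    # In Gradio, a generator event handler streams: every `yield` pushes fresh
    # values to the output components, so the chatbot repaints as tokens arrive.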

    send_event.then(
        fn=bot_response,
        inputs=[
            chat_history,
            system_msg,
            max_tokens_slider,
            temperature_slider,
            top_p_slider,
            freq_penalty_slider,
            seed_slider,
            model_radio,
            custom_model_box
        ],
        outputs=[chatbot, chat_history]
    )

    # Clear Chat resets the stored history, the input box, and the visible chat
    def clear_chat():
        return [], "", []

    clear_button.click(
        fn=clear_chat,
        inputs=[],
        outputs=[chat_history, user_input, chatbot]
    )

# Launch the application
if __name__ == "__main__":
    print("Launching the Serverless-TextGen-Hub with Featured Models & Custom Model override.")
    demo.launch()
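
# To run locally (assuming this file is saved as app.py and HF_TOKEN holds a
# valid Hugging Face access token):
#   export HF_TOKEN=hf_xxxxx
#   python app.py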