Spaces:

Nymbo
/

Serverless-TextGen-Hub

Running

App Files Files Community

Nymbo committed on Jan 4, 2025

Commit

cf508a7

verified ·

1 Parent(s): fde397b

Update app.py

Browse files

Files changed (1) hide show

app.py +112 -174

app.py CHANGED Viewed

@@ -1,231 +1,169 @@
 import gradio as gr
 from openai import OpenAI
 import os
-import time
 # Retrieve the access token from the environment variable
 ACCESS_TOKEN = os.getenv("HF_TOKEN")
-print("Access token loaded.")
-# Initialize the OpenAI client with the Hugging Face Inference API endpoint
 client = OpenAI(
     base_url="https://api-inference.huggingface.co/v1/",
     api_key=ACCESS_TOKEN,
 )
-print("OpenAI client initialized.")
 def respond(
     message,
-    history: list[tuple[str, str]],
     system_message,
     max_tokens,
     temperature,
     top_p,
     frequency_penalty,
-    seed,
-    model_filter,
-    model,
-    custom_model
 ):
-    """
-    This function handles the chatbot response. It takes in:
-    - message: the user's new message
-    - history: the list of previous messages, each as a tuple (user_msg, assistant_msg)
-    - system_message: the system prompt
-    - max_tokens: the maximum number of tokens to generate in the response
-    - temperature: sampling temperature
-    - top_p: top-p (nucleus) sampling
-    - frequency_penalty: penalize repeated tokens in the output
-    - seed: a fixed seed for reproducibility; -1 will mean 'random'
-    - model_filter: search term to filter available models
-    - model: the selected model from the radio choices
-    - custom_model: manually entered HF model path
-    """
     print(f"Received message: {message}")
     print(f"History: {history}")
-    print(f"System message: {system_message}")
-    print(f"Max tokens: {max_tokens}, Temperature: {temperature}, Top-P: {top_p}")
     print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
-    print(f"Model Filter: {model_filter}, Selected Model: {model}, Custom Model: {custom_model}")
-    # Convert seed to None if -1 (meaning random)
     if seed == -1:
         seed = None
-    # Construct the messages array required by the API
     messages = [{"role": "system", "content": system_message}]
     # Add conversation history to the context
-    for val in history:
-        user_part = val[0]
-        assistant_part = val[1]
-        if user_part:
-            messages.append({"role": "user", "content": user_part})
-            print(f"Added user message to context: {user_part}")
-        if assistant_part:
-            messages.append({"role": "assistant", "content": assistant_part})
-            print(f"Added assistant message to context: {assistant_part}")
-    # Append the latest user message
     messages.append({"role": "user", "content": message})
-    # Determine the model to use
-    # Set the API URL based on the selected model or custom model
-    if custom_model.strip() != "":
-        api_model = custom_model.strip()
-    else:
-        if model == "Llama-3-70B-Instruct":
-            api_model = "meta-llama/Llama-3.3-70B-Instruct"
-        elif model == "Mistral-7B-Instruct-v0.2":
-            api_model = "mistralai/Mistral-7B-Instruct-v0.2"
-        elif model == "OpenHermes-2.5-Mistral-7B":
-            api_model = "teknium/OpenHermes-2.5-Mistral-7B"
-        elif model == "Phi-2":
-            api_model = "microsoft/Phi-2"
-        else:
-            api_model = "meta-llama/Llama-3.3-70B-Instruct"
-        print(f"Using model: {api_model}")
-    # Start with an empty string to build the response as tokens stream in
     response = ""
-    print(f"Sending request to OpenAI API, using model {api_model}.")
-    # Make the streaming request to the HF Inference API via openai-like client
-    for message_chunk in client.chat.completions.create(
-        model=api_model,
         max_tokens=max_tokens,
-        stream=True,  # Stream the response
         temperature=temperature,
         top_p=top_p,
         frequency_penalty=frequency_penalty,
         seed=seed,
-        messages=messages,
     ):
         # Extract the token text from the response chunk
-        token_text = message_chunk.choices[0].delta.content
-        print(f"Received token: {token_text}")
-        # Check if token_text is None before appending
-        if token_text is not None:
-            response += token_text
-            yield response
-    print("Completed response generation.")
-# Placeholder list of models for the accordion
-models_list = [
-    "Llama-3-70B-Instruct",
-    "Mistral-7B-Instruct-v0.2",
-    "OpenHermes-2.5-Mistral-7B",
-    "Phi-2",
-]
-# Create a Chatbot component with a specified height
 chatbot = gr.Chatbot(height=600)
-print("Chatbot interface created.")
-# Create the Gradio ChatInterface
 demo = gr.ChatInterface(
-    respond,
-    additional_inputs=[
-        gr.Textbox(value="", label="System message"),
-        gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P"),
-        gr.Slider(
-            minimum=-2.0,
-            maximum=2.0,
-            value=0.0,
-            step=0.1,
-            label="Frequency Penalty"
-        ),
-        gr.Slider(
-            minimum=-1,
-            maximum=65535,
-            value=-1,
-            step=1,
-            label="Seed (-1 for random)"
-        ),
-        gr.Textbox(label="Filter Featured Models", placeholder="Search...", lines=1),
-        gr.Radio(label="Select a Featured Model", choices=models_list, value="Llama-3-70B-Instruct"),
-        gr.Textbox(label="Custom Model", placeholder="Enter Hugging Face model path", lines=1),
-    ],
-    additional_inputs_accordion=gr.Accordion("Advanced Parameters", open=False),
-    fill_height=True,
     chatbot=chatbot,
     theme="Nymbo/Nymbo_Theme",
 )
-# Add the "Information" tab to the demo
-with gr.Tab("Information", parent=demo):
-    with gr.Accordion("Featured Models", open=True):
-        gr.HTML(
             """
-        <table style="width:100%; text-align:center; margin:auto;">
-            <tr>
-                <th>Model Name</th>
-                <th>Provider</th>
-                <th>Notes</th>
-            </tr>
-            <tr>
-                <td>Llama-3-70B-Instruct</td>
-                <td>Meta</td>
-                <td>Powerful large language model.</td>
-            </tr>
-            <tr>
-                <td>Mistral-7B-Instruct-v0.2</td>
-                <td>Mistral AI</td>
-                <td>Efficient and versatile model.</td>
-            </tr>
-            <tr>
-                <td>OpenHermes-2.5-Mistral-7B</td>
-                <td>Teknium</td>
-                <td>Community-driven, fine-tuned model.</td>
-            </tr>
-            <tr>
-                <td>Phi-2</td>
-                <td>Microsoft</td>
-                <td>Compact yet powerful model.</td>
-            </tr>
-        </table>
-        """
         )
     with gr.Accordion("Parameters Overview", open=False):
         gr.Markdown(
-        """
-        ## System Message
-        ###### The system message sets the behavior and persona of the chatbot. It's a way to provide context and instructions to the AI. For example, you can tell it to act as a helpful assistant, a storyteller, or any other role.
-        ## Max New Tokens
-        ###### This setting limits the length of the response generated by the AI. A higher number allows for longer, more detailed responses, while a lower number keeps the responses concise.
-        ## Temperature
-        ###### Temperature controls the randomness of the AI's output. A higher temperature makes the responses more creative and varied, while a lower temperature makes them more predictable and focused.
-        ## Top-P (Nucleus Sampling)
-        ###### Top-P sampling is a way to control the diversity of the AI's responses. It sets a threshold for the cumulative probability of the most likely next words. The AI then randomly selects from the words whose probabilities add up to this threshold. A lower Top-P value means less diversity.
-        ## Frequency Penalty
-        ###### Frequency penalty discourages the AI from repeating the same words or phrases too often in its responses. A higher penalty means the AI is less likely to repeat itself.
-        ## Seed
-        ###### The seed is a starting point for the random number generator that influences the AI's responses. If you set a specific seed, you'll get the same response every time you use that seed with the same prompt and settings. If you set it to -1, the AI will generate a new seed each time, leading to different responses.
-        ## Featured Models
-        ###### This section lists pre-selected models that are known to perform well. You can filter the list by typing in the search box.
-        ## Custom Model
-        ###### If you want to use a model that's not in the featured list, you can enter its Hugging Face model path here.
-        ### Feel free to experiment with these settings to see how they affect the AI's responses. Happy chatting!
-        """
-        )
-# Filter models function
-def filter_models(search_term, model_radio):
-    filtered_models = [m for m in models_list if search_term.lower() in m.lower()]
-    if not filtered_models:
-        filtered_models = ["No matching models"]  # Provide feedback
-    return gr.Radio.update(choices=filtered_models)
-# Update model list when search box is used
-demo.additional_inputs[6].change(filter_models, inputs=[demo.additional_inputs[6], demo.additional_inputs[7]], outputs=demo.additional_inputs[7])
-print("Gradio interface initialized.")
-if __name__ == "__main__":
-    print("Launching the demo application.")
-    demo.queue().launch()

 import gradio as gr
 from openai import OpenAI
 import os
 # Retrieve the access token from the environment variable
 ACCESS_TOKEN = os.getenv("HF_TOKEN")
+# Initialize the OpenAI API client
 client = OpenAI(
     base_url="https://api-inference.huggingface.co/v1/",
     api_key=ACCESS_TOKEN,
 )
 def respond(
     message,
+    history,
     system_message,
     max_tokens,
     temperature,
     top_p,
     frequency_penalty,
+    seed
 ):
+    # Process the incoming message
     print(f"Received message: {message}")
     print(f"History: {history}")
+    print(f"System Message: {system_message}")
+    print(f"Max Tokens: {max_tokens}, Temperature: {temperature}, Top P: {top_p}")
     print(f"Frequency Penalty: {frequency_penalty}, Seed: {seed}")
+    # Convert seed to None if -1 (random)
     if seed == -1:
         seed = None
+    # Construct the messages list for the API
     messages = [{"role": "system", "content": system_message}]
     # Add conversation history to the context
+    for user_message, assistant_message in history:
+        if user_message:
+            messages.append({"role": "user", "content": user_message})
+            print(f"Added user message: {user_message}")
+        if assistant_message:
+            messages.append({"role": "assistant", "content": assistant_message})
+            print(f"Added assistant message: {assistant_message}")
+    # Append the latest message
     messages.append({"role": "user", "content": message})
+    # Initialize response
     response = ""
+    # Make the API request
+    for chunk in client.chat.completions.create(
+        model="meta-llama/Llama-3.3-70B-Instruct",
+        messages=messages,
         max_tokens=max_tokens,
         temperature=temperature,
         top_p=top_p,
         frequency_penalty=frequency_penalty,
         seed=seed,
+        stream=True,
     ):
         # Extract the token text from the response chunk
+        token = chunk.choices[0].message.content
+        response += token
+        yield response
+# Create the Gradio Chatbot component
 chatbot = gr.Chatbot(height=600)
+# Define the Gradio ChatInterface
 demo = gr.ChatInterface(
     chatbot=chatbot,
+    fn=respond,
+    inputs=[
+        gr.Textbox(lines=1, placeholder="Enter your message..."),
+        gr.Chatbot(label="Conversation History"),
+        gr.Textbox(label="System Message"),
+        gr.Slider(minimum=10, maximum=200, step=1, label="Max Tokens"),
+        gr.Slider(minimum=0, maximum=2, step=0.1, label="Temperature"),
+        gr.Slider(minimum=0, maximum=1, step=0.05, label="Top P"),
+        gr.Slider(minimum=-2, maximum=2, step=0.1, label="Frequency Penalty"),
+        gr.Slider(minimum=-1, maximum=1000000, step=1, label="Seed (-1 for random)"),
+    ],
     theme="Nymbo/Nymbo_Theme",
 )
+# Create the "Featured Models" accordion
+with gr.Accordion("Featured Models", open=True) as featured_models:
+    # Textbox for searching models
+    model_search = gr.Textbox(label="Filter Models")
+    # List of featured models
+    models = [
+        "meta-llama/Llama-3.3-70B-Instruct",
+        "meta-llama/Llama-2-70B-Chat-hf",
+        "TheBloke/Llama-2-13B-Chat-GGML",
+        "TheBloke/Llama-2-70B-Chat-GGML",
+        "TheBloke/Llama-2-13B-Chat-GGML-v2",
+        "TheBloke/Llama-2-70B-Chat-GGML-v2",
+        "TheBloke/Llama-2-70B-Chat-HF-API-compatible-GGML",
+        "TheBloke/Llama-2-70b-chat-hf",
+        "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
+        "TheBloke/Llama-2-13B-Chat-GGML-v2-32K",
+        "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
+        "TheBloke/Llama-2-13B-Chat-GGML-v2-32K",
+        "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
+        "TheBloke/Llama-7-13B-Chat-GGML-v2-32K",
+        "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
+        "TheBloke/Llama-2-13B-Chat-GGML-v2-32K",
+        "TheBloke/Llama-2-70B-Chat-GGML-v2-32K",
+        # Add more models as needed...
+    ]
+    # Radio buttons for selecting a model
+    model_radio = gr.Radio(choices=models, label="Select a Model")
+    # Update the model list based on search input
+    def filter_models(search_term):
+        filtered_models = [model for model in models if search_term.lower() in model.lower()]
+        return gr.update(choices=filtered_models)
+    # Update the model list when the search box is used
+    model_search.change(filter_models, inputs=model_search, outputs=model_radio)
+# Create a "Custom Model" textbox
+custom_model = gr.Textbox(label="Custom Model", placeholder="Hugging Face model path")
+# Create the "Information" tab
+with gr.Tab("Information"):
+    # Featured Models accordion
+    with gr.Accordion("Featured Models", open=False):
+        gr.Markdown(
+            """
+            # Featured Models
+            Here's a list of some popular models available on Hugging Face:
+            - meta-llama/Llama-3.3-70B-Instruct
+            - meta-llama/Llama-2-70B-Chat-hf
+            - TheBloke/Llama-2-13B-Chat-GGML
+            - TheBloke/Llama-2-70B-Chat-GGML
+            - TheBloke/Llama-2-13B-Chat-GGML-v2
+            - TheBloke/Llama-2-70B-Chat-GGML-v2
+            - ... (and many more)
+            You can search and select a model from the list above, or use your own custom model path.
             """
         )
+    # Parameters Overview accordion
     with gr.Accordion("Parameters Overview", open=False):
         gr.Markdown(
+            """
+            # Parameters Overview
+            Here's a brief explanation of the parameters you can adjust:
+            - **Max Tokens**: The maximum number of tokens to generate in the response.
+            - **Temperature**: Controls the randomness of the output. Higher values make the output more random.
+            - **Top P**: Also known as nucleus sampling, it filters the least probable tokens, encouraging the model to be more creative.
+            - **Frequency Penalty**: Penalizes repeated tokens to avoid repetition.
+            - **Seed**: A fixed seed for reproducibility. Use -1 for a random seed.
+            Feel free to experiment with these settings to achieve the desired output.
+            """
+        )
+# Launch the Gradio interface
+demo.launch(share=True)