Spaces:

Nymbo
/

Serverless-TextGen-Hub

Running

App Files Files Community

Nymbo committed on Apr 30, 2025

Commit

1cee504

verified ·

1 Parent(s): 2d6eaa5

Update app.py

Browse files

Files changed (1) hide show

app.py +67 -88

app.py CHANGED Viewed

@@ -1,21 +1,14 @@
 import gradio as gr
-from openai import OpenAI
 import os
-import requests
 import json
 ACCESS_TOKEN = os.getenv("HF_TOKEN")
 print("Access token loaded.")
-# Initialize the OpenAI client for HF Inference
-hf_client = OpenAI(
-    base_url="https://api-inference.huggingface.co/v1/",
-    api_key=ACCESS_TOKEN,
-)
-print("HF Inference OpenAI client initialized.")
-# Cerebras API endpoint
-CEREBRAS_API_URL = "https://router.huggingface.co/cerebras/v1/chat/completions"
 def respond(
     message,
@@ -41,7 +34,7 @@ def respond(
     if seed == -1:
         seed = None
-    # Prepare messages for API
     messages = [{"role": "system", "content": system_message}]
     print("Initial messages array constructed.")
@@ -66,80 +59,45 @@ def respond(
     # Start with an empty string to build the response as tokens stream in
     response = ""
-    # Handle different providers
-    if provider == "hf-inference":
-        print("Using HF Inference API.")
-        # Use the OpenAI client for HF Inference
-        for message_chunk in hf_client.chat.completions.create(
             model=model_to_use,
-            max_tokens=max_tokens,
-            stream=True,
-            temperature=temperature,
-            top_p=top_p,
-            frequency_penalty=frequency_penalty,
-            seed=seed,
             messages=messages,
-        ):
-            token_text = message_chunk.choices[0].delta.content
-            if token_text is not None:  # Handle None values that might come in stream
-                print(f"Received token: {token_text}")
-                response += token_text
-                yield response
-    elif provider == "cerebras":
-        print("Using Cerebras API via HF Router.")
-        # Prepare headers and payload for the Cerebras API
-        headers = {
-            "Authorization": f"Bearer {ACCESS_TOKEN}",
-            "Content-Type": "application/json"
-        }
-        payload = {
-            "model": model_to_use,
-            "messages": messages,
-            "max_tokens": max_tokens,
-            "temperature": temperature,
-            "top_p": top_p,
-            "frequency_penalty": frequency_penalty,
-            "stream": True
-        }
-        if seed is not None:
-            payload["seed"] = seed
-        # Make the streaming request to Cerebras
-        with requests.post(
-            CEREBRAS_API_URL,
-            headers=headers,
-            json=payload,
-            stream=True
-        ) as req:
-            # Handle Server-Sent Events (SSE) format
-            for line in req.iter_lines():
-                if line:
-                    # Skip the "data: " prefix
-                    if line.startswith(b'data: '):
-                        line = line[6:]
-                    # Skip "[DONE]" message
-                    if line == b'[DONE]':
-                        continue
-                    try:
-                        # Parse the JSON chunk
-                        chunk = json.loads(line)
-                        token_text = chunk.get("choices", [{}])[0].get("delta", {}).get("content")
-                        if token_text:
-                            print(f"Received Cerebras token: {token_text}")
-                            response += token_text
-                            yield response
-                    except json.JSONDecodeError as e:
-                        print(f"Error decoding JSON: {e}, Line: {line}")
-                        continue
     print("Completed response generation.")
 # GRADIO UI
@@ -193,12 +151,22 @@ custom_model_box = gr.Textbox(
     placeholder="meta-llama/Llama-3.3-70B-Instruct"
 )
-# New provider selection radio
 provider_radio = gr.Radio(
-    choices=["hf-inference", "cerebras"],
     value="hf-inference",
     label="Inference Provider",
-    info="Select which inference provider to use"
 )
 def set_custom_model_from_radio(selected):
@@ -298,11 +266,22 @@ with demo:
     # Add new accordion for advanced settings including provider selection
     with gr.Accordion("Advanced Settings", open=False):
-        # The provider_radio is already defined above, we're just adding it to the UI here
         gr.Markdown("### Inference Provider")
-        gr.Markdown("Select which provider to use for inference. Default is Hugging Face Inference API.")
         # Provider radio is already included in the additional_inputs
-        gr.Markdown("Note: Different providers may support different models and parameters.")
 print("Gradio interface initialized.")

 import gradio as gr
+from huggingface_hub import InferenceClient
 import os
 import json
 ACCESS_TOKEN = os.getenv("HF_TOKEN")
 print("Access token loaded.")
+# Initialize the HF Inference Client
+client = InferenceClient(token=ACCESS_TOKEN)
+print("Hugging Face Inference Client initialized.")
 def respond(
     message,
     if seed == -1:
         seed = None
+    # Prepare messages in the format expected by the API
     messages = [{"role": "system", "content": system_message}]
     print("Initial messages array constructed.")
     # Start with an empty string to build the response as tokens stream in
     response = ""
+    print(f"Sending request to {provider} provider.")
+    # Prepare parameters for the chat completion request
+    parameters = {
+        "max_new_tokens": max_tokens,
+        "temperature": temperature,
+        "top_p": top_p,
+        "frequency_penalty": frequency_penalty,
+    }
+    if seed is not None:
+        parameters["seed"] = seed
+    # Use the InferenceClient for making the request with proper provider selection
+    try:
+        # Create a generator for the streaming response
+        stream = client.chat_completion(
             model=model_to_use,
             messages=messages,
+            stream=True,
+            provider=provider,  # Use the selected provider
+            **parameters  # Pass all other parameters
+        )
+        # Process the streaming response
+        for chunk in stream:
+            if hasattr(chunk, 'choices') and len(chunk.choices) > 0:
+                # Extract the content from the response
+                if hasattr(chunk.choices[0], 'delta') and hasattr(chunk.choices[0].delta, 'content'):
+                    token_text = chunk.choices[0].delta.content
+                    if token_text:
+                        print(f"Received token: {token_text}")
+                        response += token_text
+                        yield response
+    except Exception as e:
+        print(f"Error during inference: {e}")
+        response += f"\nError: {str(e)}"
+        yield response
     print("Completed response generation.")
 # GRADIO UI
     placeholder="meta-llama/Llama-3.3-70B-Instruct"
 )
+# Available providers as of April 2025
+providers_list = [
+    "hf-inference",  # Default Hugging Face Inference
+    "cerebras",      # Cerebras provider
+    "together",      # Together AI
+    "sambanova",     # SambaNova
+    "replicate",     # Replicate
+    "fal-ai"         # Fal.ai
+]
+# Provider selection radio
 provider_radio = gr.Radio(
+    choices=providers_list,
     value="hf-inference",
     label="Inference Provider",
+    info="Select which inference provider to use. Uses your Hugging Face PRO credits."
 )
 def set_custom_model_from_radio(selected):
     # Add new accordion for advanced settings including provider selection
     with gr.Accordion("Advanced Settings", open=False):
         gr.Markdown("### Inference Provider")
+        gr.Markdown("Select which provider to use for inference. Uses your Hugging Face PRO credits.")
         # Provider radio is already included in the additional_inputs
+        gr.Markdown("""
+        ### Provider Information
+        - **hf-inference**: Default Hugging Face Inference API
+        - **cerebras**: Cerebras AI models via Hugging Face router
+        - **together**: Together AI models
+        - **sambanova**: SambaNova models
+        - **replicate**: Replicate models
+        - **fal-ai**: Fal.ai models
+        As a PRO user, you receive $2 of credits monthly across all providers.
+        """)
 print("Gradio interface initialized.")