kambris committed
Commit 06bd57c · verified · 1 Parent(s): 69798b5

Update app.py

Files changed (1)
  1. app.py +103 -43
app.py CHANGED
@@ -1,46 +1,75 @@
 import gradio as gr
-import csv
-from huggingface_hub import InferenceClient
+import requests
 import os
 from datetime import datetime
 import pandas as pd
+import time

-# Initialize the Hugging Face Inference Client
+# Initialize with your token
 HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
-client = InferenceClient(token=HF_TOKEN)

-# Define the four models to use
+# Use models that work with the free Serverless Inference API
 MODELS = [
     "mistralai/Mistral-7B-Instruct-v0.2",
-    "HuggingFaceH4/zephyr-7b-beta",
-    "microsoft/Phi-3-mini-4k-instruct",
-    "google/flan-t5-xxl"
+    "google/flan-t5-xxl",
+    "microsoft/DialoGPT-large",
+    "bigscience/bloom-560m"
 ]

-def get_llm_response(model_name, prompt, max_tokens=500, temperature=0.7):
+def query_model(model_id, prompt, max_tokens=500, temperature=0.7):
     """
-    Get response from a specific LLM model.
-    Each call is independent with no conversation history.
+    Query a model using the direct Inference API endpoint
     """
+    API_URL = f"https://api-inference.huggingface.co/models/{model_id}"
+    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
+
+    payload = {
+        "inputs": prompt,
+        "parameters": {
+            "max_new_tokens": max_tokens,
+            "temperature": temperature,
+            "return_full_text": False
+        }
+    }
+
     try:
-        # Create a fresh client for each request to ensure no state persistence
-        fresh_client = InferenceClient(token=HF_TOKEN)
+        response = requests.post(API_URL, headers=headers, json=payload)

-        # Only send the current prompt - no conversation history
-        response = fresh_client.chat_completion(
-            model=model_name,
-            messages=[{"role": "user", "content": prompt}],
-            max_tokens=max_tokens,
-            temperature=temperature
-        )
-        return response.choices[0].message.content
+        # Handle model loading (503 error)
+        if response.status_code == 503:
+            result = response.json()
+            if "estimated_time" in result:
+                wait_time = result["estimated_time"]
+                return f"Model is loading... estimated wait: {wait_time}s. Please try again."
+            return "Model is currently loading. Please try again in a moment."
+
+        if response.status_code == 200:
+            result = response.json()
+
+            # Handle different response formats
+            if isinstance(result, list) and len(result) > 0:
+                if "generated_text" in result[0]:
+                    return result[0]["generated_text"]
+                elif "translation_text" in result[0]:
+                    return result[0]["translation_text"]
+                else:
+                    return str(result[0])
+            elif isinstance(result, dict):
+                if "generated_text" in result:
+                    return result["generated_text"]
+                else:
+                    return str(result)
+            else:
+                return str(result)
+        else:
+            return f"Error {response.status_code}: {response.text}"
+
     except Exception as e:
-        return f"Error: {str(e)}"
+        return f"Exception: {str(e)}"

-def collect_responses(prompt_text, max_tokens=500, temperature=0.7):
+def collect_responses(prompt_text, max_tokens=500, temperature=0.7, retry_loading=True):
     """
-    Collect responses from all four models for a given prompt
-    and return as a dataframe and CSV file.
+    Collect responses from all models for a given prompt.
     Each model gets a fresh, independent query with no history.
     """
     results = []
@@ -50,7 +79,14 @@ def collect_responses(prompt_text, max_tokens=500, temperature=0.7):
         status_updates.append(f"⏳ Querying {model}...")
         yield "\n".join(status_updates), None, None

-        response = get_llm_response(model, prompt_text, max_tokens, temperature)
+        response = query_model(model, prompt_text, max_tokens, temperature)
+
+        # If model is loading and retry is enabled, wait and try again
+        if retry_loading and "loading" in response.lower():
+            status_updates[-1] = f"⏳ {model} is loading, waiting 20s..."
+            yield "\n".join(status_updates), None, None
+            time.sleep(20)
+            response = query_model(model, prompt_text, max_tokens, temperature)

         result = {
             'timestamp': datetime.now().isoformat(),
@@ -74,7 +110,7 @@

     yield "\n".join(status_updates), df, csv_filename

-def batch_collect_responses(prompts_text, max_tokens=500, temperature=0.7):
+def batch_collect_responses(prompts_text, max_tokens=500, temperature=0.7, retry_loading=True):
     """
     Collect responses for multiple prompts (one per line).
     Each prompt is processed independently with no conversation history.
@@ -95,7 +131,14 @@ def batch_collect_responses(prompts_text, max_tokens=500, temperature=0.7):
            status_updates.append(f" ⏳ Querying {model}...")
            yield "\n".join(status_updates), None, None

-           response = get_llm_response(model, prompt, max_tokens, temperature)
+           response = query_model(model, prompt, max_tokens, temperature)
+
+           # If model is loading and retry is enabled, wait and try again
+           if retry_loading and "loading" in response.lower():
+               status_updates[-1] = f" ⏳ {model} is loading, waiting 20s..."
+               yield "\n".join(status_updates), None, None
+               time.sleep(20)
+               response = query_model(model, prompt, max_tokens, temperature)

            result = {
                'timestamp': datetime.now().isoformat(),
@@ -122,16 +165,19 @@
 # Create Gradio interface
 with gr.Blocks(title="Multi-LLM Response Collector") as demo:
     gr.Markdown("""
-    # 🤖 Multi-LLM Response Collector
+    # 🤖 Multi-LLM Response Collector (Free Tier)

     Collect and compare **one-shot** responses from four different LLMs:
-    - Meta Llama 3.2 3B
-    - Mistral 7B
-    - Google Gemma 2 2B
-    - Qwen 2.5 7B
+    - Mistral 7B Instruct v0.2
+    - Google Flan-T5 XXL
+    - Microsoft DialoGPT Large
+    - BigScience BLOOM 560M

-    **Important:** Each query is independent with no conversation history.
-    Every prompt gets a fresh response with zero context from previous queries.
+    **Important:**
+    - Each query is independent with no conversation history
+    - Uses Hugging Face's free Serverless Inference API
+    - Models may take 20+ seconds to load on first request
+    - Free tier has rate limits (~100 requests/hour)

     Responses are saved to a CSV file for easy analysis.
     """)
@@ -145,9 +191,9 @@ with gr.Blocks(title="Multi-LLM Response Collector") as demo:
         lines=3
     )
     max_tokens_single = gr.Slider(
-        minimum=100,
-        maximum=1000,
-        value=500,
+        minimum=50,
+        maximum=500,
+        value=200,
         step=50,
         label="Max Tokens"
     )
@@ -158,6 +204,10 @@ with gr.Blocks(title="Multi-LLM Response Collector") as demo:
         step=0.1,
         label="Temperature (creativity)"
     )
+    retry_single = gr.Checkbox(
+        label="Auto-retry if model is loading",
+        value=True
+    )
     submit_btn = gr.Button("Collect Responses", variant="primary")

     status_output = gr.Textbox(label="Status", lines=6)
@@ -169,7 +219,7 @@ with gr.Blocks(title="Multi-LLM Response Collector") as demo:

     submit_btn.click(
         fn=collect_responses,
-        inputs=[prompt_input, max_tokens_single, temperature_single],
+        inputs=[prompt_input, max_tokens_single, temperature_single, retry_single],
         outputs=[status_output, df_output, csv_output]
     )

@@ -182,9 +232,9 @@ with gr.Blocks(title="Multi-LLM Response Collector") as demo:
         lines=5
     )
     max_tokens_batch = gr.Slider(
-        minimum=100,
-        maximum=1000,
-        value=500,
+        minimum=50,
+        maximum=500,
+        value=200,
         step=50,
         label="Max Tokens"
     )
@@ -195,6 +245,10 @@ with gr.Blocks(title="Multi-LLM Response Collector") as demo:
         step=0.1,
         label="Temperature (creativity)"
     )
+    retry_batch = gr.Checkbox(
+        label="Auto-retry if model is loading",
+        value=True
+    )
     batch_btn = gr.Button("Collect Batch Responses", variant="primary")

     batch_status = gr.Textbox(label="Status", lines=10)
@@ -206,7 +260,7 @@ with gr.Blocks(title="Multi-LLM Response Collector") as demo:

     batch_btn.click(
         fn=batch_collect_responses,
-        inputs=[batch_input, max_tokens_batch, temperature_batch],
+        inputs=[batch_input, max_tokens_batch, temperature_batch, retry_batch],
         outputs=[batch_status, batch_df, batch_csv]
     )

@@ -218,6 +272,12 @@ with gr.Blocks(title="Multi-LLM Response Collector") as demo:
     - `prompt`: The input prompt
     - `model`: Which model generated the response
     - `response`: The model's response
+
+    ### ⚠️ Free Tier Limitations
+    - Rate limit: ~100 requests/hour
+    - Models may take 20+ seconds to load on first use
+    - Some large models may not be available
+    - For production use, consider Hugging Face Pro ($9/month)
     """)

  if __name__ == "__main__":
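
For reference, below is a minimal standalone sketch (not part of this commit) of the request path the updated app.py uses: one independent, history-free query per model against the free Serverless Inference API, a single fixed 20-second retry when the endpoint returns 503 because the model is still loading, and results written to a CSV with the columns the app documents (timestamp, prompt, model, response). The helper name query_once, the example prompt, the output filename llm_responses.csv, and the request timeout are illustrative assumptions; model availability and rate limits on the free tier may differ.

# Standalone sketch; assumes HUGGINGFACE_TOKEN is set, the listed models are
# still served by the free Serverless Inference API, and a 503 status means
# the model is still loading.
import csv
import os
import time
from datetime import datetime

import requests

HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.2",
    "google/flan-t5-xxl",
    "microsoft/DialoGPT-large",
    "bigscience/bloom-560m",
]

def query_once(model_id, prompt, max_tokens=200, temperature=0.7):
    """Send one history-free prompt to one model; return generated text or an error string."""
    url = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_tokens,
            "temperature": temperature,
            "return_full_text": False,
        },
    }
    response = requests.post(url, headers=headers, json=payload, timeout=120)
    if response.status_code == 503:
        # Same fixed wait the app uses when a model is still loading, then one retry.
        time.sleep(20)
        response = requests.post(url, headers=headers, json=payload, timeout=120)
    if response.status_code != 200:
        return f"Error {response.status_code}: {response.text}"
    result = response.json()
    if isinstance(result, list) and result and "generated_text" in result[0]:
        return result[0]["generated_text"]
    return str(result)

if __name__ == "__main__":
    prompt = "Explain what a one-shot prompt is in one sentence."  # illustrative prompt
    rows = [
        {
            "timestamp": datetime.now().isoformat(),
            "prompt": prompt,
            "model": model,
            "response": query_once(model, prompt),
        }
        for model in MODELS
    ]
    # Same column layout the app's CSV output documents.
    with open("llm_responses.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["timestamp", "prompt", "model", "response"])
        writer.writeheader()
        writer.writerows(rows)

Reading the CSV back with pandas.read_csv and pivoting on the model column gives a side-by-side comparison of the four one-shot responses.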