kambris committed
Commit 06bd57c · verified · 1 Parent(s): 69798b5

Update app.py

Files changed (1)
  1. app.py +103 -43
app.py CHANGED
@@ -1,46 +1,75 @@
 import gradio as gr
-import csv
-from huggingface_hub import InferenceClient
+import requests
 import os
 from datetime import datetime
 import pandas as pd
+import time

-# Initialize the Hugging Face Inference Client
+# Initialize with your token
 HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
-client = InferenceClient(token=HF_TOKEN)

-# Define the four models to use
+# Use models that work with the free Serverless Inference API
 MODELS = [
     "mistralai/Mistral-7B-Instruct-v0.2",
-    "HuggingFaceH4/zephyr-7b-beta",
-    "microsoft/Phi-3-mini-4k-instruct",
-    "google/flan-t5-xxl"
+    "google/flan-t5-xxl",
+    "microsoft/DialoGPT-large",
+    "bigscience/bloom-560m"
 ]

-def get_llm_response(model_name, prompt, max_tokens=500, temperature=0.7):
+def query_model(model_id, prompt, max_tokens=500, temperature=0.7):
     """
-    Get response from a specific LLM model.
-    Each call is independent with no conversation history.
+    Query a model using the direct Inference API endpoint
     """
+    API_URL = f"https://api-inference.huggingface.co/models/{model_id}"
+    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
+
+    payload = {
+        "inputs": prompt,
+        "parameters": {
+            "max_new_tokens": max_tokens,
+            "temperature": temperature,
+            "return_full_text": False
+        }
+    }
+
     try:
-        # Create a fresh client for each request to ensure no state persistence
-        fresh_client = InferenceClient(token=HF_TOKEN)
+        response = requests.post(API_URL, headers=headers, json=payload)

-        # Only send the current prompt - no conversation history
-        response = fresh_client.chat_completion(
-            model=model_name,
-            messages=[{"role": "user", "content": prompt}],
-            max_tokens=max_tokens,
-            temperature=temperature
-        )
-        return response.choices[0].message.content
+        # Handle model loading (503 error)
+        if response.status_code == 503:
+            result = response.json()
+            if "estimated_time" in result:
+                wait_time = result["estimated_time"]
+                return f"Model is loading... estimated wait: {wait_time}s. Please try again."
+            return "Model is currently loading. Please try again in a moment."
+
+        if response.status_code == 200:
+            result = response.json()
+
+            # Handle different response formats
+            if isinstance(result, list) and len(result) > 0:
+                if "generated_text" in result[0]:
+                    return result[0]["generated_text"]
+                elif "translation_text" in result[0]:
+                    return result[0]["translation_text"]
+                else:
+                    return str(result[0])
+            elif isinstance(result, dict):
+                if "generated_text" in result:
+                    return result["generated_text"]
+                else:
+                    return str(result)
+            else:
+                return str(result)
+        else:
+            return f"Error {response.status_code}: {response.text}"
+
     except Exception as e:
-        return f"Error: {str(e)}"
+        return f"Exception: {str(e)}"

-def collect_responses(prompt_text, max_tokens=500, temperature=0.7):
+def collect_responses(prompt_text, max_tokens=500, temperature=0.7, retry_loading=True):
     """
-    Collect responses from all four models for a given prompt
-    and return as a dataframe and CSV file.
+    Collect responses from all models for a given prompt.
     Each model gets a fresh, independent query with no history.
     """
     results = []
@@ -50,7 +79,14 @@ def collect_responses(prompt_text, max_tokens=500, temperature=0.7):
         status_updates.append(f"⏳ Querying {model}...")
         yield "\n".join(status_updates), None, None

-        response = get_llm_response(model, prompt_text, max_tokens, temperature)
+        response = query_model(model, prompt_text, max_tokens, temperature)
+
+        # If model is loading and retry is enabled, wait and try again
+        if retry_loading and "loading" in response.lower():
+            status_updates[-1] = f"⏳ {model} is loading, waiting 20s..."
+            yield "\n".join(status_updates), None, None
+            time.sleep(20)
+            response = query_model(model, prompt_text, max_tokens, temperature)

         result = {
             'timestamp': datetime.now().isoformat(),
@@ -74,7 +110,7 @@

     yield "\n".join(status_updates), df, csv_filename

-def batch_collect_responses(prompts_text, max_tokens=500, temperature=0.7):
+def batch_collect_responses(prompts_text, max_tokens=500, temperature=0.7, retry_loading=True):
     """
     Collect responses for multiple prompts (one per line).
     Each prompt is processed independently with no conversation history.
@@ -95,7 +131,14 @@ def batch_collect_responses(prompts_text, max_tokens=500, temperature=0.7):
            status_updates.append(f" ⏳ Querying {model}...")
            yield "\n".join(status_updates), None, None

-           response = get_llm_response(model, prompt, max_tokens, temperature)
+           response = query_model(model, prompt, max_tokens, temperature)
+
+           # If model is loading and retry is enabled, wait and try again
+           if retry_loading and "loading" in response.lower():
+               status_updates[-1] = f" ⏳ {model} is loading, waiting 20s..."
+               yield "\n".join(status_updates), None, None
+               time.sleep(20)
+               response = query_model(model, prompt, max_tokens, temperature)

            result = {
                'timestamp': datetime.now().isoformat(),
@@ -122,16 +165,19 @@
 # Create Gradio interface
 with gr.Blocks(title="Multi-LLM Response Collector") as demo:
     gr.Markdown("""
-    # 🤖 Multi-LLM Response Collector
+    # 🤖 Multi-LLM Response Collector (Free Tier)

     Collect and compare **one-shot** responses from four different LLMs:
-    - Meta Llama 3.2 3B
-    - Mistral 7B
-    - Google Gemma 2 2B
-    - Qwen 2.5 7B
+    - Mistral 7B Instruct v0.2
+    - Google Flan-T5 XXL
+    - Microsoft DialoGPT Large
+    - BigScience BLOOM 560M

-    **Important:** Each query is independent with no conversation history.
-    Every prompt gets a fresh response with zero context from previous queries.
+    **Important:**
+    - Each query is independent with no conversation history
+    - Uses Hugging Face's free Serverless Inference API
+    - Models may take 20+ seconds to load on first request
+    - Free tier has rate limits (~100 requests/hour)

     Responses are saved to a CSV file for easy analysis.
     """)
@@ -145,9 +191,9 @@ with gr.Blocks(title="Multi-LLM Response Collector") as demo:
         lines=3
     )
     max_tokens_single = gr.Slider(
-        minimum=100,
-        maximum=1000,
-        value=500,
+        minimum=50,
+        maximum=500,
+        value=200,
         step=50,
         label="Max Tokens"
     )
@@ -158,6 +204,10 @@ with gr.Blocks(title="Multi-LLM Response Collector") as demo:
         step=0.1,
         label="Temperature (creativity)"
     )
+    retry_single = gr.Checkbox(
+        label="Auto-retry if model is loading",
+        value=True
+    )
     submit_btn = gr.Button("Collect Responses", variant="primary")

     status_output = gr.Textbox(label="Status", lines=6)
@@ -169,7 +219,7 @@ with gr.Blocks(title="Multi-LLM Response Collector") as demo:

     submit_btn.click(
         fn=collect_responses,
-        inputs=[prompt_input, max_tokens_single, temperature_single],
+        inputs=[prompt_input, max_tokens_single, temperature_single, retry_single],
         outputs=[status_output, df_output, csv_output]
     )

@@ -182,9 +232,9 @@ with gr.Blocks(title="Multi-LLM Response Collector") as demo:
         lines=5
     )
     max_tokens_batch = gr.Slider(
-        minimum=100,
-        maximum=1000,
-        value=500,
+        minimum=50,
+        maximum=500,
+        value=200,
         step=50,
         label="Max Tokens"
     )
@@ -195,6 +245,10 @@ with gr.Blocks(title="Multi-LLM Response Collector") as demo:
         step=0.1,
         label="Temperature (creativity)"
     )
+    retry_batch = gr.Checkbox(
+        label="Auto-retry if model is loading",
+        value=True
+    )
     batch_btn = gr.Button("Collect Batch Responses", variant="primary")

     batch_status = gr.Textbox(label="Status", lines=10)
@@ -206,7 +260,7 @@ with gr.Blocks(title="Multi-LLM Response Collector") as demo:

     batch_btn.click(
         fn=batch_collect_responses,
-        inputs=[batch_input, max_tokens_batch, temperature_batch],
+        inputs=[batch_input, max_tokens_batch, temperature_batch, retry_batch],
         outputs=[batch_status, batch_df, batch_csv]
     )

@@ -218,6 +272,12 @@ with gr.Blocks(title="Multi-LLM Response Collector") as demo:
     - `prompt`: The input prompt
     - `model`: Which model generated the response
     - `response`: The model's response
+
+    ### ⚠️ Free Tier Limitations
+    - Rate limit: ~100 requests/hour
+    - Models may take 20+ seconds to load on first use
+    - Some large models may not be available
+    - For production use, consider Hugging Face Pro ($9/month)
     """)

  if __name__ == "__main__":
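
For reference, below is a minimal standalone sketch (not part of this commit) of the request path the updated app.py uses: one independent, history-free query per model against the free Serverless Inference API, a single fixed 20-second retry when the endpoint returns 503 because the model is still loading, and results written to a CSV with the columns the app documents (timestamp, prompt, model, response). The helper name query_once, the example prompt, the output filename llm_responses.csv, and the request timeout are illustrative assumptions; model availability and rate limits on the free tier may differ.

# Standalone sketch; assumes HUGGINGFACE_TOKEN is set, the listed models are
# still served by the free Serverless Inference API, and a 503 status means
# the model is still loading.
import csv
import os
import time
from datetime import datetime

import requests

HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
MODELS = [
    "mistralai/Mistral-7B-Instruct-v0.2",
    "google/flan-t5-xxl",
    "microsoft/DialoGPT-large",
    "bigscience/bloom-560m",
]

def query_once(model_id, prompt, max_tokens=200, temperature=0.7):
    """Send one history-free prompt to one model; return generated text or an error string."""
    url = f"https://api-inference.huggingface.co/models/{model_id}"
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    payload = {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens": max_tokens,
            "temperature": temperature,
            "return_full_text": False,
        },
    }
    response = requests.post(url, headers=headers, json=payload, timeout=120)
    if response.status_code == 503:
        # Same fixed wait the app uses when a model is still loading, then one retry.
        time.sleep(20)
        response = requests.post(url, headers=headers, json=payload, timeout=120)
    if response.status_code != 200:
        return f"Error {response.status_code}: {response.text}"
    result = response.json()
    if isinstance(result, list) and result and "generated_text" in result[0]:
        return result[0]["generated_text"]
    return str(result)

if __name__ == "__main__":
    prompt = "Explain what a one-shot prompt is in one sentence."  # illustrative prompt
    rows = [
        {
            "timestamp": datetime.now().isoformat(),
            "prompt": prompt,
            "model": model,
            "response": query_once(model, prompt),
        }
        for model in MODELS
    ]
    # Same column layout the app's CSV output documents.
    with open("llm_responses.csv", "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=["timestamp", "prompt", "model", "response"])
        writer.writeheader()
        writer.writerows(rows)

Reading the CSV back with pandas.read_csv and pivoting on the model column gives a side-by-side comparison of the four one-shot responses.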