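"""Gradio demo that sends a user prompt to a Hugging Face Inference API model
and applies a simple keyword/length heuristic to judge whether the model
understood the prompt."""
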
import os

import gradio as gr
from huggingface_hub import InferenceClient

# Default model, queried through the Hugging Face Inference API.
DEFAULT_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"

# Optional token: gated models (e.g. Llama-2) need one, and it raises rate limits.
HF_TOKEN = os.getenv("HF_API_TOKEN")

# Module-level cache; the client is recreated only when the requested model changes.
client = None


def get_inference_client(model_name):
    """Return a cached InferenceClient for `model_name`, creating it on demand."""
    global client
    if client is None or client.model != model_name:
        try:
            # Pass the token only if one was actually configured.
            client = InferenceClient(model=model_name, token=HF_TOKEN or None)
            print(
                f"InferenceClient initialized for {model_name}. "
                f"Token {'provided' if HF_TOKEN else 'not explicitly provided'}."
            )
        except Exception as e:
            print(f"Failed to initialize InferenceClient for {model_name}: {e}")
            return None
    return client


def evaluate_understanding(prompt, response):
    """
    Analyzes the model's response to give a basic evaluation of understanding.
    This is a simple heuristic, not a comprehensive NLU assessment.
    """
    if not response or response.strip() == "":
        return "❌ Not Understood (Empty or whitespace response)"

    response_lower = response.lower()

    # Stock refusal/confusion phrases that usually signal a misunderstood prompt.
    misunderstanding_keywords = [
        "i'm sorry", "i apologize", "i cannot", "i am unable", "unable to",
        "i don't understand", "could you please rephrase", "i'm not sure i follow",
        "that's not clear", "i do not have enough information", "as an ai language model, i don't",
        "i'm not programmed to", "i lack the ability to",
    ]
    for keyword in misunderstanding_keywords:
        if keyword in response_lower:
            return f"⚠️ Potentially Not Understood (Contains: '{keyword}')"

    # A long prompt answered in fewer than ten words is suspicious.
    if len(prompt.split()) > 7 and len(response.split()) < 10:
        return "⚠️ Potentially Not Understood (Response seems too short for the prompt)"

    # A response that mostly repeats a non-trivial prompt may just be echoing it.
    if prompt.lower() in response_lower and len(response_lower) < len(prompt.lower()) * 1.5:
        if len(prompt.split()) > 5:
            return "⚠️ Potentially Not Understood (Response might be echoing the prompt)"

    return "✔️ Likely Understood"
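
# Illustrative only: a refusal such as "I'm sorry, I cannot help with that" is
# flagged as "⚠️ Potentially Not Understood (Contains: 'i'm sorry')".

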
def query_model_and_evaluate(user_prompt, model_name_to_use):
    """
    Sends the prompt to the model, gets the response, and evaluates it.
    """
    if not user_prompt or user_prompt.strip() == "":
        return "Please enter a prompt.", "Evaluation N/A", model_name_to_use

    print(f"Querying model: {model_name_to_use}. HF_TOKEN {'is set' if HF_TOKEN else 'is NOT set/empty'}.")

    current_client = get_inference_client(model_name_to_use)
    if current_client is None:
        error_msg = (
            f"Error: Could not initialize the model API client for {model_name_to_use}. "
            "Check the logs. This may be because the model requires authentication "
            "(a token, or accepting its terms on Hugging Face) that was unavailable or unsuccessful."
        )
        return error_msg, "Evaluation N/A", model_name_to_use

    try:
        # Apply the instruction template expected by each model family;
        # fall back to the raw prompt for anything unrecognized.
        if "mistral" in model_name_to_use.lower() and "instruct" in model_name_to_use.lower():
            formatted_prompt = f"<s>[INST] {user_prompt.strip()} [/INST]"
        elif "llama-2" in model_name_to_use.lower() and "chat" in model_name_to_use.lower():
            formatted_prompt = (
                "[INST] <<SYS>>\nYou are a helpful assistant. Your goal is to understand "
                "the user's prompt and respond accurately and relevantly.\n"
                f"<</SYS>>\n\n{user_prompt.strip()} [/INST]"
            )
        else:
            formatted_prompt = user_prompt.strip()

        params = {
            "max_new_tokens": 300,
            "temperature": 0.6,
            "top_p": 0.9,
            "repetition_penalty": 1.1,
            "do_sample": True,
            "return_full_text": False,
        }

        model_response_text = current_client.text_generation(formatted_prompt, **params)
        if not model_response_text:
            model_response_text = ""

    except Exception as e:
        error_message = (
            f"Error calling model API for {model_name_to_use}: {e}. "
            "This can happen if the model is gated, requires a Hugging Face token, "
            "or if you need to accept its terms of use on the Hugging Face website."
        )
        print(error_message)
        return error_message, "Evaluation N/A", model_name_to_use

    understanding_evaluation = evaluate_understanding(user_prompt, model_response_text)
    return model_response_text, understanding_evaluation, model_name_to_use


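# Build the Gradio interface.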
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange")) as demo:
    gr.Markdown(
        f"""
# 🎯 Model Prompt Understanding Test
Enter a prompt for the selected language model. The application will send it to the model via Hugging Face's Inference API.
The model's response will be analyzed to provide a **basic heuristic assessment** of its understanding.

**Selected Model:** <span id='current-model-display'>{DEFAULT_MODEL_NAME}</span>
"""
    )

    # Tracks the model used for queries; currently fixed to the default model.
    current_model_name_state = gr.State(DEFAULT_MODEL_NAME)

    with gr.Row():
        user_input_prompt = gr.Textbox(
            label="✍️ Enter your Prompt:",
            placeholder="e.g., Explain the concept of zero-shot learning in 3 sentences.",
            lines=4,
            scale=3,
        )

    submit_button = gr.Button("🚀 Submit Prompt and Evaluate", variant="primary")

    gr.Markdown("---")
    gr.Markdown("### 🤖 Model Response & Evaluation")

    with gr.Row():
        with gr.Column(scale=2):
            model_output_response = gr.Textbox(
                label="📝 Model's Response:",
                lines=10,
                interactive=False,
                show_copy_button=True,
            )
        with gr.Column(scale=1):
            evaluation_output = gr.Textbox(
                label="🧠 Understanding Evaluation:",
                lines=2,
                interactive=False,
                show_copy_button=True,
            )
            displayed_model = gr.Textbox(
                label="⚙️ Model Used for this Response:",
                interactive=False,
                lines=1,
            )

    submit_button.click(
        fn=query_model_and_evaluate,
        inputs=[user_input_prompt, current_model_name_state],
        outputs=[model_output_response, evaluation_output, displayed_model],
    )

    gr.Markdown(
        """
---
**Disclaimer:**
* The 'Understanding Evaluation' is a very basic automated heuristic.
* **Using Models:** This app will attempt to connect to the selected model. Some models (especially gated ones like Llama-2) may require a Hugging Face account, acceptance of their terms of use on the Hugging Face website, and a valid `HF_TOKEN` associated with that account (even if it is not set as a secret in this Space). If a model call fails, these are the likely reasons.
* Response quality depends heavily on the chosen model and the clarity of your prompt.
"""
    )

    gr.Examples(
        examples=[
            ["Explain the difference between supervised and unsupervised machine learning.", DEFAULT_MODEL_NAME],
            ["Write a short poem about a curious robot.", DEFAULT_MODEL_NAME],
            ["What are the main challenges in developing AGI?", DEFAULT_MODEL_NAME],
            ["Summarize the plot of 'War and Peace' in one paragraph.", DEFAULT_MODEL_NAME],
            ["asdfjkl; qwerpoiu", DEFAULT_MODEL_NAME],
        ],
        inputs=[user_input_prompt, current_model_name_state],
        outputs=[model_output_response, evaluation_output, displayed_model],
        fn=query_model_and_evaluate,
        cache_examples=False,
        label="💡 Example Prompts (click to try)",
    )


if __name__ == "__main__":
    print("Attempting to launch Gradio demo...")
    print(f"Default model: {DEFAULT_MODEL_NAME}")
    if HF_TOKEN:
        print("HF_TOKEN is set.")
    else:
        print(
            "HF_TOKEN is NOT set. Some models (especially gated ones like Llama) may require "
            "a token, or prior acceptance of their terms on the Hugging Face website, to work. "
            "The app will still attempt to run, but API calls may fail."
        )
    demo.launch()
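    # Note: demo.launch(share=True) would additionally create a temporary public URL.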