File size: 8,883 Bytes
f39d8b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
# app.py
import gradio as gr
from huggingface_hub import InferenceClient
import os

# Model the app queries by default.
DEFAULT_MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"

# Optional API token taken from the HF_API_TOKEN environment variable
# (e.g. configured as a secret); None when the variable is not set.
HF_TOKEN = os.environ.get("HF_API_TOKEN")

# --- Shared inference client ---
# Starts out empty; get_inference_client() creates and stores the instance.
client = None

def get_inference_client(model_name):
    """Return the shared InferenceClient bound to ``model_name``.

    The module-level ``client`` is reused as long as its ``model`` attribute
    matches ``model_name``. Otherwise a new client is constructed and stored
    in its place.

    Args:
        model_name: Identifier of the model the client should target.

    Returns:
        The cached or newly created client, or ``None`` if constructing a new
        client raised (the failure is printed rather than propagated).
    """
    global client

    # Fast path: the cached client already targets the requested model.
    if client is not None and client.model == model_name:
        return client

    try:
        # An empty/missing HF_TOKEN is passed as None so InferenceClient can
        # fall back to whatever token it is able to discover itself (e.g. a
        # local CLI login). If none is found and the model needs one, the
        # later API call fails instead.
        token_note = "provided" if HF_TOKEN else "not explicitly provided"
        client = InferenceClient(model=model_name, token=HF_TOKEN or None)
        print(f"InferenceClient initialized for {model_name}. Token {token_note}.")
    except Exception as exc:
        print(f"Failed to initialize InferenceClient for {model_name}: {exc}")
        return None

    return client

# --- Evaluation Logic ---
def evaluate_understanding(prompt, response):
    """Heuristically rate whether a model response shows it understood the prompt.

    This is a simple keyword/length heuristic, not a comprehensive NLU
    assessment. The checks run in order and the first one that matches wins:

    1. The response is empty or whitespace-only.
    2. The response contains a known refusal/confusion phrase
       (case-insensitive substring match).
    3. The prompt has more than 7 words but the response has fewer than 10.
    4. The prompt has more than 5 words, appears verbatim (case-insensitive)
       inside the response, and the response is shorter than 1.5x the prompt.

    Args:
        prompt: The user's prompt text. ``None`` is treated as an empty prompt.
        response: The model's generated text; may be ``None`` or empty.

    Returns:
        A short human-readable verdict string prefixed with a status emoji.
    """
    if not response or not response.strip():
        return "❌ Not Understood (Empty or whitespace response)"

    # Tolerate a missing prompt so the length/echo checks below cannot raise.
    prompt = prompt or ""
    prompt_lower = prompt.lower()
    response_lower = response.lower()  # For case-insensitive checks
    prompt_word_count = len(prompt.split())

    # Order matters: the first phrase found is the one reported.
    misunderstanding_keywords = (
        "i'm sorry", "i apologize", "i cannot", "i am unable", "unable to",
        "i don't understand", "could you please rephrase", "i'm not sure i follow",
        "that's not clear", "i do not have enough information", "as an ai language model, i don't",
        "i'm not programmed to", "i lack the ability to",
    )
    keyword = next(
        (phrase for phrase in misunderstanding_keywords if phrase in response_lower),
        None,
    )
    if keyword is not None:
        return f"⚠️ Potentially Not Understood (Contains: '{keyword}')"

    if prompt_word_count > 7 and len(response.split()) < 10:
        return "⚠️ Potentially Not Understood (Response seems too short for the prompt)"

    # A response that contains the prompt and adds little beyond it is
    # probably just repeating the question back.
    is_echo = (
        prompt_word_count > 5
        and prompt_lower in response_lower
        and len(response_lower) < len(prompt_lower) * 1.5
    )
    if is_echo:
        return "⚠️ Potentially Not Understood (Response might be echoing the prompt)"

    # The literal here was previously a mis-decoded byte sequence; it is the
    # heavy check mark emoji.
    return "✔️ Likely Understood"

# --- Core Logic: Query Model and Evaluate ---
def query_model_and_evaluate(user_prompt, model_name_to_use):
    """Run one prompt through the chosen model and score the reply.

    Args:
        user_prompt: Raw prompt text entered by the user.
        model_name_to_use: Identifier of the model to query.

    Returns:
        A 3-tuple ``(text, evaluation, model_name_to_use)``. ``text`` is the
        model's response on success, or a message describing what went wrong.
        ``evaluation`` is the verdict from ``evaluate_understanding`` on
        success and ``"Evaluation N/A"`` on every early-exit path.
    """
    not_evaluated = "Evaluation N/A"

    if not user_prompt or not user_prompt.strip():
        return "Please enter a prompt.", not_evaluated, model_name_to_use

    # No up-front token check for gated models: the call is attempted anyway
    # and any authentication/terms failure surfaces from the API call itself.
    token_state = "is set" if HF_TOKEN else "is NOT set/empty"
    print(f"Querying model: {model_name_to_use}. HF_TOKEN {token_state}.")

    current_client = get_inference_client(model_name_to_use)
    if current_client is None:
        return (
            f"Error: Could not initialize the model API client for {model_name_to_use}. Check logs. This might be due to the model requiring authentication (like a token or accepting terms on Hugging Face) which was not available or successful.",
            not_evaluated,
            model_name_to_use,
        )

    try:
        model_id = model_name_to_use.lower()
        cleaned_prompt = user_prompt.strip()

        # Wrap the prompt in the instruction template matching the model family.
        if "mistral" in model_id and "instruct" in model_id:
            formatted_prompt = f"<s>[INST] {cleaned_prompt} [/INST]"
        elif "llama-2" in model_id and "chat" in model_id:
            formatted_prompt = (
                f"[INST] <<SYS>>\nYou are a helpful assistant. Your goal is to understand the user's prompt and respond accurately and relevantly.\n"
                f"<</SYS>>\n\n{cleaned_prompt} [/INST]"
            )
        else:
            formatted_prompt = cleaned_prompt

        # A falsy result is normalised to an empty string.
        model_response_text = current_client.text_generation(
            formatted_prompt,
            max_new_tokens=300,
            temperature=0.6,
            top_p=0.9,
            repetition_penalty=1.1,
            do_sample=True,
            return_full_text=False,
        ) or ""
    except Exception as exc:
        failure = f"Error calling model API for {model_name_to_use}: {exc!s}. This can happen if the model is gated, requires a Hugging Face token, or if you need to accept its terms of use on the Hugging Face website."
        print(failure)
        return failure, not_evaluated, model_name_to_use

    # The heuristic is scored against the prompt exactly as the user typed it.
    verdict = evaluate_understanding(user_prompt, model_response_text)
    return model_response_text, verdict, model_name_to_use

# --- Gradio Interface Definition ---
# --- Gradio Interface Definition ---
# Builds the single-page UI: a prompt box, a submit button, and three read-only
# outputs that receive the 3-tuple returned by query_model_and_evaluate
# (response text, evaluation verdict, model name).
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange")) as demo:
    gr.Markdown(
        f"""
        # 🎯 Model Prompt Understanding Test
        Enter a prompt for the selected language model. The application will send this to the model via Hugging Face's Inference API.
        The model's response will be analyzed to provide a **basic heuristic assessment** of its understanding.
        
        **Selected Model:** <span id='current-model-display'>{DEFAULT_MODEL_NAME}</span>
        """
    )

    # Holds the model name passed to the handler as its second argument.
    current_model_name_state = gr.State(DEFAULT_MODEL_NAME)

    with gr.Row():
        user_input_prompt = gr.Textbox(
            label="✏️ Enter your Prompt:",
            placeholder="e.g., Explain the concept of zero-shot learning in 3 sentences.",
            lines=4,
            scale=3
        )

    submit_button = gr.Button("🚀 Submit Prompt and Evaluate", variant="primary")

    gr.Markdown("---")
    gr.Markdown("### 🤖 Model Response & Evaluation")

    with gr.Row():
        with gr.Column(scale=2):
            model_output_response = gr.Textbox(
                label="📝 Model's Response:",
                lines=10,
                interactive=False,
                show_copy_button=True
            )
        with gr.Column(scale=1):
            evaluation_output = gr.Textbox(
                label="🧐 Understanding Evaluation:",
                lines=2,
                interactive=False,
                show_copy_button=True
            )
            displayed_model = gr.Textbox(
                label="⚙️ Model Used for this Response:",
                interactive=False,
                lines=1
            )

    # Output order must match the handler's return order.
    submit_button.click(
        fn=query_model_and_evaluate,
        inputs=[user_input_prompt, current_model_name_state],
        outputs=[model_output_response, evaluation_output, displayed_model]
    )

    gr.Markdown(
        """
        ---
        **Disclaimer:**
        * The 'Understanding Evaluation' is a very basic automated heuristic.
        * **Using Models:** This app will attempt to connect to the selected model. Some models (especially gated ones like Llama-2) may require you to have a Hugging Face account, accept their terms of use on the Hugging Face website, and might implicitly require a valid `HF_TOKEN` associated with your account (even if not explicitly set as a secret in this Space). If a model call fails, it could be due to these reasons.
        * Response quality depends heavily on the chosen model and the clarity of your prompt.
        """
    )

    # Each example row supplies [prompt, model name], matching `inputs` below.
    # The last row is deliberate gibberish to exercise the heuristic.
    gr.Examples(
        examples=[
            ["Explain the difference between supervised and unsupervised machine learning.", DEFAULT_MODEL_NAME],
            ["Write a short poem about a curious robot.", DEFAULT_MODEL_NAME],
            ["What are the main challenges in developing AGI?", DEFAULT_MODEL_NAME],
            ["Summarize the plot of 'War and Peace' in one paragraph.", DEFAULT_MODEL_NAME],
            ["asdfjkl; qwerpoiu", DEFAULT_MODEL_NAME]
        ],
        inputs=[user_input_prompt, current_model_name_state],
        outputs=[model_output_response, evaluation_output, displayed_model],
        fn=query_model_and_evaluate,
        cache_examples=False,
        label="💡 Example Prompts (click to try)"
    )

if __name__ == "__main__":
    # Print startup diagnostics, then start the Gradio server.
    print("Attempting to launch Gradio demo...")
    print(f"Default model: {DEFAULT_MODEL_NAME}")

    # Report whether a token was loaded so failed API calls are easier to diagnose.
    token_notice = (
        "HF_TOKEN is set."
        if HF_TOKEN
        else "HF_TOKEN is NOT set. Some models (especially gated ones like Llama) might require a token or prior agreement to terms on the Hugging Face website to function correctly. The app will attempt to run, but API calls may fail."
    )
    print(token_notice)

    demo.launch()