File size: 15,472 Bytes
bdb213e
9b5b26a
 
 
c19d193
d139b43
655eb16
 
 
d139b43
6aae614
8fe992b
9b5b26a
 
d139b43
 
 
3cee4ad
 
 
8b3a261
 
bdb213e
8b3a261
b72b866
ad924eb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b5b26a
d139b43
ad924eb
9b5b26a
ad924eb
 
9b5b26a
ad924eb
 
 
 
 
655eb16
 
4bd6f35
 
116bda5
 
 
4bd6f35
116bda5
 
 
4bd6f35
116bda5
 
 
4bd6f35
 
13b9d4a
655eb16
 
 
13b9d4a
40e5f48
9e2fccb
 
 
655eb16
 
4bd6f35
655eb16
4bd6f35
655eb16
116bda5
6220e54
9e2fccb
6220e54
 
9e2fccb
13b9d4a
e95248e
13b9d4a
 
9e2fccb
13b9d4a
0be1b82
e95248e
 
4bd6f35
13b9d4a
9e2fccb
 
dfafa93
9e2fccb
dfafa93
4bd6f35
dfafa93
4bd6f35
dfafa93
4bd6f35
dfafa93
9e2fccb
 
40e5f48
116bda5
 
 
 
 
 
 
8010a87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
116bda5
 
 
8010a87
 
 
116bda5
 
 
 
 
 
 
9e2fccb
6220e54
 
4bd6f35
6220e54
8010a87
 
 
 
 
 
dfafa93
40e5f48
 
 
 
8010a87
 
40e5f48
4bd6f35
8010a87
 
 
 
 
9e2fccb
6220e54
8010a87
9e2fccb
40e5f48
9e2fccb
4bd6f35
9e2fccb
13b9d4a
9e2fccb
4bd6f35
9e2fccb
6220e54
 
9e2fccb
6220e54
 
13b9d4a
4bd6f35
 
655eb16
e95248e
 
4bd6f35
 
655eb16
894372d
 
6220e54
894372d
 
 
4bd6f35
 
6220e54
4bd6f35
6220e54
4bd6f35
894372d
 
4bd6f35
894372d
 
829cff2
4bd6f35
829cff2
 
894372d
4bd6f35
829cff2
4bd6f35
894372d
 
 
 
829cff2
d8d214d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9b5b26a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8c01ffb
 
6aae614
9e2fccb
 
e121372
d139b43
 
9e2fccb
 
13d500a
8c01ffb
9b5b26a
 
8c01ffb
861422e
 
9b5b26a
8c01ffb
8fe992b
655eb16
9e2fccb
8b3a261
9e2fccb
 
655eb16
 
d8d214d
 
9e2fccb
8c01ffb
 
 
 
 
 
861422e
8fe992b
 
9e2fccb
 
3cee4ad
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, load_tool, tool
import datetime
import requests
import pytz
import yaml
import os
import uuid
from datasets import Dataset
from huggingface_hub import HfApi
from openai import OpenAI
from tools.final_answer import FinalAnswerTool

from Gradio_UI import GradioUI

# Base system prompt that every Perplexity request starts from.
Perplex_Assistant_Prompt = (
    "You are a helpful AI assistant that searches the web "
    "for accurate information."
)

# Mirror HUGGINGFACE_API_KEY into HUGGINGFACE_API_TOKEN.
# NOTE(review): the original comment says HfApiModel expects this variable — confirm.
# The token is only overwritten when a key is actually configured, so a token that
# is already present in the environment is never clobbered with an empty string.
# When nothing is configured the variable still ends up defined (as "").
_hf_api_key = os.getenv("HUGGINGFACE_API_KEY", "")
if _hf_api_key:
    os.environ["HUGGINGFACE_API_TOKEN"] = _hf_api_key
else:
    os.environ.setdefault("HUGGINGFACE_API_TOKEN", "")

# General-purpose DuckDuckGo web search tool, registered with the agent further
# down in this file. The result count per query is capped explicitly at 10.
ddg_search_tool = DuckDuckGoSearchTool(max_results=10)  # Default is 10 results
# Disabled alternative search backend (GoogleSearchTool is not imported in this file).
# google_search_tool = GoogleSearchTool()

#@weave.op()
def tracked_perplexity_call(prompt: str, system_messages: str, model_name: str = "sonar-pro", assistant_meta: bool = False):
    """Send one system + user exchange to the Perplexity API and return the reply text.

    Args:
        prompt: Text sent as the user message.
        system_messages: Extra system instructions. They are appended to the base
            prompt only when ``assistant_meta`` is true; otherwise they are ignored.
        model_name: Perplexity model identifier passed to the API.
        assistant_meta: Whether to append ``system_messages`` to the system prompt.

    Returns:
        The ``content`` of the first choice in the completion response.
    """
    # Perplexity is reached through the OpenAI client pointed at its base URL.
    perplexity = OpenAI(
        api_key=os.getenv("PERPLEXITY_API_KEY"),
        base_url="https://api.perplexity.ai",
    )

    if assistant_meta:
        system_prompt = f"{Perplex_Assistant_Prompt}\n\n{system_messages}"
    else:
        system_prompt = Perplex_Assistant_Prompt

    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    # Only the minimal request parameters are sent; the reply is not streamed.
    completion = perplexity.chat.completions.create(
        model=model_name,
        messages=conversation,
        stream=False,
    )
    return completion.choices[0].message.content

@tool
def Sonar_Web_Search_Tool(arg1: str, arg2: str) -> str:
    """A tool that accesses Perplexity Sonar to search the web when the answer requires or would benefit from a real world web reference. 
    Args:
        arg1: User Prompt
        arg2: Details on the desired web search results as system message for sonar web search
    """
    try:
        # tracked_perplexity_call only appends its system_messages argument to the
        # system prompt when assistant_meta is true. Without the flag, arg2 was
        # accepted but never reached the model. An empty arg2 keeps the base prompt.
        return tracked_perplexity_call(arg1, arg2, assistant_meta=bool(arg2))
    except Exception as e:
        # Tool boundary: report the failure as text so the agent can react to it
        # instead of aborting the run.
        return f"Error using Sonar Websearch tool '{arg1} {arg2}': {str(e)}"
    

def parse_json(text: str):
    """Parse a JSON or JSON-like string into Python objects without executing code.

    Strict JSON is decoded with ``json.loads``. Anything else (for example
    single-quoted Python-style literals, optionally mixed with the JSON keywords
    ``true``/``false``/``null``) is evaluated with ``ast.literal_eval`` after the
    bare keywords are mapped to ``True``/``False``/``None``. Keywords are rewritten
    only outside quoted strings, so values such as ``"untrue"`` or ``"nullable"``
    are left intact.

    Args:
        text: The string to parse.

    Returns:
        The decoded Python object.

    Raises:
        ValueError: If ``text`` is not a string or cannot be parsed.
    """
    import ast
    import json
    import re

    if not isinstance(text, str):
        raise ValueError(f"Failed to parse JSON: expected str, got {type(text).__name__}")

    # Fast path: real JSON, including its escape rules, is handled by the json module.
    try:
        return json.loads(text)
    except (ValueError, RecursionError):
        pass

    keyword_map = {"true": "True", "false": "False", "null": "None"}
    # Match whole quoted strings first so keywords inside them are skipped;
    # only a bare keyword token ever hits the mapping below.
    token_pattern = re.compile(
        r'"(?:\\.|[^"\\])*"'
        r"|'(?:\\.|[^'\\])*'"
        r"|\b(?:true|false|null)\b"
    )

    def swap_keyword(match):
        token = match.group(0)
        return keyword_map.get(token, token)

    prepared_text = token_pattern.sub(swap_keyword, text)

    try:
        return ast.literal_eval(prepared_text)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError) as e:
        # TypeError covers literals such as dicts with unhashable keys.
        raise ValueError(f"Failed to parse JSON: {str(e)}") from e

def _unique_column_names(headers):
    """Return ``headers`` with repeated names disambiguated by a numeric suffix."""
    taken = set()
    unique = []
    for header in headers:
        candidate = header
        suffix = 1
        while candidate in taken:
            suffix += 1
            candidate = f"{header}_{suffix}"
        taken.add(candidate)
        unique.append(candidate)
    return unique


def _json_to_columns(json_data, log):
    """Convert parsed JSON (array of objects or one object) into a column mapping."""
    if isinstance(json_data, dict):
        log.append("Processing single JSON object")
        # A single object becomes a one-row dataset.
        return {key: [value] for key, value in json_data.items()}

    if not (isinstance(json_data, list) and all(isinstance(item, dict) for item in json_data)):
        raise ValueError("JSON not recognized as array or single object")

    log.append(f"Processing JSON array with {len(json_data)} items")
    if not json_data:
        raise ValueError("JSON array is empty")

    # Union of keys over all items, in first-seen order, so fields that only
    # appear in later items are not dropped.
    columns = list(dict.fromkeys(key for item in json_data for key in item))
    log.append(f"Detected columns: {columns}")

    # Missing fields become None rather than "" so numeric or boolean columns
    # do not end up with mixed types.
    data = {col: [item.get(col) for item in json_data] for col in columns}
    for col in columns:
        log.append(f"Column '{col}' has {len(data[col])} entries")
    return data


def _pipe_table_to_columns(conversation_data, log):
    """Parse pipe-separated text into a column mapping, or return None if unusable."""
    lines = conversation_data.strip().split('\n')
    if '|' not in conversation_data or len(lines) <= 1:
        log.append("Processing as plain text")
        return None

    log.append("Processing as pipe-separated data")
    # Repeated header names would otherwise collapse into one ragged column.
    headers = _unique_column_names([h.strip() for h in lines[0].split('|')])
    log.append(f"Detected headers: {headers}")

    rows = []
    for i, line in enumerate(lines[1:], 1):
        if not line.strip():
            continue
        values = [val.strip() for val in line.split('|')]
        if len(values) == len(headers):
            rows.append(values)
        else:
            log.append(f"Warning: Skipping row {i} (column count mismatch)")

    if not rows:
        log.append("No valid rows found in pipe-separated data")
        return None
    return {header: [row[j] for row in rows] for j, header in enumerate(headers)}


def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
    """Build a dataset from ``conversation_data`` and push it to the Hugging Face Hub.

    The input is interpreted, in order, as JSON (array of objects or a single
    object), as pipe-separated rows with a header line, and finally as plain text
    stored in a single ``text`` column. The result is pushed as the ``train``
    split of ``Misfits-and-Machines/<dataset_name>`` (spaces replaced by
    underscores, lower-cased).

    Args:
        dataset_name: Name for the dataset; it is placed under the fixed
            ``Misfits-and-Machines`` namespace.
        conversation_data: String holding the data in one of the formats above.

    Returns:
        A success message with the dataset URL, or an error message; both are
        followed by the accumulated log. This function does not raise: every
        failure is reported in the returned string.
    """
    log = []

    def render_log():
        return "".join(f"{line}\n" for line in log)

    try:
        from datasets import Dataset, DatasetDict
        from huggingface_hub import HfApi

        api_key = os.getenv("HF_API_KEY") or os.getenv("HUGGINGFACE_API_KEY")
        if not api_key:
            return "Error: No Hugging Face API key found in environment variables"

        username = "Misfits-and-Machines"
        safe_dataset_name = dataset_name.replace(" ", "_").lower()
        repo_id = f"{username}/{safe_dataset_name}"
        log.append(f"Creating dataset: {repo_id}")

        # Repository check/creation is deliberately non-fatal: a failure is only
        # logged and the push below is still attempted.
        hf_api = HfApi(token=api_key)
        try:
            if not hf_api.repo_exists(repo_id=repo_id, repo_type="dataset"):
                hf_api.create_repo(repo_id=repo_id, repo_type="dataset")
                log.append(f"Created repository: {repo_id}")
            else:
                log.append(f"Repository already exists: {repo_id}")
        except Exception as e:
            log.append(f"Note when checking/creating repository: {str(e)}")

        # First choice: structured JSON. Any failure here (parse error, unsupported
        # shape, dataset construction error) falls through to the text formats.
        ds = None
        try:
            columns = _json_to_columns(parse_json(conversation_data), log)
            ds = Dataset.from_dict(columns)
        except Exception as json_error:
            log.append(f"Not processing as JSON: {str(json_error)}")

        if ds is None:
            columns = _pipe_table_to_columns(conversation_data, log)
            if columns is None:
                # Last resort: keep the raw input as one text row.
                columns = {"text": [conversation_data]}
            ds = Dataset.from_dict(columns)

        log.append(f"Created dataset with {len(ds)} rows")
        created_ds = DatasetDict({"train": ds})

        log.append(f"Pushing dataset to {repo_id}")
        created_ds.push_to_hub(
            repo_id=repo_id,
            token=api_key,
            commit_message=f"Upload dataset: {dataset_name}"
        )

        dataset_url = f"https://huggingface.co/datasets/{repo_id}"
        log.append(f"Dataset successfully pushed to: {dataset_url}")
        return f"Successfully created dataset at {dataset_url}\nLogs:\n{render_log()}"
    except Exception as e:
        import traceback
        log.append(f"Dataset creation error: {str(e)}\n{traceback.format_exc()}")
        return f"Error creating dataset: {str(e)}\nLogs:\n{render_log()}"

@tool
def Dataset_Creator_Tool(dataset_name: str, conversation_data: str) -> str:
    """A tool that creates and pushes a dataset to Hugging Face.
    
    Args:
        dataset_name: Name for the dataset (will be prefixed with 'Misfits-and-Machines/')
        conversation_data: Data content to save in the dataset. Formats supported:
            1. JSON array of objects – Each object becomes a row (keys as columns).
               Example: [{"name": "Product A", "brand": "Company X"}, {"name": "Product B", "brand": "Company Y"}]
            2. Pipe-separated values – First row as headers, remaining rows as values.
               Example: "name | brand\nProduct A | Company X\nProduct B | Company Y"
            3. Plain text – Stored in a single 'text' column.
    
    Returns:
        A link to the created dataset on the Hugging Face Hub or an error message, along with log details.
    """
    # The docstring above is the description the agent sees, so it is kept verbatim.
    try:
        # Preamble lines describing what is about to be created.
        report = [
            f"Creating dataset '{dataset_name}' with {len(conversation_data)} characters of data\n",
            f"Dataset will be created at Misfits-and-Machines/{dataset_name.replace(' ', '_').lower()}\n",
        ]

        # All real work is delegated to Dataset_Creator_Function.
        outcome = Dataset_Creator_Function(dataset_name, conversation_data)
        report.append(f"Dataset creation result: {outcome}\n")

        return "".join(report)
    except Exception as e:
        import traceback
        return f"Error using Dataset Creator tool: {str(e)}\n{traceback.format_exc()}"

def verify_dataset_exists(repo_id: str) -> dict:
    """Verify that a dataset exists and is valid on the Hugging Face Hub.

    Queries the datasets-server ``is-valid`` endpoint and treats the dataset as
    existing when the response reports ``viewer`` or ``preview`` as truthy.

    Args:
        repo_id: Full repository ID in format "username/dataset_name"

    Returns:
        Dict with "exists" boolean and "message" string. Request errors,
        timeouts and malformed responses are reported through "message" with
        "exists" set to False; this function does not raise.
    """
    try:
        response = requests.get(
            "https://datasets-server.huggingface.co/is-valid",
            # Let requests build the query string so repo_id is URL-encoded.
            params={"dataset": repo_id},
            # Without a timeout a stalled connection would block the agent indefinitely.
            timeout=10,
        )

        if response.status_code != 200:
            return {"exists": False, "message": f"API returned status code {response.status_code}"}

        data = response.json()
        # If any of these are True, the dataset exists in some form
        if data.get("viewer", False) or data.get("preview", False):
            return {"exists": True, "message": "Dataset is valid and accessible"}
        return {"exists": False, "message": "Dataset exists but may not be fully processed yet"}
    except Exception as e:
        return {"exists": False, "message": f"Error verifying dataset: {str(e)}"}

@tool
def Check_Dataset_Validity(dataset_name: str) -> str:
    """A tool that checks if a dataset exists and is valid on Hugging Face.
    
    Args:
        dataset_name: Name of the dataset to check (with or without organization prefix)
    
    Returns:
        Status message about the dataset validity
    """
    # The docstring above is the description the agent sees, so it is kept verbatim.
    try:
        # Bare names are normalised and placed under the default organization;
        # names that already contain a "/" are used untouched.
        if "/" in dataset_name:
            repo_id = dataset_name
        else:
            repo_id = f"Misfits-and-Machines/{dataset_name.replace(' ', '_').lower()}"

        check = verify_dataset_exists(repo_id)

        if not check["exists"]:
            return f"Dataset '{repo_id}' could not be verified: {check['message']}. It may still be processing or may not exist."
        return f"Dataset '{repo_id}' exists and is valid. You can access it at https://huggingface.co/datasets/{repo_id}"
    except Exception as e:
        return f"Error checking dataset validity: {str(e)}"

@tool
def get_current_time_in_timezone(timezone: str) -> str:
    """A tool that fetches the current local time in a specified timezone.
    Args:
        timezone: A string representing a valid timezone (e.g., 'America/New_York').
    """
    # The docstring above is the description the agent sees, so it is kept verbatim.
    try:
        zone = pytz.timezone(timezone)
        now_in_zone = datetime.datetime.now(zone)
        stamp = now_in_zone.strftime("%Y-%m-%d %H:%M:%S")
        return f"The current local time in {timezone} is: {stamp}"
    except Exception as e:
        # Unknown zone names (and any other failure) are reported as text.
        return f"Error fetching time for timezone '{timezone}': {str(e)}"


# Tool instance from tools.final_answer, registered with the agent below.
final_answer = FinalAnswerTool()

# Language model backing the agent. No API key argument is passed here.
# NOTE(review): presumably credentials are picked up from the environment
# variables set near the top of this file — confirm against HfApiModel.
model = HfApiModel(
    max_tokens=2096,
    temperature=0.5,
    model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud',  # Using the backup endpoint
    custom_role_conversions=None
)

# Text-to-image tool loaded from the Hub repository "agents-course/text-to-image".
# NOTE(review): trust_remote_code=True presumably allows code from that repository
# to run locally — confirm the source is trusted.
image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)

# Load the agent's prompt templates from prompts.yaml (resolved against the current
# working directory). The encoding is pinned so parsing does not depend on the
# platform's default locale encoding.
with open("prompts.yaml", "r", encoding="utf-8") as stream:
    prompt_templates = yaml.safe_load(stream)
    
# Assemble the agent: the model defined above, the tools defined in this file,
# and the prompt templates loaded from prompts.yaml.
agent = CodeAgent(
    model=model,
    tools=[
        final_answer, 
        Sonar_Web_Search_Tool,
        ddg_search_tool,  # Added DuckDuckGo search tool
#        google_search_tool,  # Added Google search tool (currently disabled)
        get_current_time_in_timezone, 
        image_generation_tool,
        Dataset_Creator_Tool,
        Check_Dataset_Validity
    ],  
    max_steps=6,  # NOTE(review): presumably the cap on agent steps per run — confirm
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name=None,
    description=None,
    prompt_templates=prompt_templates
)

# Start the Gradio front end for the agent. If launching fails with the "+="
# TypeError described in the messages below, print instructions for patching
# Gradio_UI.py instead of a raw traceback. Any other TypeError is re-raised
# unchanged (bare `raise` keeps the original traceback).
try:
    GradioUI(agent).launch()
except TypeError as e:
    if "unsupported operand type(s) for +=" not in str(e):
        raise
    print("Error: Token counting issue in Gradio UI")
    print("To fix, edit Gradio_UI.py and change:")
    print("total_input_tokens += agent.model.last_input_token_count")
    print("To:")
    print("total_input_tokens += (agent.model.last_input_token_count or 0)")