# Agent-Example / app.py
# SolshineMisfit's picture
# Changed the method for hugging face data push to docs again and tried to integrate json structuring in between
# 6220e54 verified
# raw
# history blame
# 13 kB
from smolagents import CodeAgent, DuckDuckGoSearchTool, HfApiModel, load_tool, tool
import datetime
import requests
import pytz
import yaml
import os
import json
import uuid
from datasets import Dataset
from huggingface_hub import HfApi
from openai import OpenAI
from tools.final_answer import FinalAnswerTool
from Gradio_UI import GradioUI
# Base system prompt prepended to every Perplexity (Sonar) chat completion.
Perplex_Assistant_Prompt = """You are a helpful AI assistant that searches the web for accurate information."""
# HfApiModel reads HUGGINGFACE_API_TOKEN; mirror the key stored under
# HUGGINGFACE_API_KEY into it (falls back to empty string if unset).
os.environ["HUGGINGFACE_API_TOKEN"] = os.getenv("HUGGINGFACE_API_KEY", "")
# Web search tools made available to the agent below.
ddg_search_tool = DuckDuckGoSearchTool(max_results=10) # cap DuckDuckGo at 10 hits (library default)
# google_search_tool = GoogleSearchTool()
#@weave.op()  # (weave tracing currently disabled)
def tracked_perplexity_call(prompt: str, system_messages: str, model_name: str = "sonar-pro", assistant_meta: bool = False):
    """Send one chat completion request to Perplexity's OpenAI-compatible API.

    Args:
        prompt: The user message to send.
        system_messages: Extra system text appended to the base system prompt
            when assistant_meta is True; ignored otherwise.
        model_name: Perplexity model identifier (defaults to "sonar-pro").
        assistant_meta: Whether to extend the base system prompt with
            system_messages.

    Returns:
        The assistant reply text from the first completion choice.
    """
    # Compose the system message: base prompt, optionally extended by the caller.
    system_message = Perplex_Assistant_Prompt
    if assistant_meta:
        system_message = f"{system_message}\n\n{system_messages}"
    # Perplexity exposes an OpenAI-compatible endpoint; reuse the OpenAI client.
    perplexity_client = OpenAI(
        api_key=os.getenv("PERPLEXITY_API_KEY"),
        base_url="https://api.perplexity.ai",
    )
    chat_messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt},
    ]
    completion = perplexity_client.chat.completions.create(
        model=model_name,
        messages=chat_messages,
        stream=False,
    )
    return completion.choices[0].message.content
@tool
def Sonar_Web_Search_Tool(arg1: str, arg2: str) -> str:
    """A tool that accesses Perplexity Sonar to search the web when the answer requires or would benefit from a real world web reference.
    Args:
        arg1: User Prompt
        arg2: Details on the desired web search results as system message for sonar web search
    """
    # NOTE: the docstring above doubles as the runtime tool description for
    # smolagents, so it is kept verbatim.
    try:
        # arg2 is passed through as the search-specific system message.
        return tracked_perplexity_call(arg1, arg2)
    except Exception as e:
        return f"Error using Sonar Websearch tool '{arg1} {arg2}': {str(e)}"
def _ensure_dataset_repo(hf_api, repo_id: str) -> None:
    """Best-effort creation of the dataset repository on the Hub.

    Failures are logged rather than raised because push_to_hub can also
    create the repository implicitly.
    """
    try:
        if hf_api.repo_exists(repo_id=repo_id, repo_type="dataset"):
            print(f"Repository already exists: {repo_id}")
        else:
            hf_api.create_repo(repo_id=repo_id, repo_type="dataset")
            print(f"Created repository: {repo_id}")
    except Exception as e:
        print(f"Note when checking/creating repository: {str(e)}")


def _dataset_from_string(conversation_data: str):
    """Convert a raw string into a `datasets.Dataset`.

    Tries, in order: JSON array of objects / single JSON object,
    pipe-separated table (first line is the header row), and finally a
    one-row 'text' column fallback.
    """
    import json
    import pandas as pd
    from datasets import Dataset
    # 1) JSON: array of objects (one row each) or a single object (one row).
    try:
        json_data = json.loads(conversation_data)
        if isinstance(json_data, list) and all(isinstance(item, dict) for item in json_data):
            print(f"Processing JSON array with {len(json_data)} items")
            return Dataset.from_pandas(pd.DataFrame(json_data))
        if isinstance(json_data, dict):
            print("Processing single JSON object")
            return Dataset.from_pandas(pd.DataFrame([json_data]))
        raise ValueError("JSON format not recognized as array of objects or single object")
    except (json.JSONDecodeError, ValueError) as e:
        # Not valid JSON (or not the expected shape) -- fall through.
        print(f"Not processing as JSON: {str(e)}")
    # 2) Pipe-separated table: "col1 | col2" header, then value rows.
    lines = conversation_data.strip().split('\n')
    if '|' in conversation_data and len(lines) > 1:
        print("Processing as pipe-separated data")
        headers = [h.strip() for h in lines[0].split('|')]
        rows = []
        for i, line in enumerate(lines[1:], 1):
            if not line.strip():
                continue  # skip blank lines
            values = [val.strip() for val in line.split('|')]
            if len(values) == len(headers):
                rows.append(dict(zip(headers, values)))
            else:
                print(f"Warning: Skipping row {i} (column count mismatch)")
        if rows:
            return Dataset.from_pandas(pd.DataFrame(rows))
        # Fallback to text if no valid rows
        return Dataset.from_dict({"text": [conversation_data]})
    # 3) Plain text: single 'text' column with one row.
    print("Processing as plain text")
    return Dataset.from_dict({"text": [conversation_data]})


def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
    """Creates and pushes a dataset to Hugging Face with the conversation history.
    Args:
        dataset_name: Name for the dataset (will be prefixed with username)
        conversation_data: String representing the conversation data. Can be:
            - JSON array of objects (each object becomes a row)
            - Pipe-separated values (col1 | col2 | col3) for tabular data
            - Plain text (stored in a 'text' column)
    Returns:
        URL of the created dataset or error message
    """
    try:
        # Imported lazily (inside the try) so a missing dependency surfaces as
        # an error string instead of crashing module import.
        from huggingface_hub import HfApi
        # Get API key from environment variables (either variable name works).
        api_key = os.getenv("HF_API_KEY") or os.getenv("HUGGINGFACE_API_KEY")
        if not api_key:
            return "Error: No Hugging Face API key found in environment variables"
        # Fixed organization for dataset publication.
        username = "Misfits-and-Machines"
        # Sanitize dataset name: spaces -> underscores, lowercase.
        safe_dataset_name = dataset_name.replace(" ", "_").lower()
        repo_id = f"{username}/{safe_dataset_name}"
        print(f"Creating dataset: {repo_id}")
        # Ensure the repository exists (best-effort; see helper).
        _ensure_dataset_repo(HfApi(token=api_key), repo_id)
        # Parse the payload into a Dataset (JSON -> pipe table -> plain text).
        dataset = _dataset_from_string(conversation_data)
        # Push to Hugging Face Hub as the "train" split.
        print(f"Pushing dataset to {repo_id}")
        dataset.push_to_hub(repo_id=repo_id, token=api_key, split="train")
        dataset_url = f"https://huggingface.co/datasets/{repo_id}"
        print(f"Dataset successfully pushed to: {dataset_url}")
        return f"Successfully created dataset at {dataset_url}"
    except Exception as e:
        import traceback
        error_trace = traceback.format_exc()
        print(f"Dataset creation error: {str(e)}\n{error_trace}")
        return f"Error creating dataset: {str(e)}\n\nTroubleshooting tips:\n1. Verify your HF_API_KEY is valid\n2. Try a simpler dataset name with only letters and underscores"
@tool
def Dataset_Creator_Tool(dataset_name: str, conversation_data: str) -> str:
    """A tool that creates and pushes a dataset to Hugging Face.
    Args:
        dataset_name: Name for the dataset (will be prefixed with 'Misfits-and-Machines/')
        conversation_data: Data content to save in the dataset. Can be formatted in three ways:
            1. JSON array of objects - Each object becomes a row in the dataset with its properties as columns:
               Example: [{"name": "Product A", "brand": "Company X"}, {"name": "Product B", "brand": "Company Y"}]
            2. Pipe-separated values - First row as headers, subsequent rows as values:
               Example: "name | brand\nProduct A | Company X\nProduct B | Company Y"
            3. Plain text - Will be stored in a single 'text' column
    Returns:
        Link to the created dataset or error message with troubleshooting steps
    """
    # NOTE: docstring kept verbatim -- smolagents reads it as the tool spec.
    try:
        safe_name = dataset_name.replace(' ', '_').lower()
        print(f"Creating dataset '{dataset_name}' with {len(conversation_data)} characters of data")
        print(f"Dataset will be created at Misfits-and-Machines/{safe_name}")
        # Delegate the actual repo creation / parsing / push to the helper.
        outcome = Dataset_Creator_Function(dataset_name, conversation_data)
        print(f"Dataset creation result: {outcome}")
        return outcome
    except Exception as e:
        import traceback
        return f"Error using Dataset Creator tool: {str(e)}\n{traceback.format_exc()}"
def verify_dataset_exists(repo_id: str) -> dict:
    """Verify that a dataset exists and is valid on the Hugging Face Hub.
    Args:
        repo_id: Full repository ID in format "username/dataset_name"
    Returns:
        Dict with "exists" boolean and "message" string
    """
    try:
        # Probe the public datasets-server validity endpoint.
        probe_url = f"https://datasets-server.huggingface.co/is-valid?dataset={repo_id}"
        response = requests.get(probe_url)
        if response.status_code != 200:
            return {"exists": False, "message": f"API returned status code {response.status_code}"}
        payload = response.json()
        # Either flag being true means the Hub has processed the dataset.
        if payload.get("viewer", False) or payload.get("preview", False):
            return {"exists": True, "message": "Dataset is valid and accessible"}
        return {"exists": False, "message": "Dataset exists but may not be fully processed yet"}
    except Exception as e:
        return {"exists": False, "message": f"Error verifying dataset: {str(e)}"}
@tool
def Check_Dataset_Validity(dataset_name: str) -> str:
    """A tool that checks if a dataset exists and is valid on Hugging Face.
    Args:
        dataset_name: Name of the dataset to check (with or without organization prefix)
    Returns:
        Status message about the dataset validity
    """
    # NOTE: docstring kept verbatim -- smolagents reads it as the tool spec.
    try:
        # Qualify bare names with the fixed organization prefix.
        qualified_name = dataset_name
        if "/" not in qualified_name:
            qualified_name = f"Misfits-and-Machines/{qualified_name.replace(' ', '_').lower()}"
        status = verify_dataset_exists(qualified_name)
        if status["exists"]:
            return f"Dataset '{qualified_name}' exists and is valid. You can access it at https://huggingface.co/datasets/{qualified_name}"
        return f"Dataset '{qualified_name}' could not be verified: {status['message']}. It may still be processing or may not exist."
    except Exception as e:
        return f"Error checking dataset validity: {str(e)}"
@tool
def get_current_time_in_timezone(timezone: str) -> str:
    """A tool that fetches the current local time in a specified timezone.
    Args:
        timezone: A string representing a valid timezone (e.g., 'America/New_York').
    """
    # NOTE: docstring kept verbatim -- smolagents reads it as the tool spec.
    try:
        # pytz raises for unknown timezone names, which the except turns
        # into a user-readable error string.
        now_local = datetime.datetime.now(pytz.timezone(timezone))
        stamp = now_local.strftime("%Y-%m-%d %H:%M:%S")
        return f"The current local time in {timezone} is: {stamp}"
    except Exception as e:
        return f"Error fetching time for timezone '{timezone}': {str(e)}"
# Tool that delivers the agent's final response to the user.
final_answer = FinalAnswerTool()
# NOTE: HfApiModel takes no API-key parameter -- it reads the token from the
# environment (HUGGINGFACE_API_TOKEN, set near the top of this file).
model = HfApiModel(
    model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud',  # backup endpoint
    max_tokens=2096,
    temperature=0.5,
    custom_role_conversions=None,
)
# Text-to-image tool pulled from the Hugging Face Hub.
image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_code=True)
# Prompt templates for the CodeAgent, loaded from the local YAML file.
with open("prompts.yaml", 'r') as prompts_file:
    prompt_templates = yaml.safe_load(prompts_file)
# Tools exposed to the agent (final_answer is required by smolagents).
agent_tools = [
    final_answer,
    Sonar_Web_Search_Tool,
    ddg_search_tool,  # DuckDuckGo search
    # google_search_tool,  # Google search (disabled)
    get_current_time_in_timezone,
    image_generation_tool,
    Dataset_Creator_Tool,
    Check_Dataset_Validity,
]
agent = CodeAgent(
    model=model,
    tools=agent_tools,
    max_steps=6,
    verbosity_level=1,
    grammar=None,
    planning_interval=None,
    name=None,
    description=None,
    prompt_templates=prompt_templates,
)
# Launch the Gradio UI. Gradio_UI.py has a known token-counting bug
# (`+=` on a possibly-None count); when that specific TypeError occurs,
# print fix instructions instead of crashing. Any other TypeError re-raises.
try:
    GradioUI(agent).launch()
except TypeError as e:
    if "unsupported operand type(s) for +=" not in str(e):
        raise e
    print("Error: Token counting issue in Gradio UI")
    print("To fix, edit Gradio_UI.py and change:")
    print("total_input_tokens += agent.model.last_input_token_count")
    print("To:")
    print("total_input_tokens += (agent.model.last_input_token_count or 0)")