SolshineMisfit committed
Commit 9e2fccb · verified · 1 Parent(s): ad13edc

json dataframe updates

In this updated solution, the JSON input is converted into a pandas DataFrame, wrapped in a Dataset object, and placed in a DatasetDict under the "train" split before being pushed to the Hub. This ensures that every entry in a JSON array becomes its own row in the "train" split, allowing multiple updates or separate entries as desired.
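For reference, a minimal sketch of that flow with the datasets library (the sample rows and the "example_dataset" repo name below are illustrative, not taken from the commit; the token is read from HF_API_KEY as in app.py):

import json, os
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset

# Illustrative input: a JSON array of objects, one object per intended row
conversation_data = '[{"role": "user", "text": "hi"}, {"role": "assistant", "text": "hello"}]'

records = json.loads(conversation_data)      # parse the JSON array
df = pd.DataFrame(records)                   # each object becomes one DataFrame row
ds = Dataset.from_pandas(df)                 # wrap the DataFrame in a Dataset
created_ds = DatasetDict({"train": ds})      # register it as the "train" split

# Push the whole DatasetDict so the split name is preserved on the Hub
created_ds.push_to_hub("Misfits-and-Machines/example_dataset", token=os.getenv("HF_API_KEY"))

# Reading it back should yield one row per JSON object
reloaded = load_dataset("Misfits-and-Machines/example_dataset", split="train")
print(len(reloaded), reloaded.column_names)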

Files changed (1)
  1. app.py +36 -47
app.py CHANGED
@@ -61,9 +61,9 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
     Args:
         dataset_name: Name for the dataset (will be prefixed with username)
         conversation_data: String representing the conversation data. Can be:
-            - JSON array of objects (each object becomes a row)
-            - Pipe-separated values (col1 | col2 | col3) for tabular data
-            - Plain text (stored in a 'text' column)
+            - JSON array of objects (each object becomes a row)
+            - Pipe-separated values (first row as headers, subsequent rows as values)
+            - Plain text (stored in a single 'text' column)
 
     Returns:
         URL of the created dataset or error message
@@ -72,30 +72,25 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
         # Required imports
         import json
         import pandas as pd
-        from datasets import Dataset
+        from datasets import Dataset, DatasetDict
         from huggingface_hub import HfApi
 
-        # Get API key from environment variables
+        # Get API key
         api_key = os.getenv("HF_API_KEY") or os.getenv("HUGGINGFACE_API_KEY")
         if not api_key:
             return "Error: No Hugging Face API key found in environment variables"
 
-        # Set fixed username for dataset organization
+        # Set fixed username
         username = "Misfits-and-Machines"
-
-        # Initialize Hugging Face API
-        hf_api = HfApi(token=api_key)
-
-        # Sanitize dataset name
         safe_dataset_name = dataset_name.replace(" ", "_").lower()
         repo_id = f"{username}/{safe_dataset_name}"
 
         print(f"Creating dataset: {repo_id}")
 
-        # First ensure the repository exists
+        # Ensure repository exists
+        hf_api = HfApi(token=api_key)
         try:
-            repo_exists = hf_api.repo_exists(repo_id=repo_id, repo_type="dataset")
-            if not repo_exists:
+            if not hf_api.repo_exists(repo_id=repo_id, repo_type="dataset"):
                 hf_api.create_repo(repo_id=repo_id, repo_type="dataset")
                 print(f"Created repository: {repo_id}")
             else:
@@ -103,36 +98,30 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
         except Exception as e:
             print(f"Note when checking/creating repository: {str(e)}")
 
-        # Process the data based on format
+        # Process input data
+        created_ds = None
         try:
-            # Try parsing as JSON first
             json_data = json.loads(conversation_data)
-
             if isinstance(json_data, list) and all(isinstance(item, dict) for item in json_data):
-                # Process JSON array of objects (preferred format)
                 print(f"Processing JSON array with {len(json_data)} items")
                 df = pd.DataFrame(json_data)
-                dataset = Dataset.from_pandas(df)
+                ds = Dataset.from_pandas(df)
+                created_ds = DatasetDict({"train": ds})
             elif isinstance(json_data, dict):
-                # Single JSON object
                 print("Processing single JSON object")
                 df = pd.DataFrame([json_data])
-                dataset = Dataset.from_pandas(df)
+                ds = Dataset.from_pandas(df)
+                created_ds = DatasetDict({"train": ds})
             else:
-                raise ValueError("JSON format not recognized as array of objects or single object")
+                raise ValueError("JSON not recognized as array or single object")
         except (json.JSONDecodeError, ValueError) as e:
-            # Not valid JSON or not in expected format
            print(f"Not processing as JSON: {str(e)}")
-
-            # Check if pipe-separated format
+            # Try pipe-separated format
            lines = conversation_data.strip().split('\n')
            if '|' in conversation_data and len(lines) > 1:
                print("Processing as pipe-separated data")
-
-                # Parse headers and data rows
                headers = [h.strip() for h in lines[0].split('|')]
                data = []
-
                for i, line in enumerate(lines[1:], 1):
                    if not line.strip():
                        continue
@@ -141,35 +130,33 @@ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
                        data.append(dict(zip(headers, values)))
                    else:
                        print(f"Warning: Skipping row {i} (column count mismatch)")
-
                if data:
                    df = pd.DataFrame(data)
-                    dataset = Dataset.from_pandas(df)
+                    ds = Dataset.from_pandas(df)
+                    created_ds = DatasetDict({"train": ds})
                else:
-                    # Fallback to text if no valid rows
-                    dataset = Dataset.from_dict({"text": [conversation_data]})
+                    created_ds = DatasetDict({"train": Dataset.from_dict({"text": [conversation_data]})})
            else:
-                # Plain text
+                # Fallback for plain text
                print("Processing as plain text")
-                dataset = Dataset.from_dict({"text": [conversation_data]})
+                created_ds = DatasetDict({"train": Dataset.from_dict({"text": [conversation_data]})})
 
-        # Push to Hugging Face Hub
+        # Push using the DatasetDict push_to_hub method.
        print(f"Pushing dataset to {repo_id}")
-        dataset.push_to_hub(
+        created_ds.push_to_hub(
            repo_id=repo_id,
            token=api_key,
-            split="train"
+            commit_message=f"Upload dataset: {dataset_name}"
        )
 
        dataset_url = f"https://huggingface.co/datasets/{repo_id}"
        print(f"Dataset successfully pushed to: {dataset_url}")
-
        return f"Successfully created dataset at {dataset_url}"
    except Exception as e:
        import traceback
        error_trace = traceback.format_exc()
        print(f"Dataset creation error: {str(e)}\n{error_trace}")
-        return f"Error creating dataset: {str(e)}\n\nTroubleshooting tips:\n1. Verify your HF_API_KEY is valid\n2. Try a simpler dataset name with only letters and underscores"
+        return f"Error creating dataset: {str(e)}\n\nTroubleshooting tips:\n1. Verify your HF_API_KEY is valid\n2. Use a simpler dataset name (letters and underscores only)"
 
 @tool
 def Dataset_Creator_Tool(dataset_name: str, conversation_data: str) -> str:
@@ -266,13 +253,14 @@ def get_current_time_in_timezone(timezone: str) -> str:
         return f"Error fetching time for timezone '{timezone}': {str(e)}"
 
 
-# Update the model configuration to use Qwen2.5-Coder
 final_answer = FinalAnswerTool()
+
+# Remove the huggingface_api_key parameter - it's not supported
 model = HfApiModel(
     max_tokens=2096,
     temperature=0.5,
-    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',  # Changed to Qwen model
-    custom_role_conversions=None,
+    model_id='https://pflgm2locj2t89co.us-east-1.aws.endpoints.huggingface.cloud',  # Using the backup endpoint
+    custom_role_conversions=None
 )
 
 # Import tool from Hub
@@ -281,18 +269,18 @@ image_generation_tool = load_tool("agents-course/text-to-image", trust_remote_co
 with open("prompts.yaml", 'r') as stream:
     prompt_templates = yaml.safe_load(stream)
 
-# Updated CodeAgent with tools
 agent = CodeAgent(
     model=model,
     tools=[
-        final_answer,
+        final_answer,
         Sonar_Web_Search_Tool,
-        ddg_search_tool,
+        ddg_search_tool,  # Added DuckDuckGo search tool
+        # google_search_tool,  # Added Google search tool
         get_current_time_in_timezone,
         image_generation_tool,
         Dataset_Creator_Tool,
         Check_Dataset_Validity
-    ],
+    ],
     max_steps=6,
     verbosity_level=1,
     grammar=None,
@@ -302,7 +290,8 @@ agent = CodeAgent(
     prompt_templates=prompt_templates
 )
 
-# Launch the Gradio UI
+# To fix the TypeError in Gradio_UI.py, you would need to modify that file
+# For now, we'll just use the agent directly
 try:
     GradioUI(agent).launch()
 except TypeError as e: