SolshineMisfit committed on
Commit
13b9d4a
·
verified ·
1 Parent(s): 3f1cc0a

Reverted to docs way for dataset upload

Browse files
Files changed (1) hide show
  1. app.py +35 -69
app.py CHANGED
@@ -51,27 +51,24 @@ def Sonar_Web_Search_Tool(arg1: str, arg2: str) -> str:
51
  return f"Error using Sonar Websearch tool '{arg1} {arg2}': {str(e)}"
52
 
53
 
54
- def Dataset_Creator_Function(dataset_name: str, username: str, conversation_data: str) -> str:
55
  """Creates and pushes a dataset to Hugging Face with the conversation history.
56
 
57
  Args:
58
- dataset_name: Name for the dataset
59
- username: Default is "Misfits-and-Machines"
60
  conversation_data: String representing the conversation data
61
 
62
  Returns:
63
  URL of the created dataset or error message
64
  """
65
  try:
66
- import tempfile
67
- import pathlib
68
- from datasets import Dataset, DatasetDict
69
- import pandas as pd
70
-
71
  # Get API key from environment variables
72
- api_key = os.getenv("HF_API_KEY")
73
  if not api_key:
74
- return "Error: No Hugging Face API key found in environment variables. Please set HF_API_KEY or HUGGINGFACE_API_KEY."
 
 
 
75
 
76
  # Initialize Hugging Face API
77
  hf_api = HfApi(token=api_key)
@@ -80,74 +77,43 @@ def Dataset_Creator_Function(dataset_name: str, username: str, conversation_data
80
  safe_dataset_name = dataset_name.replace(" ", "_").lower()
81
  repo_id = f"{username}/{safe_dataset_name}"
82
 
83
- print(f"Creating dataset repository: {repo_id}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
- # Create a temporary directory to store the dataset files
86
- with tempfile.TemporaryDirectory() as tmp_dir:
87
- # Convert data to DataFrame and save as CSV
88
- df = pd.DataFrame({
89
- "text": [conversation_data],
90
- "timestamp": [datetime.datetime.now().isoformat()],
91
- "dataset_id": [str(uuid.uuid4())]
92
- })
93
-
94
- # Save CSV in the temp directory
95
- csv_path = pathlib.Path(tmp_dir) / "train.csv"
96
- df.to_csv(csv_path, index=False)
97
-
98
- print(f"Data saved to temporary CSV file: {csv_path}")
99
-
100
- # Load from CSV to ensure proper dataset structure
101
- train_dataset = Dataset.from_pandas(df)
102
-
103
- # Create a DatasetDict with a train split
104
- dataset_dict = DatasetDict({"train": train_dataset})
105
- print(f"Created dataset with {len(train_dataset)} rows")
106
-
107
- # Create the repository explicitly if it doesn't exist
108
- try:
109
- if not hf_api.repo_exists(repo_id=repo_id, repo_type="dataset"):
110
- hf_api.create_repo(repo_id=repo_id, repo_type="dataset")
111
- print(f"Repository {repo_id} created")
112
- else:
113
- print(f"Repository {repo_id} already exists")
114
- except Exception as repo_error:
115
- print(f"Repository creation error: {str(repo_error)}")
116
-
117
- # Push to Hugging Face Hub
118
- print(f"Pushing dataset to {repo_id}")
119
-
120
- # Use the DatasetDict push_to_hub method
121
- dataset_dict.push_to_hub(
122
- repo_id=repo_id,
123
- token=api_key,
124
- private=False
125
- )
126
-
127
- dataset_url = f"https://huggingface.co/datasets/{repo_id}"
128
- print(f"Dataset successfully pushed to: {dataset_url}")
129
-
130
- # Double-check that the repo exists
131
- if hf_api.repo_exists(repo_id=repo_id, repo_type="dataset"):
132
- print(f"Verified: Repository {repo_id} exists")
133
- else:
134
- print(f"Warning: Repository {repo_id} not found after push")
135
-
136
- return f"Successfully created dataset at {dataset_url}"
137
  except Exception as e:
138
  import traceback
139
  error_trace = traceback.format_exc()
140
  print(f"Dataset creation error: {str(e)}\n{error_trace}")
141
- return f"Error creating dataset: {str(e)}\n\nTo troubleshoot:\n1. Verify API key is valid\n2. Try with a different dataset name"
142
 
143
  @tool
144
- def Dataset_Creator_Tool(dataset_name: str, username: str, conversation_data: str) -> str:
145
  """A tool that posts a new dataset of the current conversation to Hugging Face.
146
 
147
  Args:
148
- dataset_name: Name for the dataset
149
- username: (Default should be 'Misfits-and-Machines/')
150
- conversation_data: String content to save to the dataset (no JSON conversion needed)
151
 
152
  Returns:
153
  Link to the created dataset or error message with troubleshooting steps
@@ -155,7 +121,7 @@ def Dataset_Creator_Tool(dataset_name: str, username: str, conversation_data: st
155
  try:
156
  print(f"Creating dataset '{dataset_name}' with {len(conversation_data)} characters of data")
157
  print(f"Dataset will be created at Misfits-and-Machines/{dataset_name.replace(' ', '_').lower()}")
158
- result = Dataset_Creator_Function(dataset_name, username, conversation_data)
159
  print(f"Dataset creation result: {result}")
160
  return result
161
  except Exception as e:
 
51
  return f"Error using Sonar Websearch tool '{arg1} {arg2}': {str(e)}"
52
 
53
 
54
+ def Dataset_Creator_Function(dataset_name: str, conversation_data: str) -> str:
55
  """Creates and pushes a dataset to Hugging Face with the conversation history.
56
 
57
  Args:
58
+ dataset_name: Name for the dataset (will be prefixed with username)
 
59
  conversation_data: String representing the conversation data
60
 
61
  Returns:
62
  URL of the created dataset or error message
63
  """
64
  try:
 
 
 
 
 
65
  # Get API key from environment variables
66
+ api_key = os.getenv("HF_API_KEY") or os.getenv("HUGGINGFACE_API_KEY")
67
  if not api_key:
68
+ return "Error: No Hugging Face API key found in environment variables"
69
+
70
+ # Set fixed username for dataset organization
71
+ username = "Misfits-and-Machines"
72
 
73
  # Initialize Hugging Face API
74
  hf_api = HfApi(token=api_key)
 
77
  safe_dataset_name = dataset_name.replace(" ", "_").lower()
78
  repo_id = f"{username}/{safe_dataset_name}"
79
 
80
+ print(f"Creating dataset: {repo_id}")
81
+
82
+ # Create a simple dataset from a dictionary
83
+ data = {
84
+ "text": [conversation_data],
85
+ "timestamp": [datetime.datetime.now().isoformat()],
86
+ "id": [str(uuid.uuid4())]
87
+ }
88
+
89
+ # Create the dataset directly
90
+ dataset = Dataset.from_dict(data)
91
+
92
+ # Push to Hugging Face Hub using the simpler method from documentation
93
+ dataset.push_to_hub(
94
+ repo_id=repo_id, # Include username in repo_id
95
+ token=api_key, # Pass token explicitly
96
+ private=False # Make it public
97
+ )
98
+
99
+ # Generate the URL for the dataset
100
+ dataset_url = f"https://huggingface.co/datasets/{repo_id}"
101
+ print(f"Dataset successfully pushed to: {dataset_url}")
102
 
103
+ return f"Successfully created dataset at {dataset_url}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  except Exception as e:
105
  import traceback
106
  error_trace = traceback.format_exc()
107
  print(f"Dataset creation error: {str(e)}\n{error_trace}")
108
+ return f"Error creating dataset: {str(e)}\n\nTroubleshooting tips:\n1. Verify your HF_API_KEY is valid\n2. Try a simpler dataset name with only letters and underscores\n3. Check your permissions for the Misfits-and-Machines organization"
109
 
110
  @tool
111
+ def Dataset_Creator_Tool(dataset_name: str, conversation_data: str) -> str:
112
  """A tool that posts a new dataset of the current conversation to Hugging Face.
113
 
114
  Args:
115
+ dataset_name: Name for the dataset (will be prefixed with 'Misfits-and-Machines/')
116
+ conversation_data: String content to save to the dataset
 
117
 
118
  Returns:
119
  Link to the created dataset or error message with troubleshooting steps
 
121
  try:
122
  print(f"Creating dataset '{dataset_name}' with {len(conversation_data)} characters of data")
123
  print(f"Dataset will be created at Misfits-and-Machines/{dataset_name.replace(' ', '_').lower()}")
124
+ result = Dataset_Creator_Function(dataset_name, conversation_data)
125
  print(f"Dataset creation result: {result}")
126
  return result
127
  except Exception as e: