RRPATEL228 committed on
Commit
0b68e9f
·
verified ·
1 Parent(s): eb353ff

Upload hugging.py

Browse files
Files changed (1) hide show
  1. hugging.py +232 -0
hugging.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """hugging.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1L3wB_9pZG9AWiAlibB_lGeZkfea-BqTW
8
+ """
9
+
10
+ !pip install transformers
11
+
12
+ !pip install huggingface_hub
13
+
14
+ # Install the Hugging Face CLI
15
+ !pip install -U "huggingface_hub[cli]"
16
+
17
+ from google.colab import userdata
18
+ userdata.get('HF_READ')
19
+
20
+ HF_READ_TOKEN = userdata.get('HF_READ')
21
+
22
+ !git config --global credential.helper store
23
+
24
+ !hf auth logout # clear old/invalid token
25
+ !hf auth login
26
+
27
+ !hf auth whoami
28
+
29
+ from huggingface_hub import notebook_login
30
+ notebook_login()
31
+
32
+ !pip install --upgrade huggingface_hub
33
+
34
+ !hf upload hf://datasets/Anthropic/EconomicIndex/release_2025_03_27/automation_vs_augmentation_by_task.csv.csv --repo https://huggingface.co/datasets/RRPATEL228/repo
35
+
36
+ mkdir -p RRPATEL228/test_repo
37
+
38
+ get_ipython().system('echo "Test upload" > RRPATEL228/test_repo/README.md')
39
+
40
+ !hf upload RRPATEL228/test_repo --repo RRPATEL228/test_repo
41
+
42
+ pip install llama-stack
43
+
44
+ pip install llama-stack -U
45
+
46
+ !llama model list
47
+
48
+ from huggingface_hub import notebook_login
49
+
50
+ notebook_login()
51
+
52
+ from datasets import load_dataset
53
+ from huggingface_hub import hf_hub_download
54
+ import os
55
+
56
+ # Define the repository ID and filename
57
+ repo_id = "RRPATEL228/test_repo"
58
+ filename = "Customer_Attributes_and_Purchase_Propensity.csv"
59
+
60
+ # Download the file
61
+ file_path = hf_hub_download(repo_id=repo_id, filename=filename)
62
+
63
+ # Load the dataset from the downloaded file
64
+ ds = load_dataset("csv", data_files=file_path)
65
+
66
+ display(ds)
67
+
68
+ ds['train'].shape
69
+
70
+ display(ds['train'].features)
71
+
72
+ ds
73
+
74
+ small_train = ds["train"].shuffle(seed=42).select(range(100))
75
+ small_eval = ds["train"].shuffle(seed=42).select(range(100, 200)) # Selecting a different range for evaluation set
76
+
77
+ small_train
78
+
79
+ small_eval
80
+
81
+ from transformers import Trainer, TrainingArguments
82
+
83
+ import torch
84
+ from torch.utils.data import Dataset # Import Dataset base class
85
+
86
class CustomDataset(Dataset):
    """Wrap a tabular split as a torch Dataset of {'input', 'label'} dicts.

    Expects a column-mapping (e.g. a datasets split or plain dict) with a
    'Score' column — the single numeric feature — and a 'Purchased' column
    holding integer class labels.
    """

    def __init__(self, data):
        # The score column becomes an (N, 1) float matrix; the extra
        # dimension keeps each sample 2-D so it feeds nn.Linear directly.
        scores = torch.tensor(data['Score'], dtype=torch.float32)
        self.X = scores.unsqueeze(1)
        self.y = torch.tensor(data['Purchased'], dtype=torch.long)

    def __len__(self):
        # One label per sample, so the label tensor's length is the size.
        return self.y.shape[0]

    def __getitem__(self, idx):
        # Keys match what TabularMLP.forward() expects from the Trainer.
        return {'input': self.X[idx], 'label': self.y[idx]}
97
+
98
+ train_dataset = CustomDataset(small_train)
99
+ eval_dataset = CustomDataset(small_eval)
100
+
101
+ import torch.nn as nn
102
+ import torch
103
+
104
class TabularMLP(nn.Module):
    """Two-layer perceptron for tabular classification.

    forward() follows the Hugging Face Trainer convention: it accepts the
    batch keys produced by CustomDataset ('input', plus optional 'labels')
    and returns a dict with 'loss' (None when no labels are supplied) and
    'logits'.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Linear(64, num_classes),
        )
        # Cross-entropy over raw logits; targets are class indices.
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, input, labels=None):
        # NOTE: the parameter must be named 'input' to match the dataset's
        # batch key when driven by the Trainer.
        logits = self.network(input)
        if labels is None:
            return {"loss": None, "logits": logits}
        # Flatten both sides so the loss works for any leading batch shape.
        n_out = self.network[-1].out_features
        loss = self.loss_fct(logits.view(-1, n_out), labels.view(-1))
        return {"loss": loss, "logits": logits}
121
+
122
+ import torch
123
+
124
+ import torch.nn as nn
125
+
126
+ training_args = TrainingArguments(
127
+ output_dir='./results',
128
+ num_train_epochs=10,
129
+ per_device_train_batch_size=32,
130
+ # evaluation_strategy="epoch" # Removed the unexpected argument
131
+ )
132
+
133
+ # Initialize the model
134
+ input_dim = train_dataset.X.shape[1] # Get input dimension from the dataset
135
+ num_classes = len(torch.unique(train_dataset.y)) # Get number of classes from the dataset
136
+
137
+ model = TabularMLP(input_dim=input_dim, num_classes=num_classes)
138
+
139
+ from transformers import TrainingArguments
140
+
141
+ training_args = TrainingArguments(
142
+ output_dir="Purchased_data",
143
+ learning_rate=2e-5,
144
+ per_device_train_batch_size=8,
145
+ per_device_eval_batch_size=8,
146
+ num_train_epochs=2,
147
+ push_to_hub=True,
148
+ )
149
+
150
+ from huggingface_hub import notebook_login # Corrected import
151
+ notebook_login()
152
+
153
+ !hf auth login
154
+
155
+ import wandb
156
+
157
+ wandb.init(project="huggingface") # replace with your project name
158
+
159
+ from transformers import TrainingArguments
160
+
161
+ training_args = TrainingArguments(
162
+ output_dir='./results',
163
+ # evaluation_strategy="epoch", # Removed the unexpected argument
164
+ logging_dir='./logs',
165
+ # logging_strategy="steps", # Removed for consistency
166
+ logging_steps=10,
167
+ report_to="wandb", # IMPORTANT: enables wandb logging
168
+ save_strategy="epoch",
169
+ per_device_train_batch_size=32,
170
+ per_device_eval_batch_size=32,
171
+ num_train_epochs=3,
172
+ )
173
+
174
+ trainer.evaluate()
175
+
176
+ predictions = trainer.predict(eval_dataset)
177
+ print(predictions.predictions)
178
+ print(predictions.label_ids)
179
+
180
+ import numpy as np
181
+ from transformers import EvalPrediction
182
+ import evaluate # Using the evaluate library for metrics
183
+
184
+ # Load accuracy metric
185
+ accuracy_metric = evaluate.load("accuracy")
186
+
187
def compute_metrics(p):
    """Accuracy metric callback for Trainer.evaluate()/predict().

    Args:
        p: an object exposing `predictions` (logits, shape [N, num_classes])
           and `label_ids` (true classes, shape [N]) — e.g. a transformers
           EvalPrediction.

    Returns:
        dict with a single "accuracy" float in [0, 1], matching the shape
        of the result produced by `evaluate.load("accuracy").compute(...)`.
    """
    # Predicted class = argmax over the logit axis.
    preds = np.argmax(p.predictions, axis=1)
    # Compute accuracy directly with numpy. This removes the dependency on
    # the module-level `accuracy_metric` global from the `evaluate` package,
    # which this script only pip-installs in a *later* cell, so the original
    # would fail on a fresh runtime executed top-to-bottom.
    return {"accuracy": float(np.mean(preds == np.asarray(p.label_ids)))}
193
+
194
+ !pip install evaluate
195
+
196
+
197
+
198
+ from transformers import Trainer
199
+
200
+ trainer = Trainer(
201
+ model=model,
202
+ args=training_args,
203
+ train_dataset=train_dataset,
204
+ eval_dataset=eval_dataset,
205
+ compute_metrics=compute_metrics, # Assuming compute_metrics is defined and needed
206
+ )
207
+
208
+ trainer.train()
209
+
210
+ import wandb
211
+ import matplotlib.pyplot as plt
212
+ from sklearn.metrics import ConfusionMatrixDisplay
213
+
214
+ # After evaluation step, e.g. after trainer.evaluate()
215
+ eval_results = trainer.evaluate()
216
+
217
+ # Log scalar metrics explicitly
218
+ wandb.log(eval_results)
219
+
220
+ # For logging confusion matrix or any plot
221
def log_confusion_matrix(predictions, labels, class_names):
    """Render a confusion matrix and log it as an image to the active W&B run.

    Args:
        predictions: predicted class labels.
        labels: ground-truth class labels.
        class_names: display names for the classes, in label order.
    """
    figure, axis = plt.subplots(figsize=(8, 8))
    # sklearn draws onto our axis; note the (y_true, y_pred) argument order.
    ConfusionMatrixDisplay.from_predictions(
        labels, predictions, display_labels=class_names, ax=axis
    )
    wandb.log({"confusion_matrix": wandb.Image(figure)})
    # Close the figure so repeated calls don't accumulate open figures.
    plt.close(figure)
226
+
227
+ # Example usage after predictions
228
+ preds = trainer.predict(eval_dataset) # Corrected variable name
229
+ predicted_labels = preds.predictions.argmax(axis=-1)
230
+ true_labels = preds.label_ids
231
+ log_confusion_matrix(predicted_labels, true_labels, class_names=['No','Yes'])
232
+