Spaces: Build error
Emanuele Mercadante committed
Commit · 1ce3fe8
Parent(s): 6d08e25
first commit
- Dockerfile +27 -0
- model/model_description.txt +0 -0
- requirements.txt +3 -0
- train_model.py +65 -0
Dockerfile
ADDED
# Use the official Hugging Face base image for PyTorch with CUDA support
FROM huggingface/transformers-pytorch-cuda:latest

# Install system dependencies
RUN apt-get update && apt-get install -y \
    git \
    wget \
    && rm -rf /var/lib/apt/lists/*

# Set environment variables for the token and the user
# (note: a token baked into an image layer is visible to anyone who can pull the image)
ENV HUGGINGFACE_TOKEN=hf_1234567890abcdef1234567890abcdef12345678
ENV HUGGINGFACE_USER=Rathalos
ENV HUGGINGFACE_REPO=training_incite

# Clone the private repository
RUN git clone https://${HUGGINGFACE_TOKEN}@huggingface.co/spaces/${HUGGINGFACE_USER}/${HUGGINGFACE_REPO}.git

# Copy requirements.txt and install the Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the entire contents of the current directory into the container's /app directory
COPY . /app
WORKDIR /app

# Command to run the training script
CMD ["python", "train_model.py"]
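As an aside (not part of this commit), a script running in this image could read the credentials from the environment variables set above instead of hardcoding them; a minimal Python sketch, assuming the same variable names:

import os

# Read the values configured via ENV in the Dockerfile above.
hf_token = os.environ.get("HUGGINGFACE_TOKEN")
hf_user = os.environ.get("HUGGINGFACE_USER")
hf_repo = os.environ.get("HUGGINGFACE_REPO")

# Fail loudly if the token was not provided at build or run time.
if hf_token is None:
    raise RuntimeError("HUGGINGFACE_TOKEN is not set")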
model/model_description.txt
ADDED
File without changes
requirements.txt
ADDED
torch
transformers
datasets
train_model.py
ADDED
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM

class TextDataset(Dataset):
    def __init__(self, text, tokenizer, max_length):
        self.tokenizer = tokenizer
        # Tokenize the full text once and split it into fixed-length blocks,
        # so each dataset item is one training sequence of max_length tokens.
        token_ids = tokenizer(text, return_tensors='pt').input_ids.squeeze(0)
        n_blocks = max(1, token_ids.size(0) // max_length)
        self.blocks = [token_ids[i * max_length:(i + 1) * max_length] for i in range(n_blocks)]

    def __len__(self):
        return len(self.blocks)

    def __getitem__(self, idx):
        return self.blocks[idx]

def main():
    # Hyperparameters
    max_length = 512
    batch_size = 32
    epochs = 3
    learning_rate = 5e-5

    # File path
    text_file_path = 'path/to/your/text/file.txt'  # Adjust this path

    # Load text data
    with open(text_file_path, 'r', encoding='utf-8') as file:
        text = file.read()

    # Load tokenizer and model
    model_name = "togethercomputer/RedPajama-INCITE-Chat-3B-v1"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Preprocess data
    dataset = TextDataset(text, tokenizer, max_length)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Setup device
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Setup optimizer
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    # Training loop: for a causal LM, passing labels=inputs makes the model
    # compute the shifted next-token prediction loss internally.
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        model.train()
        for batch in dataloader:
            inputs = batch.to(device)
            outputs = model(inputs, labels=inputs)
            loss = outputs.loss
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            print(f"Loss: {loss.item()}")

    # Save the model
    model_save_path = 'model'  # Adjust this path
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

if __name__ == '__main__':
    main()
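As an illustrative sketch (not part of this commit), the checkpoint that train_model.py writes to the model directory could be loaded back for a quick generation check; the <human>/<bot> prompt format below is an assumption based on the chat variant of the base model and may need adjusting:

from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the fine-tuned weights and tokenizer saved by train_model.py.
tokenizer = AutoTokenizer.from_pretrained("model")
model = AutoModelForCausalLM.from_pretrained("model")

# Generate a short completion from a chat-style prompt.
prompt = "<human>: Hello, how are you?\n<bot>:"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))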