SoniR committed on
Commit 0b0afda · 1 Parent(s): 1768d68

Create app.py

Files changed (1)
  1. app.py +112 -0
app.py ADDED
@@ -0,0 +1,112 @@
+ # -*- coding: utf-8 -*-
+ """LLM.ipynb
+
+ Automatically generated by Colaboratory.
+
+ Original file is located at
+ https://colab.research.google.com/drive/1AbtqagXl-cWKhXqd5z_uxZ2_JiCI4e6q
+ """
+
+ # Step 1: Setup (the Trainer API needs accelerate>=0.20.1)
+ # !pip install transformers[torch] datasets accelerate --upgrade
+
+ import transformers
+ import accelerate
+ print("Transformers version:", transformers.__version__)
+ print("Accelerate version:", accelerate.__version__)
+
+ # Step 2: Mount Google Drive to access your data
+ from google.colab import drive
+ drive.mount('/content/drive')
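+ # After mounting, Drive files are visible under /content/drive/My Drive/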
+
+ # Step 3: Import necessary libraries
+ import torch
+ from datasets import load_dataset, set_caching_enabled
+
+ # Disable dataset caching so edits to the source file are picked up
+ # (newer datasets versions use datasets.disable_caching() instead)
+ set_caching_enabled(False)
+
+ # Replace the path below with the actual path of your text file in Google Drive
+ dataset = load_dataset('text', data_files={'train': '/content/drive/My Drive/Ethics.txt'})
+
+ # Split the 'train' dataset into training and validation sets (90/10)
+ train_size = int(len(dataset['train']) * 0.9)
+ train_dataset = dataset['train'].select(range(train_size))
+ validation_dataset = dataset['train'].select(range(train_size, len(dataset['train'])))
+
+ print("Train dataset size:", len(train_dataset))
+ print("Validation dataset size:", len(validation_dataset))
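+
+ # Note: the datasets library can also do this split in one call; an
+ # equivalent sketch (assuming the same 10% validation share) would be:
+ #   splits = dataset['train'].train_test_split(test_size=0.1, shuffle=False)
+ #   train_dataset, validation_dataset = splits['train'], splits['test']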
+
+ # Step 4: Tokenization
+ from transformers import GPT2Tokenizer
+ tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+ tokenizer.pad_token = tokenizer.eos_token  # GPT-2 ships without a pad token
+
+ def tokenize_function(examples):
+     return tokenizer(examples["text"], truncation=True, max_length=512)
+
+ # Tokenize both splits with a single worker (num_proc=1), so the
+ # Trainer below actually receives the tokenized data
+ tokenized_train = train_dataset.map(
+     tokenize_function,
+     batched=True,
+     num_proc=1,
+     remove_columns=["text"],
+ )
+ tokenized_validation = validation_dataset.map(
+     tokenize_function,
+     batched=True,
+     num_proc=1,
+     remove_columns=["text"],
+ )
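+
+ # Optional refinement (a sketch, not part of the original script): causal-LM
+ # fine-tuning often concatenates the tokenized lines and re-chunks them into
+ # fixed-size blocks so that no text is lost to truncation, e.g.:
+ #   def group_texts(examples, block_size=128):
+ #       ids = sum(examples["input_ids"], [])
+ #       total = (len(ids) // block_size) * block_size
+ #       return {"input_ids": [ids[i:i + block_size] for i in range(0, total, block_size)]}
+ #   tokenized_train = tokenized_train.map(group_texts, batched=True,
+ #                                         remove_columns=tokenized_train.column_names)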
+
+ # Step 5: Model Preparation
+ from transformers import GPT2LMHeadModel
+ model = GPT2LMHeadModel.from_pretrained('gpt2')
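+ # GPT2LMHeadModel is the base GPT-2 transformer with a language-modeling
+ # head on top, so the pretrained weights can be fine-tuned for
+ # next-token prediction on the custom text.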
+
+ # Step 6: Training
+ from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
+
+ # The collator pads each batch and copies input_ids into labels,
+ # which is what GPT2LMHeadModel expects for the causal-LM loss
+ data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+ training_args = TrainingArguments(
+     output_dir="./results",
+     overwrite_output_dir=True,
+     num_train_epochs=3,
+     per_device_train_batch_size=1,
+     save_steps=10_000,
+     save_total_limit=2,
+ )
+
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=tokenized_train,
+     eval_dataset=tokenized_validation,
+     data_collator=data_collator,
+ )
+
+ trainer.train()
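+
+ # Optional: check validation loss after training
+ # eval_metrics = trainer.evaluate()
+ # print(eval_metrics)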
+
+ # Step 7: Save the Model (and tokenizer, so both can be reloaded together)
+ model.save_pretrained("fine_tuned_model")
+ tokenizer.save_pretrained("fine_tuned_model")
+
+ # Step 8: Testing
+ from transformers import pipeline
+
+ # Reuse the fine-tuned model and its tokenizer in a text-generation pipeline
+ generator = pipeline('text-generation', model=model, tokenizer=tokenizer)
+
+ # Generate text using the pipeline
+ result = generator('My custom model says,')[0]
+ print(result['generated_text'])
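+
+ # Sampling parameters are forwarded to generate(); a hedged example
+ # (values are illustrative, not from the original script):
+ # result = generator('My custom model says,',
+ #                    max_new_tokens=50, do_sample=True, top_k=50)[0]
+ # print(result['generated_text'])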