Pranathi2612 commited on
Commit
257c574
·
verified ·
1 Parent(s): 4d1f5c5

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +78 -3
README.md CHANGED
@@ -1,3 +1,78 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ datasets:
4
+ - fka/awesome-chatgpt-prompts
5
+ language:
6
+ - hi
7
+ - ta
8
+ - ml
9
+ ---
10
import datasets
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Fine-tune a multilingual BERT classifier on a tiny corpus of colloquial
# Hindi / Tamil / Malayalam phrases, then save the model and tokenizer to
# ./my_colloquial_model.

# Step 1: Define your colloquial dataset.
# Sample conversational data in different languages (adjust based on your task).
data = {
    'text': [
        'kaise ho?',            # informal Hindi greeting
        'kya scene hai?',       # Hindi slang phrase
        'apne kahan jana hai?', # informal Hindi sentence
        'yentha vara',          # Tamil slang
        'mizhhi pidichu',       # Malayalam slang
        'enthu cheyyumo',       # Malayalam slang
        'uru kuthi',            # Tamil slang
        'ekdam mast',           # Hindi slang
    ],
    'label': [0, 1, 2, 3, 4, 4, 3, 1]  # Example labels for intent or sentiment
}

# Step 2: Convert data into Hugging Face Dataset format.
dataset = Dataset.from_dict(data)

# FIX: the original passed the same dataset as both train_dataset and
# eval_dataset, so "evaluation" just re-scored the training data. Hold out a
# portion for evaluation instead. seed pins the split for reproducibility.
split = dataset.train_test_split(test_size=0.25, seed=42)

# Step 3: Tokenize the data using a multilingual model tokenizer.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")


def tokenize_function(examples):
    """Tokenize a batch of raw texts to fixed-length, truncated input ids."""
    return tokenizer(examples['text'], padding="max_length", truncation=True)


# Apply tokenization to both splits.
train_dataset = split["train"].map(tokenize_function, batched=True)
eval_dataset = split["test"].map(tokenize_function, batched=True)

# Step 4: Load a pre-trained model for sequence classification.
# num_labels=5 matches the label ids 0..4 used above.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased", num_labels=5
)

# Step 5: Set up Trainer for fine-tuning the model.
training_args = TrainingArguments(
    output_dir='./results',          # Output directory to save model and logs
    evaluation_strategy="epoch",     # Evaluate after each epoch
    per_device_train_batch_size=8,   # Batch size during training
    per_device_eval_batch_size=8,    # Batch size during evaluation
    num_train_epochs=3,              # Number of epochs for training
    logging_dir='./logs',            # Log directory for training details
    logging_steps=10,                # Number of steps to log
)

# Initialize the Trainer with the held-out eval split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Step 6: Train the model.
trainer.train()

# Step 7: Save the trained model and tokenizer.
model.save_pretrained("./my_colloquial_model")
tokenizer.save_pretrained("./my_colloquial_model")

# Optional: Upload to Hugging Face
# Uncomment and use Hugging Face CLI to upload the model:
# !huggingface-cli login  # Log in to your Hugging Face account
# model.push_to_hub("my_colloquial_model")

print("Model training and saving complete.")