jaksani1 commited on
Commit
9c8d06b
·
verified ·
1 Parent(s): 6ae34c1

Upload 4 files

Browse files
TeluguFineTunedModel.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
telugufinetunedmodel.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# -*- coding: utf-8 -*-
"""TeluguFineTunedModel.ipynb

Automatically generated by Colab.

Original file is located at
    https://colab.research.google.com/drive/1e6ZAY9LbNyAe__urbLAqPmxGQex8d8aw
"""

from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub (opens an interactive login prompt).
notebook_login()

# NOTE(review): `!pip install unsloth` is IPython shell magic and is a
# SyntaxError in a plain .py file. Install dependencies from the shell
# before running this script:
#   pip install unsloth

from unsloth import FastLanguageModel
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Load the raw dataset. Expected input columns: "text_column" (Telugu text)
# and "label_column" (class label); they are renamed to "text"/"label",
# the names the tokenization and training code below relies on.
df = pd.read_csv("Telugu.csv")  # Replace "Telugu.csv" with your filename
df = df.dropna()  # remove rows containing null values
df = df.rename(columns={"text_column": "text", "label_column": "label"})  # rename columns
print(df.head())  # Inspect the first few rows
27
+
28
from google.colab import drive

# Unmount any previously-mounted Drive so the remount below starts clean.
try:
    drive.flush_and_unmount()
    print('Drive unmounted')
except ValueError:
    # flush_and_unmount raises ValueError when nothing is mounted.
    pass

# Remount the drive
drive.mount('/content/drive')

# NOTE(review): this path reads from local /content, not the mounted Drive —
# replace with your path in Google Drive if the file lives there.
df = (
    pd.read_csv("/content/Telugu.csv")
    .dropna()  # remove null values
    .rename(columns={"text_column": "text", "label_column": "label"})
)
print(df.head())

# Wrap the dataframe as a HF Dataset, then split it 80% train / 20% test
# (fixed seed for reproducibility).
dataset = Dataset.from_pandas(df)
dataset = dataset.train_test_split(test_size=0.2, seed=42)
46
+
47
# Multilingual BERT covers Telugu; "xlm-roberta-base" is an alternative.
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenized_function(examples):
    """Tokenize one batch of examples for sequence classification.

    Fixes the original implementation, which ignored ``examples``, called an
    undefined ``preprocess_function``, and re-mapped the entire dataset on
    every batch (``dataset.map`` inside the map callback).
    """
    return tokenizer(examples["text"], max_length=128, truncation=True, padding="max_length")
53
+
54
def tokenize_fn(examples):
    """Tokenize a batch and attach integer class labels.

    The dataframe's columns were renamed to "text"/"label" above, so those
    are the keys indexed here (the original indexed 'te'/'en', which do not
    exist in this dataset). The model is a sequence classifier
    (``num_labels=2``), so ``labels`` must be the integer class ids — not a
    second tokenized sequence as the original produced.
    """
    model_inputs = tokenizer(
        examples["text"], max_length=128, truncation=True, padding="max_length"
    )
    model_inputs["labels"] = examples["label"]
    return model_inputs

# Tokenize both splits once; reused by the Trainer below.
tokenized_dataset = dataset.map(tokenize_fn, batched=True)
63
+
64
def compute_metrics(pred):
    """Return accuracy, F1, precision and recall for a binary classifier.

    ``pred`` is the object the Trainer passes at evaluation time:
    ``predictions`` holds per-class scores, ``label_ids`` the gold labels.
    """
    y_true = pred.label_ids
    # Predicted class = arg-max over the last (class) axis.
    y_pred = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='binary'
    )
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
75
+
76
# Base encoder with a freshly-initialized 2-class classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

# NOTE(review): `!pip install peft` is IPython shell magic and is a
# SyntaxError in a plain .py file; install peft from the shell instead:
#   pip install peft
from peft import LoraConfig, get_peft_model

# LoRA adapter configuration targeting BERT's attention/dense projections.
lora_config = LoraConfig(
    r=16,               # rank of the low-rank update matrices
    lora_alpha=32,      # scaling factor for the adapter update
    target_modules=["query", "key", "value", "dense"],
    lora_dropout=0.05,
    bias="none",
    task_type="SEQ_CLS",  # Specify task type as sequence classification
)

# Wrap the model with the adapter exactly ONCE. The original repeated the
# whole config + get_peft_model block verbatim, nesting a second adapter
# around the first.
model = get_peft_model(model, lora_config)
108
+
109
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",        # must match evaluation_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model="f1",   # Use F1 score to determine the best model
    report_to="none",             # Disable WANDB to avoid login issues
)

# Reuse the dataset tokenized above — the original re-ran dataset.map here,
# tokenizing everything a second time for no benefit.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# The original called save_model without ever training, persisting an
# untrained model. Fine-tune first, then save the best checkpoint.
trainer.train()
trainer.save_model("./my_colloquial_telugu_model")
134
+
135
from huggingface_hub import notebook_login

# Interactive Hub login (stores the token in the local HF credential cache).
notebook_login()

import os
from getpass import getpass

# Never hardcode tokens in source: the original assigned the literal
# placeholder string "ML_project_token" as its own value, which is not a
# valid token and would leak credentials if it were. Prompt for the token
# instead, and only when it is not already set in the environment.
if "ML_project_token" not in os.environ:
    os.environ["ML_project_token"] = getpass("Hugging Face token: ")
train-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16bd7d210710f152d20e9fa349c643c678d3aa09efb3afadb6898fa0d600a0f5
3
+ size 17192
validation-00000-of-00001.parquet ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d54570ed960f96dff1356307a1b0b3990d319c0dec0bfd1d7f109db2bb0059e
3
+ size 9085