ananthvk committed on
Commit
e1fbc2d
·
0 Parent(s):

add code + models

Browse files
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import pipeline
2
+ import gradio as gr
3
+
4
# Load the fine-tuned classifier once at start-up from the local "model" directory.
classifier = pipeline("text-classification", model="model")

# The model has 512 position embeddings (model/config.json) and training
# truncated reviews to 512 tokens, but the saved tokenizer config carries an
# effectively unbounded model_max_length, so truncation must be requested
# explicitly at inference time.
_MAX_TOKENS = 512


def detect_sentiment(text):
    """Classify a movie review as positive or negative.

    Args:
        text: The review text entered by the user. ``None`` is treated as an
            empty string.

    Returns:
        A single-entry dict mapping ``"positive"`` or ``"negative"`` to the
        model's confidence score (the format ``gr.Label`` expects), or the
        string ``"Internal error"`` if the pipeline returns no prediction.
    """
    # train.py lower-cases every review before tokenizing, while the saved
    # tokenizer has do_lower_case=false; lower-case here so inference sees
    # the same kind of input the model was trained on.
    normalized = (text or "").lower()

    # Truncate instead of letting over-long reviews overflow the model's
    # position embeddings.
    results = classifier(normalized, truncation=True, max_length=_MAX_TOKENS)
    if not results:
        return "Internal error"

    top = results[0]
    # The training labels were encoded as negative=0 / positive=1, which the
    # pipeline reports as LABEL_0 / LABEL_1.
    sentiment = "positive" if top['label'] == 'LABEL_1' else "negative"
    # Build a fresh dict rather than mutating the pipeline's result in place.
    return {sentiment: top['score']}


# Gradio UI: one free-text input, one label output showing the predicted class.
review = gr.Textbox()
label = gr.Label()

inter = gr.Interface(fn=detect_sentiment, inputs=review, outputs=label)
inter.launch(inline=False)
model/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_cross_attention": false,
3
+ "architectures": [
4
+ "BertForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": null,
8
+ "classifier_dropout": null,
9
+ "dtype": "float32",
10
+ "eos_token_id": null,
11
+ "hidden_act": "gelu",
12
+ "hidden_dropout_prob": 0.1,
13
+ "hidden_size": 256,
14
+ "initializer_range": 0.02,
15
+ "intermediate_size": 1024,
16
+ "is_decoder": false,
17
+ "layer_norm_eps": 1e-12,
18
+ "max_position_embeddings": 512,
19
+ "model_type": "bert",
20
+ "num_attention_heads": 4,
21
+ "num_hidden_layers": 4,
22
+ "pad_token_id": 0,
23
+ "problem_type": "single_label_classification",
24
+ "tie_word_embeddings": true,
25
+ "transformers_version": "5.0.0",
26
+ "type_vocab_size": 2,
27
+ "use_cache": false,
28
+ "vocab_size": 30522
29
+ }
model/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2eb775b9b1c93a1a39b1349fad782c1f9e8d177d377a04b67767f57fb90f47d1
3
+ size 44692592
model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
model/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "cls_token": "[CLS]",
4
+ "do_lower_case": false,
5
+ "is_local": false,
6
+ "mask_token": "[MASK]",
7
+ "model_max_length": 1000000000000000019884624838656,
8
+ "pad_token": "[PAD]",
9
+ "sep_token": "[SEP]",
10
+ "strip_accents": null,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "BertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }
model/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:47f425273f344ada539744ea93f17586d413b4d7d87f3ae6b26b884ccc75f266
3
+ size 5201
requirements-local.txt ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==24.1.0
2
+ annotated-doc==0.0.4
3
+ annotated-types==0.7.0
4
+ anyio==4.12.1
5
+ brotli==1.2.0
6
+ certifi==2026.2.25
7
+ click==8.3.1
8
+ fastapi==0.135.1
9
+ ffmpy==1.0.0
10
+ filelock==3.25.2
11
+ fsspec==2026.2.0
12
+ gradio==6.9.0
13
+ gradio_client==2.3.0
14
+ groovy==0.1.2
15
+ h11==0.16.0
16
+ hf-xet==1.4.2
17
+ httpcore==1.0.9
18
+ httpx==0.28.1
19
+ huggingface_hub==1.7.1
20
+ idna==3.11
21
+ Jinja2==3.1.6
22
+ markdown-it-py==4.0.0
23
+ MarkupSafe==3.0.3
24
+ mdurl==0.1.2
25
+ numpy==2.4.3
26
+ orjson==3.11.7
27
+ packaging==26.0
28
+ pandas==3.0.1
29
+ pillow==12.1.1
30
+ pydantic==2.12.5
31
+ pydantic_core==2.41.5
32
+ pydub==0.25.1
33
+ Pygments==2.19.2
34
+ python-dateutil==2.9.0.post0
35
+ python-multipart==0.0.22
36
+ pytz==2026.1.post1
37
+ PyYAML==6.0.3
38
+ regex==2026.2.28
39
+ rich==14.3.3
40
+ safehttpx==0.1.7
41
+ safetensors==0.7.0
42
+ semantic-version==2.10.0
43
+ shellingham==1.5.4
44
+ six==1.17.0
45
+ starlette==0.52.1
46
+ tokenizers==0.22.2
47
+ tomlkit==0.13.3
48
+ tqdm==4.67.3
49
+ transformers==5.3.0
50
+ typer==0.24.1
51
+ typing-inspection==0.4.2
52
+ typing_extensions==4.15.0
53
+ uvicorn==0.41.0
54
+ torch @ https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl#sha256=ee40b8a4b4b2cf0670c6fd4f35a7ef23871af956fecb238fbf5da15a72650b1d
55
+ torchvision @ https://download.pytorch.org/whl/cpu/torchvision-0.25.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl#sha256=c1be164e93c68b2dbf460fd58975377c892dbcf3358fb72941709c3857351bba
requirements.txt ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==24.1.0
2
+ annotated-doc==0.0.4
3
+ annotated-types==0.7.0
4
+ anyio==4.12.1
5
+ brotli==1.2.0
6
+ certifi==2026.2.25
7
+ click==8.3.1
8
+ fastapi==0.135.1
9
+ ffmpy==1.0.0
10
+ filelock==3.25.2
11
+ fsspec==2026.2.0
12
+ gradio==6.9.0
13
+ gradio_client==2.3.0
14
+ groovy==0.1.2
15
+ h11==0.16.0
16
+ hf-xet==1.4.2
17
+ httpcore==1.0.9
18
+ httpx==0.28.1
19
+ huggingface_hub==1.7.1
20
+ idna==3.11
21
+ Jinja2==3.1.6
22
+ markdown-it-py==4.0.0
23
+ MarkupSafe==3.0.3
24
+ mdurl==0.1.2
25
+ numpy==2.4.3
26
+ orjson==3.11.7
27
+ packaging==26.0
28
+ pandas==3.0.1
29
+ pillow==12.1.1
30
+ pydantic==2.12.5
31
+ pydantic_core==2.41.5
32
+ pydub==0.25.1
33
+ Pygments==2.19.2
34
+ python-dateutil==2.9.0.post0
35
+ python-multipart==0.0.22
36
+ pytz==2026.1.post1
37
+ PyYAML==6.0.3
38
+ regex==2026.2.28
39
+ rich==14.3.3
40
+ safehttpx==0.1.7
41
+ safetensors==0.7.0
42
+ semantic-version==2.10.0
43
+ shellingham==1.5.4
44
+ six==1.17.0
45
+ starlette==0.52.1
46
+ tokenizers==0.22.2
47
+ tomlkit==0.13.3
48
+ tqdm==4.67.3
49
+ transformers==5.3.0
50
+ typer==0.24.1
51
+ typing-inspection==0.4.2
52
+ typing_extensions==4.15.0
53
+ uvicorn==0.41.0
54
+ torch
55
+ torchvision
train.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """FastAI_04_NLP_IMDB_MoviesDataset.ipynb
3
+
4
+ Automatically generated by Colab.
5
+
6
+ """
7
+
8
+ # Set your KAGGLE_API_TOKEN
9
+
10
+ # !pip install kagglehub "kagglehub[pandas-datasets]" "transformers[torch]"
11
+
12
+ import kagglehub
13
+ from kagglehub import KaggleDatasetAdapter
14
+ from datasets import Dataset, DatasetDict
15
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer
16
+ from transformers import TrainingArguments, Trainer
17
+
18
# Fine-tune prajjwal1/bert-mini for binary sentiment classification on the
# IMDB 50k movie-review dataset, save the model, and report held-out accuracy.

# Imported up front (rather than after training) so that a missing dependency
# fails before the expensive training run, not after it.
import numpy as np
import torch
from sklearn.metrics import accuracy_score

file_path = "IMDB Dataset.csv"
model = 'prajjwal1/bert-mini'

bs = 64
epochs = 4
lr = 5e-5
args = TrainingArguments(
    'outputs',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    # Mixed precision is only enabled when a CUDA device is present, so the
    # script also runs (in fp32) on CPU-only machines.
    fp16=torch.cuda.is_available(),
    eval_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs * 2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to='none',
)

df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews",
    file_path,
)

# Encode the labels as integers. map() + astype() yields a real int64 column
# and raises on any unexpected label value instead of passing it through.
df["sentiment"] = df["sentiment"].map({
    "negative": 0,
    "positive": 1,
}).astype("int64")

# NOTE: reviews are lower-cased here, outside the tokenizer, so any code that
# runs inference with the saved model must lower-case its input as well.
df['review'] = df['review'].str.lower()

ds = Dataset.from_pandas(df)
tokz = AutoTokenizer.from_pretrained(model)
tokenized_ds = ds.map(
    lambda x: tokz(x["review"], truncation=True, max_length=512),
    batched=True,
)
tokenized_ds = tokenized_ds.rename_columns({'sentiment': 'labels', 'review': 'input'})
# 70/30 train/test split with a fixed seed for reproducibility.
dataset_dict = tokenized_ds.train_test_split(0.30, seed=2026)


def compute_metrics(eval_pred):
    """Return the accuracy of one evaluation pass as ``{"accuracy": value}``.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is
    the argmax over the logits' second axis.
    """
    logits, label_ids = eval_pred
    return {"accuracy": accuracy_score(label_ids, np.argmax(logits, axis=1))}


mdl = AutoModelForSequenceClassification.from_pretrained(model, num_labels=2)
trainer = Trainer(
    mdl,
    args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['test'],
    processing_class=tokz,
    # Report accuracy at every per-epoch evaluation, not just the loss.
    compute_metrics=compute_metrics,
)

trainer.train()

# To free GPU memory
# ===================
#del dataset_dict
#del trainer
#del mdl
#import gc
#gc.collect()
#import torch
#torch.cuda.empty_cache()
#torch.cuda.ipc_collect()
#!nvidia-smi

# Save the model
trainer.save_model("imdb_sentiment")
tokz.save_pretrained("imdb_sentiment")

#from google.colab import drive
#drive.mount('/content/drive')
#!cp -r ./imdb_sentiment/ /content/drive/MyDrive/imdb_sentiment

# Final accuracy on the held-out split.
predictions = trainer.predict(dataset_dict['test'])
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids
acc = accuracy_score(labels, preds)
print("Validation Accuracy:", acc)