Spaces:
Sleeping
Sleeping
Commit ·
e1fbc2d
0
Parent(s):
add code + models
Browse files- .gitattributes +35 -0
- app.py +21 -0
- model/config.json +29 -0
- model/model.safetensors +3 -0
- model/tokenizer.json +0 -0
- model/tokenizer_config.json +14 -0
- model/training_args.bin +3 -0
- requirements-local.txt +55 -0
- requirements.txt +55 -0
- train.py +98 -0
.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
app.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import pipeline
|
| 2 |
+
import gradio as gr
|
| 3 |
+
|
| 4 |
+
classifier = pipeline("text-classification", model="model")

# Training truncated every review to 512 tokens, which is also the model's
# max_position_embeddings. The saved tokenizer config does not carry a usable
# model_max_length, so the limit has to be passed explicitly at inference time.
_MAX_TOKENS = 512


def detect_sentiment(text):
    """Classify the sentiment of a review.

    Args:
        text: Raw review text typed by the user. ``None`` is treated as an
            empty string.

    Returns:
        A one-entry dict mapping ``"positive"`` or ``"negative"`` to the
        pipeline's score for that prediction, or the string
        ``"Internal error"`` when the pipeline returns no prediction.
    """
    # The training script lower-cases reviews before tokenizing and the saved
    # tokenizer does not lower-case on its own, so mirror that step here.
    normalized = (text or "").lower()

    # Truncate exactly as in training; longer inputs would exceed the
    # model's position-embedding table.
    results = classifier(normalized, truncation=True, max_length=_MAX_TOKENS)
    if not results:
        return "Internal error"

    top = results[0]
    # LABEL_1 is the class trained as "positive"; anything else is negative.
    sentiment = "positive" if top['label'] == 'LABEL_1' else "negative"
    return {sentiment: top['score']}
|
| 16 |
+
|
| 17 |
+
# Input component: a text box whose content is passed to detect_sentiment.
review = gr.Textbox()

# Output component: receives whatever detect_sentiment returns.
label = gr.Label()

# Wire the input, the prediction function and the output together, then
# start the app.
inter = gr.Interface(
    fn=detect_sentiment,
    inputs=review,
    outputs=label,
)
inter.launch(inline=False)
|
model/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_cross_attention": false,
|
| 3 |
+
"architectures": [
|
| 4 |
+
"BertForSequenceClassification"
|
| 5 |
+
],
|
| 6 |
+
"attention_probs_dropout_prob": 0.1,
|
| 7 |
+
"bos_token_id": null,
|
| 8 |
+
"classifier_dropout": null,
|
| 9 |
+
"dtype": "float32",
|
| 10 |
+
"eos_token_id": null,
|
| 11 |
+
"hidden_act": "gelu",
|
| 12 |
+
"hidden_dropout_prob": 0.1,
|
| 13 |
+
"hidden_size": 256,
|
| 14 |
+
"initializer_range": 0.02,
|
| 15 |
+
"intermediate_size": 1024,
|
| 16 |
+
"is_decoder": false,
|
| 17 |
+
"layer_norm_eps": 1e-12,
|
| 18 |
+
"max_position_embeddings": 512,
|
| 19 |
+
"model_type": "bert",
|
| 20 |
+
"num_attention_heads": 4,
|
| 21 |
+
"num_hidden_layers": 4,
|
| 22 |
+
"pad_token_id": 0,
|
| 23 |
+
"problem_type": "single_label_classification",
|
| 24 |
+
"tie_word_embeddings": true,
|
| 25 |
+
"transformers_version": "5.0.0",
|
| 26 |
+
"type_vocab_size": 2,
|
| 27 |
+
"use_cache": false,
|
| 28 |
+
"vocab_size": 30522
|
| 29 |
+
}
|
model/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2eb775b9b1c93a1a39b1349fad782c1f9e8d177d377a04b67767f57fb90f47d1
|
| 3 |
+
size 44692592
|
model/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
model/tokenizer_config.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"backend": "tokenizers",
|
| 3 |
+
"cls_token": "[CLS]",
|
| 4 |
+
"do_lower_case": false,
|
| 5 |
+
"is_local": false,
|
| 6 |
+
"mask_token": "[MASK]",
|
| 7 |
+
"model_max_length": 1000000000000000019884624838656,
|
| 8 |
+
"pad_token": "[PAD]",
|
| 9 |
+
"sep_token": "[SEP]",
|
| 10 |
+
"strip_accents": null,
|
| 11 |
+
"tokenize_chinese_chars": true,
|
| 12 |
+
"tokenizer_class": "BertTokenizer",
|
| 13 |
+
"unk_token": "[UNK]"
|
| 14 |
+
}
|
model/training_args.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:47f425273f344ada539744ea93f17586d413b4d7d87f3ae6b26b884ccc75f266
|
| 3 |
+
size 5201
|
requirements-local.txt
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
aiofiles==24.1.0
|
| 2 |
+
annotated-doc==0.0.4
|
| 3 |
+
annotated-types==0.7.0
|
| 4 |
+
anyio==4.12.1
|
| 5 |
+
brotli==1.2.0
|
| 6 |
+
certifi==2026.2.25
|
| 7 |
+
click==8.3.1
|
| 8 |
+
fastapi==0.135.1
|
| 9 |
+
ffmpy==1.0.0
|
| 10 |
+
filelock==3.25.2
|
| 11 |
+
fsspec==2026.2.0
|
| 12 |
+
gradio==6.9.0
|
| 13 |
+
gradio_client==2.3.0
|
| 14 |
+
groovy==0.1.2
|
| 15 |
+
h11==0.16.0
|
| 16 |
+
hf-xet==1.4.2
|
| 17 |
+
httpcore==1.0.9
|
| 18 |
+
httpx==0.28.1
|
| 19 |
+
huggingface_hub==1.7.1
|
| 20 |
+
idna==3.11
|
| 21 |
+
Jinja2==3.1.6
|
| 22 |
+
markdown-it-py==4.0.0
|
| 23 |
+
MarkupSafe==3.0.3
|
| 24 |
+
mdurl==0.1.2
|
| 25 |
+
numpy==2.4.3
|
| 26 |
+
orjson==3.11.7
|
| 27 |
+
packaging==26.0
|
| 28 |
+
pandas==3.0.1
|
| 29 |
+
pillow==12.1.1
|
| 30 |
+
pydantic==2.12.5
|
| 31 |
+
pydantic_core==2.41.5
|
| 32 |
+
pydub==0.25.1
|
| 33 |
+
Pygments==2.19.2
|
| 34 |
+
python-dateutil==2.9.0.post0
|
| 35 |
+
python-multipart==0.0.22
|
| 36 |
+
pytz==2026.1.post1
|
| 37 |
+
PyYAML==6.0.3
|
| 38 |
+
regex==2026.2.28
|
| 39 |
+
rich==14.3.3
|
| 40 |
+
safehttpx==0.1.7
|
| 41 |
+
safetensors==0.7.0
|
| 42 |
+
semantic-version==2.10.0
|
| 43 |
+
shellingham==1.5.4
|
| 44 |
+
six==1.17.0
|
| 45 |
+
starlette==0.52.1
|
| 46 |
+
tokenizers==0.22.2
|
| 47 |
+
tomlkit==0.13.3
|
| 48 |
+
tqdm==4.67.3
|
| 49 |
+
transformers==5.3.0
|
| 50 |
+
typer==0.24.1
|
| 51 |
+
typing-inspection==0.4.2
|
| 52 |
+
typing_extensions==4.15.0
|
| 53 |
+
uvicorn==0.41.0
|
| 54 |
+
torch @ https://download.pytorch.org/whl/cpu/torch-2.10.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl#sha256=ee40b8a4b4b2cf0670c6fd4f35a7ef23871af956fecb238fbf5da15a72650b1d
|
| 55 |
+
torchvision @ https://download.pytorch.org/whl/cpu/torchvision-0.25.0%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl#sha256=c1be164e93c68b2dbf460fd58975377c892dbcf3358fb72941709c3857351bba
|
requirements.txt
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
aiofiles==24.1.0
|
| 2 |
+
annotated-doc==0.0.4
|
| 3 |
+
annotated-types==0.7.0
|
| 4 |
+
anyio==4.12.1
|
| 5 |
+
brotli==1.2.0
|
| 6 |
+
certifi==2026.2.25
|
| 7 |
+
click==8.3.1
|
| 8 |
+
fastapi==0.135.1
|
| 9 |
+
ffmpy==1.0.0
|
| 10 |
+
filelock==3.25.2
|
| 11 |
+
fsspec==2026.2.0
|
| 12 |
+
gradio==6.9.0
|
| 13 |
+
gradio_client==2.3.0
|
| 14 |
+
groovy==0.1.2
|
| 15 |
+
h11==0.16.0
|
| 16 |
+
hf-xet==1.4.2
|
| 17 |
+
httpcore==1.0.9
|
| 18 |
+
httpx==0.28.1
|
| 19 |
+
huggingface_hub==1.7.1
|
| 20 |
+
idna==3.11
|
| 21 |
+
Jinja2==3.1.6
|
| 22 |
+
markdown-it-py==4.0.0
|
| 23 |
+
MarkupSafe==3.0.3
|
| 24 |
+
mdurl==0.1.2
|
| 25 |
+
numpy==2.4.3
|
| 26 |
+
orjson==3.11.7
|
| 27 |
+
packaging==26.0
|
| 28 |
+
pandas==3.0.1
|
| 29 |
+
pillow==12.1.1
|
| 30 |
+
pydantic==2.12.5
|
| 31 |
+
pydantic_core==2.41.5
|
| 32 |
+
pydub==0.25.1
|
| 33 |
+
Pygments==2.19.2
|
| 34 |
+
python-dateutil==2.9.0.post0
|
| 35 |
+
python-multipart==0.0.22
|
| 36 |
+
pytz==2026.1.post1
|
| 37 |
+
PyYAML==6.0.3
|
| 38 |
+
regex==2026.2.28
|
| 39 |
+
rich==14.3.3
|
| 40 |
+
safehttpx==0.1.7
|
| 41 |
+
safetensors==0.7.0
|
| 42 |
+
semantic-version==2.10.0
|
| 43 |
+
shellingham==1.5.4
|
| 44 |
+
six==1.17.0
|
| 45 |
+
starlette==0.52.1
|
| 46 |
+
tokenizers==0.22.2
|
| 47 |
+
tomlkit==0.13.3
|
| 48 |
+
tqdm==4.67.3
|
| 49 |
+
transformers==5.3.0
|
| 50 |
+
typer==0.24.1
|
| 51 |
+
typing-inspection==0.4.2
|
| 52 |
+
typing_extensions==4.15.0
|
| 53 |
+
uvicorn==0.41.0
|
| 54 |
+
torch
|
| 55 |
+
torchvision
|
train.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
"""FastAI_04_NLP_IMDB_MoviesDataset.ipynb

Automatically generated by Colab.

Fine-tunes ``prajjwal1/bert-mini`` as a two-class sentiment classifier on the
Kaggle "IMDB Dataset of 50K Movie Reviews", saves the model and tokenizer to
``imdb_sentiment`` and prints the accuracy on the held-out split.
"""

# Set your KAGGLE_API_TOKEN

# !pip install kagglehub "kagglehub[pandas-datasets]" "transformers[torch]"

import kagglehub
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from kagglehub import KaggleDatasetAdapter
from sklearn.metrics import accuracy_score
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer

file_path = "IMDB Dataset.csv"
model = 'prajjwal1/bert-mini'

bs = 64
epochs = 4
lr = 5e-5
args = TrainingArguments(
    'outputs',
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    # Mixed precision needs a CUDA device; fall back to full precision
    # so the script still runs on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    eval_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to='none'
)


def compute_metrics(eval_pred):
    """Return the accuracy of the arg-max class against the true labels."""
    predicted = np.argmax(eval_pred.predictions, axis=1)
    return {"accuracy": accuracy_score(eval_pred.label_ids, predicted)}


df = kagglehub.dataset_load(
    KaggleDatasetAdapter.PANDAS,
    "lakshmi25npathi/imdb-dataset-of-50k-movie-reviews",
    file_path,
)

# Encode the labels as integers. map() + astype() yields an explicit int64
# column and raises on any unexpected label instead of passing it through.
df["sentiment"] = df["sentiment"].map({
    "negative": 0,
    "positive": 1
}).astype("int64")

# Reviews are lower-cased here, before tokenization; inference code must
# apply the same normalization.
df['review'] = df['review'].str.lower()

ds = Dataset.from_pandas(df)
tokz = AutoTokenizer.from_pretrained(model)
tokenized_ds = ds.map(lambda x: tokz(x["review"], truncation=True, max_length=512), batched=True)
tokenized_ds = tokenized_ds.rename_columns({'sentiment': 'labels', 'review': 'input'})
# Hold out 30% of the rows for evaluation; the seed keeps the split reproducible.
dataset_dict = tokenized_ds.train_test_split(0.30, seed=2026)


mdl = AutoModelForSequenceClassification.from_pretrained(model, num_labels=2)
trainer = Trainer(
    mdl,
    args,
    train_dataset=dataset_dict['train'],
    eval_dataset=dataset_dict['test'],
    processing_class=tokz,
    # Report accuracy at every per-epoch evaluation.
    compute_metrics=compute_metrics
)

trainer.train()

# To free GPU memory
# ===================
#del dataset_dict
#del trainer
#del mdl
#import gc
#gc.collect()
#import torch
#torch.cuda.empty_cache()
#torch.cuda.ipc_collect()
#!nvidia-smi

# Save the model
trainer.save_model("imdb_sentiment")
tokz.save_pretrained("imdb_sentiment")

#from google.colab import drive
#drive.mount('/content/drive')
#!cp -r ./imdb_sentiment/ /content/drive/MyDrive/imdb_sentiment

# Final accuracy check on the held-out split with the trained model.
predictions = trainer.predict(dataset_dict['test'])
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids
acc = accuracy_score(labels, preds)
print("Validation Accuracy:", acc)
|