Icarevic committed
Commit 3f29e4d · verified · 1 Parent(s): 611abf3

Upload 12 files
app.py ADDED
@@ -0,0 +1,156 @@
+ import gradio as gr
+ import joblib
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import json
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+
+ print("Starting the application...")
+
+ # --- Load the SVM pipeline ---
+ print("Loading the SVM pipeline...")
+ svm_pipeline = joblib.load("svm_pipeline.pkl")
+
+ # --- Load the vocabulary for the CNN and GRU models ---
+ print("Loading the vocabulary...")
+ with open("word2idx.json", "r", encoding="utf-8") as f:
+     word2idx = json.load(f)
+
+ # --- CNN model definition ---
+ class CNNModel(nn.Module):
+     def __init__(self, vocab_size, embed_dim=300, num_classes=3, kernel_sizes=(3, 4, 5), num_filters=128):
+         super(CNNModel, self).__init__()
+         self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
+         # One Conv2d per kernel size, spanning the full embedding width.
+         self.convs = nn.ModuleList([
+             nn.Conv2d(1, num_filters, (k, embed_dim)) for k in kernel_sizes
+         ])
+         self.dropout = nn.Dropout(0.5)
+         self.fc = nn.Linear(num_filters * len(kernel_sizes), num_classes)
+
+     def forward(self, x):
+         x = self.embedding(x).unsqueeze(1)  # (batch, 1, seq_len, embed_dim)
+         convs = [F.relu(conv(x)).squeeze(3) for conv in self.convs]
+         pools = [F.max_pool1d(c, c.size(2)).squeeze(2) for c in convs]
+         x = torch.cat(pools, 1)
+         x = self.dropout(x)
+         return self.fc(x)
+
+ # --- GRU model definition ---
+ class GRUModel(nn.Module):
+     def __init__(self, vocab_size, embed_dim=300, hidden_dim=256, num_layers=1, num_classes=3):
+         super(GRUModel, self).__init__()
+         self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
+         self.gru = nn.GRU(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True)
+         self.fc = nn.Linear(hidden_dim, num_classes)
+
+     def forward(self, x):
+         x = self.embedding(x)
+         _, h_n = self.gru(x)
+         # Classify from the final hidden state of the last GRU layer.
+         return self.fc(h_n[-1])
+
+ # --- Load the CNN and GRU models ---
+ vocab_size = len(word2idx) + 1  # +1 for the padding index 0
+ embed_dim = 300
+ num_classes = 3
+
+ print("Loading the CNN model...")
+ cnn_model = CNNModel(vocab_size, embed_dim, num_classes)
+ cnn_model.load_state_dict(torch.load("cnn_model.pt", map_location=torch.device("cpu")))
+ cnn_model.eval()
+
+ print("Loading the GRU model...")
+ gru_model = GRUModel(vocab_size, embed_dim, hidden_dim=256, num_layers=1, num_classes=num_classes)
+ gru_model.load_state_dict(torch.load("gru_model.pt", map_location=torch.device("cpu")))
+ gru_model.eval()
+
+ # --- Load the BERTić model and tokenizer ---
+ print("Loading the BERTić model and tokenizer...")
+ bert_tokenizer = AutoTokenizer.from_pretrained("my_finetuned_model")
+ bert_model = AutoModelForSequenceClassification.from_pretrained("my_finetuned_model")
+ bert_model.eval()
+
+ # --- Convert text to index tensors for the CNN and GRU ---
+ def text_to_indices(text, max_len=100):
+     tokens = text.lower().split()
+     print(f"Tokens: {tokens}")
+     # Out-of-vocabulary tokens map to 0, which is also the padding index.
+     indices = [word2idx.get(token, 0) for token in tokens]
+     print(f"Indices: {indices}")
+     # Pad or truncate to a fixed length of max_len.
+     if len(indices) < max_len:
+         indices += [0] * (max_len - len(indices))
+     else:
+         indices = indices[:max_len]
+     tensor = torch.tensor([indices], dtype=torch.long)
+     print(f"Tensor shape: {tensor.shape}")
+     return tensor
+
+ # --- Prediction functions ---
+
+ def predict_svm(text):
+     print(f"SVM prediction for text: {text}")
+     proba = svm_pipeline.predict_proba([text])[0]
+     pred = svm_pipeline.classes_[proba.argmax()]
+     print(f"SVM prediction: {pred}, confidence: {proba.max():.2f}")
+     return f"{pred} (p={proba.max():.2f})"
+
+ def predict_cnn(text):
+     print(f"CNN prediction for text: {text}")
+     with torch.no_grad():
+         inputs = text_to_indices(text)
+         outputs = cnn_model(inputs)
+         print(f"CNN output: {outputs}")
+         probs = F.softmax(outputs, dim=1)
+         pred = torch.argmax(probs, dim=1).item()
+         confidence = probs[0][pred].item()
+     print(f"CNN prediction: {pred}, confidence: {confidence:.2f}")
+     return f"{pred} (p={confidence:.2f})"
+
+ def predict_gru(text):
+     print(f"GRU prediction for text: {text}")
+     with torch.no_grad():
+         inputs = text_to_indices(text)
+         outputs = gru_model(inputs)
+         print(f"GRU output: {outputs}")
+         probs = F.softmax(outputs, dim=1)
+         pred = torch.argmax(probs, dim=1).item()
+         confidence = probs[0][pred].item()
+     print(f"GRU prediction: {pred}, confidence: {confidence:.2f}")
+     return f"{pred} (p={confidence:.2f})"
+
+ def predict_bert(text):
+     print(f"BERTić prediction for text: {text}")
+     inputs = bert_tokenizer(text, return_tensors="pt", truncation=True, padding=True)
+     with torch.no_grad():
+         outputs = bert_model(**inputs)
+     print(f"BERTić output logits: {outputs.logits}")
+     probs = F.softmax(outputs.logits, dim=1)
+     pred = torch.argmax(probs, dim=1).item()
+     confidence = probs[0][pred].item()
+     print(f"BERTić prediction: {pred}, confidence: {confidence:.2f}")
+     return f"{pred} (p={confidence:.2f})"
+
+ # --- Gradio interface ---
+ def predict_all(text):
+     return (
+         predict_svm(text),
+         predict_cnn(text),
+         predict_gru(text),
+         predict_bert(text),
+     )
+
+ demo = gr.Interface(
+     fn=predict_all,
+     inputs=gr.Textbox(lines=3, placeholder="Enter text to classify..."),
+     outputs=[
+         gr.Textbox(label="SVM (RBF)"),
+         gr.Textbox(label="CNN"),
+         gr.Textbox(label="GRU"),
+         gr.Textbox(label="BERTić")
+     ],
+     title="Text classification demo",
+     description="Predictions use the SVM, CNN, GRU and BERTić models."
+ )
+
+ if __name__ == "__main__":
+     demo.launch(share=True, debug=True)
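
With the Interface above running, the auto-generated endpoint can also be called programmatically. A minimal sketch using gradio_client, assuming the default local URL and the auto-named /predict endpoint (both are assumptions, not part of the upload):

# Minimal sketch: query the running demo remotely.
# The URL and api_name below are assumptions for a default local launch.
from gradio_client import Client

client = Client("http://127.0.0.1:7860/")
# predict_all returns four strings: the SVM, CNN, GRU, and BERTić results.
svm_out, cnn_out, gru_out, bert_out = client.predict(
    "Ovo je primjer teksta.",  # sample input text
    api_name="/predict",
)
print(svm_out, cnn_out, gru_out, bert_out)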
cnn_model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b26e145bac986a39cff1bc7f39e064f250c0dba7662f9088c8d496f38bbb63b1
+ size 13854439
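
The weight files are stored as Git LFS pointers: the repo tracks only the spec version, the sha256 oid, and the byte size, while the actual binary lives in LFS storage. A short sketch (the helper name is hypothetical) of verifying a downloaded blob against its pointer:

# Hypothetical helper: check a downloaded LFS object against its pointer fields.
import hashlib
from pathlib import Path

def verify_lfs_object(blob_path: str, expected_oid: str, expected_size: int) -> bool:
    data = Path(blob_path).read_bytes()
    return (len(data) == expected_size
            and hashlib.sha256(data).hexdigest() == expected_oid)

print(verify_lfs_object(
    "cnn_model.pt",
    "b26e145bac986a39cff1bc7f39e064f250c0dba7662f9088c8d496f38bbb63b1",
    13854439,
))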
gru_model.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8e574b173307f1d8eb5aeea8ef68d0d8b9e1e660fa00cb09387586281e483348
+ size 13721707
my_finetuned_model/config.json ADDED
@@ -0,0 +1,36 @@
+ {
+   "architectures": [
+     "BertForSequenceClassification"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "classifier_dropout": null,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "id2label": {
+     "0": "LABEL_0",
+     "1": "LABEL_1",
+     "2": "LABEL_2"
+   },
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "label2id": {
+     "LABEL_0": 0,
+     "LABEL_1": 1,
+     "LABEL_2": 2
+   },
+   "layer_norm_eps": 1e-12,
+   "max_position_embeddings": 512,
+   "model_type": "bert",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 0,
+   "position_embedding_type": "absolute",
+   "problem_type": "single_label_classification",
+   "torch_dtype": "float32",
+   "transformers_version": "4.52.4",
+   "type_vocab_size": 2,
+   "use_cache": true,
+   "vocab_size": 30522
+ }
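
This is a stock BERT-base classification config with three labels. A quick sketch of inspecting it via transformers' AutoConfig, assuming the script runs from the repo root so the relative path resolves:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("my_finetuned_model")
print(config.model_type)   # "bert"
print(config.num_labels)   # 3, derived from id2label
print(config.id2label)     # {0: "LABEL_0", 1: "LABEL_1", 2: "LABEL_2"}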
my_finetuned_model/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c9f9be5bc9757223ac3e314f7d10c36c0946894585e6c350188dcc4592515234
+ size 437961724
my_finetuned_model/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
my_finetuned_model/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
my_finetuned_model/tokenizer_config.json ADDED
@@ -0,0 +1,56 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "101": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "102": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "103": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "do_lower_case": true,
+   "extra_special_tokens": {},
+   "mask_token": "[MASK]",
+   "model_max_length": 512,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
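
The tokenizer config pins the standard uncased BERT special tokens and a 512-token limit. A short sketch, assuming the repo root as the working directory, of loading the tokenizer and checking those settings:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("my_finetuned_model")
print(tokenizer.cls_token, tokenizer.sep_token, tokenizer.model_max_length)
# Encode a sample; truncation enforces the 512-token limit above.
enc = tokenizer("Ovo je primjer teksta.", truncation=True, return_tensors="pt")
print(enc["input_ids"])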
my_finetuned_model/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ee4fb7862129aec04f692f0fed9a34edd7916d9024dd152c28a040631fd81c7c
+ size 5649
my_finetuned_model/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
svm_pipeline.pkl ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cce78663d1e601e069e87581f022bb2c4cdc82531ebcfc1b33d2343c40572350
+ size 2055480
word2idx.json ADDED
The diff for this file is too large to render. See raw diff