omernet committed on
Commit
1d80aa1
·
verified ·
1 Parent(s): 4228343

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +269 -0
app.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as st
2
+ import torch
3
+ import json
4
+ import requests
5
+ from transformers import (
6
+ AutoTokenizer,
7
+ AutoModelForSequenceClassification,
8
+ TrainingArguments,
9
+ Trainer,
10
+ DataCollatorWithPadding
11
+ )
12
+ from datasets import Dataset, DatasetDict
13
+ import numpy as np
14
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support
15
+ import os
16
+
17
# Page configuration.
# NOTE(review): `st` is bound to gradio by the import at the top of this file,
# yet set_page_config / session_state are Streamlit APIs - confirm that the
# intended import is `streamlit`.
st.set_page_config(
    page_title="Code Security Trainer",
    page_icon="🎓",
    layout="wide",
)

st.title("🎓 Code Security Model Trainer")
st.markdown("Interaktif model eğitim arayüzü - Kontrol sende!")

# Session state: seed every slot the app reads later, leaving any value that
# is already present untouched.
for _slot, _initial in (
    ('model', None),
    ('tokenizer', None),
    ('dataset', None),
    ('training_logs', []),
):
    if _slot not in st.session_state:
        st.session_state[_slot] = _initial
32
+
33
# Sidebar controls: the training hyperparameters read by the training tab,
# followed by a one-line model status indicator.
with st.sidebar:
    st.header("⚙️ Eğitim Ayarları")

    epochs = st.slider("Epoch sayısı", 5, 50, 20)
    learning_rate = st.select_slider(
        "Learning rate",
        options=[1e-5, 2e-5, 5e-5, 1e-4],
        value=2e-5,
    )
    batch_size = st.selectbox("Batch size", [2, 4, 8], index=1)

    st.markdown("---")
    st.header("📊 Durum")
    if not st.session_state.model:
        st.info("Model yüklenmedi")
    else:
        st.success("Model yüklendi")

# Main area: one tab per workflow step (dataset, training, test, save).
_TAB_TITLES = ["📥 Veri Seti", "🏋️ Eğitim", "🧪 Test", "💾 Kaydet"]
tab1, tab2, tab3, tab4 = st.tabs(_TAB_TITLES)
50
+
51
# Tab 1: dataset download (left column) and sample preview (right column).
with tab1:
    st.header("Veri Seti Yükle")

    col1, col2 = st.columns(2)

    with col1:
        st.subheader("HF'den İndir")
        if st.button("📥 HF Veri Setini İndir"):
            with st.spinner("İndiriliyor..."):
                try:
                    url = "https://huggingface.co/datasets/omernet/code-security-dataset/resolve/main/python_sql_20.jsonl"
                    # Bound the request so a stalled connection cannot hang
                    # the UI, and fail on HTTP errors instead of feeding an
                    # error page to the JSON parser.
                    response = requests.get(url, timeout=30)
                    response.raise_for_status()

                    # JSONL: one JSON object per non-blank line.
                    data = [
                        json.loads(line)
                        for line in response.text.strip().split('\n')
                        if line.strip()
                    ]

                    st.session_state.raw_data = data

                    # Summary of what was loaded, per label value.
                    st.success(f"{len(data)} örnek yüklendi!")
                    st.write(f"- Zafiyetli: {sum(1 for d in data if d['label'] == 1)}")
                    st.write(f"- Güvenli: {sum(1 for d in data if d['label'] == 0)}")

                    # Fixed positional split: first 14 records for training,
                    # the next 3 for validation, the remainder for testing.
                    train_data = data[:14]
                    val_data = data[14:17]
                    test_data = data[17:]

                    def create_dataset(examples):
                        """Build a Dataset with 'code' and 'label' columns from record dicts."""
                        return Dataset.from_dict({
                            'code': [e['code'] for e in examples],
                            'label': [e['label'] for e in examples],
                        })

                    st.session_state.dataset = DatasetDict({
                        'train': create_dataset(train_data),
                        'validation': create_dataset(val_data),
                        'test': create_dataset(test_data),
                    })

                    st.success("Veri seti hazır!")

                except Exception as e:
                    # UI boundary: report any download/parse failure to the
                    # user instead of crashing the page.
                    st.error(f"Hata: {e}")

    with col2:
        st.subheader("Örnekleri Gör")
        if st.session_state.get('raw_data'):
            sample_type = st.radio("Tür", ["Zafiyetli", "Güvenli"])
            # The first radio option maps to label 1, the other to label 0.
            label = 1 if sample_type == "Zafiyetli" else 0
            samples = [d for d in st.session_state.raw_data if d['label'] == label]

            if samples:
                selected = st.selectbox(
                    "Örnek seç",
                    range(len(samples)),
                    format_func=lambda i: f"Örnek {i+1}",
                )
                st.code(samples[selected]['code'], language='python')
109
+
110
# Tab 2: fine-tune the classifier (left column) and show test metrics (right).
with tab2:
    st.header("Model Eğitimi")

    col1, col2 = st.columns([1, 2])

    with col1:
        st.subheader("Başlat")

        if st.button("🚀 Eğitimi Başlat", type="primary"):
            if not st.session_state.get('dataset'):
                st.error("Önce veri setini indir!")
            else:
                with st.spinner("Model yükleniyor..."):
                    # Load the pretrained tokenizer and a 2-label
                    # classification head on top of the base model.
                    MODEL_NAME = "microsoft/codebert-base"
                    st.session_state.tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
                    st.session_state.model = AutoModelForSequenceClassification.from_pretrained(
                        MODEL_NAME, num_labels=2
                    )

                    def tokenize_function(examples):
                        """Tokenize a batch's 'code' column, padded/truncated to 512 tokens."""
                        # Every sequence is already padded to max_length here,
                        # so the DataCollatorWithPadding given to the Trainer
                        # below has nothing left to pad.
                        return st.session_state.tokenizer(
                            examples['code'],
                            padding='max_length',
                            truncation=True,
                            max_length=512
                        )

                    tokenized = st.session_state.dataset.map(tokenize_function, batched=True)
                    tokenized = tokenized.remove_columns(['code'])
                    tokenized = tokenized.rename_column('label', 'labels')
                    tokenized.set_format('torch')

                    st.session_state.tokenized_dataset = tokenized

                with st.spinner(f"Eğitim başlıyor ({epochs} epoch)..."):
                    def compute_metrics(eval_pred):
                        """Return accuracy, F1, precision and recall for (logits, labels)."""
                        logits, labels = eval_pred
                        predictions = np.argmax(logits, axis=-1)
                        # zero_division=0 yields the same 0.0 as the default
                        # when a class is never predicted, but without an
                        # undefined-metric warning on every evaluation of the
                        # very small validation split.
                        precision, recall, f1, _ = precision_recall_fscore_support(
                            labels, predictions, average='binary', zero_division=0
                        )
                        acc = accuracy_score(labels, predictions)
                        return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

                    training_args = TrainingArguments(
                        output_dir="./results",
                        learning_rate=learning_rate,
                        per_device_train_batch_size=batch_size,
                        per_device_eval_batch_size=batch_size,
                        num_train_epochs=epochs,
                        weight_decay=0.01,
                        evaluation_strategy="epoch",
                        save_strategy="epoch",
                        # A checkpoint is written every epoch; cap how many are
                        # kept so a long run does not fill the disk. The best
                        # checkpoint is still retained for
                        # load_best_model_at_end.
                        save_total_limit=1,
                        load_best_model_at_end=True,
                        metric_for_best_model="f1",
                        logging_dir='./logs',
                        logging_steps=1,
                        report_to="none"
                    )

                    trainer = Trainer(
                        model=st.session_state.model,
                        args=training_args,
                        train_dataset=tokenized['train'],
                        eval_dataset=tokenized['validation'],
                        tokenizer=st.session_state.tokenizer,
                        data_collator=DataCollatorWithPadding(st.session_state.tokenizer),
                        compute_metrics=compute_metrics,
                    )

                    # Train, then score the held-out test split.
                    trainer.train()

                    results = trainer.evaluate(tokenized['test'])
                    st.session_state.test_results = results

                st.success("Eğitim tamamlandı!")

    with col2:
        st.subheader("Sonuçlar")
        if st.session_state.get('test_results'):
            results = st.session_state.test_results

            # One metric tile per column; the keys are the compute_metrics
            # names with the "eval_" prefix found in the stored results.
            metric_tiles = (
                ("Accuracy", 'eval_accuracy'),
                ("F1 Score", 'eval_f1'),
                ("Precision", 'eval_precision'),
                ("Recall", 'eval_recall'),
            )
            for tile, (title, key) in zip(st.columns(4), metric_tiles):
                with tile:
                    st.metric(title, f"{results[key]:.2%}")
        else:
            st.info("Eğitim sonrası sonuçlar burada görünecek")
209
+
210
# Tab 3: classify a user-supplied code snippet with the model held in
# session state.
with tab3:
    st.header("Model Testi")

    if not st.session_state.get('model'):
        st.warning("Önce modeli eğit!")
    else:
        test_code = st.text_area("Test kodu", height=150, value="def login(u, p):\n query = f\"SELECT * FROM users WHERE name='{u}'\"\n return db.execute(query)")

        if st.button("🔍 Tahmin Et"):
            with st.spinner("Tahmin yapılıyor..."):
                model = st.session_state.model
                # Make inference mode explicit (disables dropout) rather than
                # relying on whatever mode the model was left in.
                model.eval()

                inputs = st.session_state.tokenizer(
                    test_code,
                    return_tensors="pt",
                    truncation=True,
                    max_length=512
                )
                # The Trainer may have moved the model to an accelerator;
                # put the inputs on the same device to avoid a mismatch.
                inputs = inputs.to(model.device)

                with torch.no_grad():
                    outputs = model(**inputs)
                    probabilities = torch.softmax(outputs.logits, dim=-1)
                    prediction = torch.argmax(probabilities, dim=-1).item()
                    # Probability assigned to the predicted class.
                    confidence = probabilities[0][prediction].item()

                if prediction == 1:
                    st.error(f"🔴 ZAFİYET TESPİT EDİLDİ (Güven: {confidence:.2%})")
                else:
                    st.success(f"🟢 GÜVENLİ (Güven: {confidence:.2%})")
238
+
239
# Tab 4: save the model locally and optionally upload it to the Hugging Face
# Hub.
with tab4:
    st.header("Modeli Kaydet")

    if not st.session_state.get('model'):
        st.warning("Önce modeli eğit!")
    else:
        # Shared by the local save and the upload so both use the same folder.
        MODEL_DIR = "./code-security-model"
        HUB_REPO_ID = "omernet/code-security-trained"

        if st.button("💾 Local Kaydet"):
            with st.spinner("Kaydediliyor..."):
                st.session_state.model.save_pretrained(MODEL_DIR)
                st.session_state.tokenizer.save_pretrained(MODEL_DIR)
            st.success("Model kaydedildi!")

        st.markdown("---")

        hf_token = st.text_input("HF Token (opsiyonel)", type="password")
        if st.button("☁️ Hugging Face'e Yükle"):
            if hf_token:
                with st.spinner("Yükleniyor..."):
                    from huggingface_hub import HfApi

                    try:
                        # Write the files first so the upload works even when
                        # the local-save button was never pressed.
                        st.session_state.model.save_pretrained(MODEL_DIR)
                        st.session_state.tokenizer.save_pretrained(MODEL_DIR)

                        # Pass the token to this client only; a global login
                        # would persist the user's token on the host.
                        api = HfApi(token=hf_token)
                        api.create_repo(repo_id=HUB_REPO_ID, exist_ok=True)
                        api.upload_folder(
                            folder_path=MODEL_DIR,
                            repo_id=HUB_REPO_ID
                        )
                    except Exception as e:
                        # UI boundary: surface upload failures (bad token,
                        # network, permissions) the same way the dataset tab
                        # reports its errors.
                        st.error(f"Hata: {e}")
                    else:
                        st.success("HF'e yüklendi!")
            else:
                st.error("HF token gerekli!")