Firmansyah-Ibrahim committed on
Commit
51a2583
·
verified ·
1 Parent(s): e1efb14

Create codekaggle/ .qa_generator_BSE_C1_C2

Browse files
Files changed (1) hide show
  1. codekaggle/ .qa_generator_BSE_C1_C2 +323 -0
codekaggle/ .qa_generator_BSE_C1_C2 ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # INDO-BLOOM LOCAL QA GENERATOR v1.0 — KAGGLE GPU (NO API KEY)
3
+ #
4
+ # Model : Qwen/Qwen2.5-3B-Instruct (lokal, gratis, tanpa rate limit)
5
+ # GPU : Kaggle T4 (16GB VRAM) — aktifkan di Settings → Accelerator → GPU T4
6
+ # Input : CSV hasil IBEX (kolom 'context')
7
+ # Output : CSV QA pairs C1 + C2 siap pakai sebagai Indo-Bloom corpus
8
+ #
9
+ # CARA PAKAI:
10
+ # 1. Buka Kaggle Notebook → Settings → Accelerator → pilih "GPU T4 x2" atau "GPU T4"
11
+ # 2. Upload CSV hasil IBEX sebagai dataset input
12
+ # 3. Jalankan cell ini — model otomatis diunduh (~6GB, sekali saja)
13
+ # 4. Selesai! Tidak ada API key, tidak ada rate limit.
14
+ # =============================================================================
15
+
16
+ import subprocess, sys
17
+
18
# Install the packages this script imports. check=False keeps the step
# best-effort, so a failed install is reported below instead of raising.
_pip = subprocess.run(
    [sys.executable, "-m", "pip", "install", "-q",
     "transformers", "accelerate", "torch"],
    check=False,
)
if _pip.returncode != 0:
    print(f"⚠️ pip install gagal (kode {_pip.returncode}); "
          "melanjutkan dengan paket yang sudah terpasang.")
21
+
22
+ import os, json, re, hashlib, time
23
+ import pandas as pd
24
+ import torch
25
+ from transformers import AutoTokenizer, AutoModelForCausalLM
26
+
27
+ # ══════════════════════════════════════════════════════════════════════════════
28
+ # KONFIGURASI
29
+ # ══════════════════════════════════════════════════════════════════════════════
30
+
31
# Hugging Face model id used for local generation.
MODEL_NAME = "Qwen/Qwen2.5-3B-Instruct"
# Input CSV; the main loop reads its 'chunk_id' and 'context' columns.
FILE_PATH = "/kaggle/input/datasets/baimfirmansyah/sosiologi-bs-kls-x11-fulhalaman/IBEX_Sosiologi_BS_KLS_XII_hal15-240_chunk150_noise2_FULL.csv"
# Accepted QA rows are written here (also read back to resume a run).
OUTPUT_FILE = "/kaggle/working/IndoBloom_QA_Local_Final.csv"
# Rejected items and parse failures are written here.
ERROR_FILE = "/kaggle/working/IndoBloom_QA_Local_Errors.csv"

N_C1_PER_CHUNK = 2      # number of C1 QA pairs requested per chunk
N_C2_PER_CHUNK = 2      # number of C2 QA pairs requested per chunk
MAX_NEW_TOKENS = 600    # passed to model.generate(max_new_tokens=...)
TEMPERATURE = 0.7       # passed to model.generate(temperature=...)
40
+
41
+ # ══════════════════════════════════════════════════════════════════════════════
42
+ # LOAD MODEL
43
+ # ══════════════════════════════════════════════════════════════════════════════
44
+
45
print("=" * 60)
print(f"🤖 Memuat model: {MODEL_NAME}")
print(" (proses ini ~2-5 menit pertama kali, lalu cached)")
print("=" * 60)

# Use CUDA when torch can see a GPU; otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"⚙️ Device: {device.upper()}")

# NOTE(review): trust_remote_code=True allows code shipped in the model repo
# to run locally — confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    # Half precision on GPU, full precision on CPU.
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
    device_map="auto",
    trust_remote_code=True,
)
model.eval()  # inference only
print(f"✅ Model siap di {device.upper()}")
62
+
63
+ # ══════════════════════════════════════════════════════════════════════════════
64
+ # PROMPT TEMPLATES
65
+ # ══════════════════════════════════════════════════════════════════════════════
66
+
67
# System message sent with every request: sets the question-writer role and
# asks for valid JSON output.
SYSTEM_MSG = (
    "Anda adalah pakar pembuatan soal Taksonomi Bloom Bahasa Indonesia. "
    "Tugas Anda membuat soal yang tepat sesuai level kognitif yang diminta. "
    "Selalu kembalikan output dalam format JSON yang valid."
)
72
+
73
def prompt_c1(konteks: str, n: int) -> str:
    """Build the user prompt requesting ``n`` C1 (Remembering) QA pairs.

    Args:
        konteks: Source passage; embedded verbatim between triple quotes.
        n: Number of question-answer pairs to ask for.

    Returns:
        The prompt text, ending with the JSON shape (top-level key ``c1``)
        the model is told to emit.
    """
    return (
        f"Bacalah teks berikut:\n\"\"\"{konteks}\"\"\"\n\n"
        f"Buat {n} pasang soal-jawaban level C1 (Mengingat).\n"
        "Ketentuan C1:\n"
        "- Pertanyaan diawali: apa, siapa, kapan, di mana, atau berapa\n"
        "- Jawaban berupa fakta eksplisit dari teks (maks 15 kata)\n\n"
        "Output JSON (tanpa teks lain):\n"
        '{"c1": [{"question": "...", "answer": "..."}, ...]}'
    )
83
+
84
def prompt_c2(konteks: str, n: int) -> str:
    """Build the user prompt requesting ``n`` C2 (Understanding) QA pairs.

    Args:
        konteks: Source passage; embedded verbatim between triple quotes.
        n: Number of question-answer pairs to ask for.

    Returns:
        The prompt text, ending with the JSON shape (top-level key ``c2``)
        the model is told to emit.
    """
    return (
        f"Bacalah teks berikut:\n\"\"\"{konteks}\"\"\"\n\n"
        f"Buat {n} pasang soal-jawaban level C2 (Memahami).\n"
        "Ketentuan C2:\n"
        "- Pertanyaan WAJIB diawali: mengapa atau bagaimana\n"
        "- Jawaban menjelaskan sebab-akibat/proses, min 20 kata\n"
        "- Jawaban HARUS mengandung kata: karena/sehingga/mengakibatkan/berdampak\n"
        "- Jawaban dengan bahasa sendiri, BUKAN copy-paste teks\n\n"
        "Output JSON (tanpa teks lain):\n"
        '{"c2": [{"question": "...", "answer": "..."}, ...]}'
    )
96
+
97
+ # ══════════════════════════════════════════════════════════════════════════════
98
+ # FUNGSI GENERATE
99
+ # ══════════════════════════════════════════════════════════════════════════════
100
+
101
def _extract_json(raw: str) -> dict | None:
    """Parse the span from the first '{' to the last '}' in *raw*.

    Returns the decoded dict, or None when there is no such span or it is
    not valid JSON even after removing trailing commas.
    """
    start = raw.find("{")
    end = raw.rfind("}") + 1
    if start == -1 or end <= start:
        # No opening brace, or no closing brace after it.
        return None

    candidate = raw[start:end]
    # Second attempt drops commas that sit directly before a closing
    # bracket/brace, which json.loads rejects.
    attempts = (candidate, re.sub(r',\s*([}\]])', r'\1', candidate))
    for text in attempts:
        try:
            parsed = json.loads(text)
        except (ValueError, RecursionError):
            continue
        return parsed if isinstance(parsed, dict) else None
    return None


def generate_json(user_prompt: str) -> dict | None:
    """Run the local model on *user_prompt* and parse JSON from its reply.

    The prompt is wrapped with SYSTEM_MSG through the tokenizer's chat
    template, sampled with TEMPERATURE for at most MAX_NEW_TOKENS new tokens,
    and only the newly generated tokens are decoded.

    Args:
        user_prompt: Text of the user turn.

    Returns:
        The parsed JSON object, or None when no parseable object is found.
    """
    messages = [
        {"role": "system", "content": SYSTEM_MSG},
        {"role": "user", "content": user_prompt},
    ]

    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Send inputs to the device the model itself reports, rather than the
    # global device string, since the model is loaded with device_map="auto".
    inputs = tokenizer([text], return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Keep only the generated continuation (drop the echoed prompt tokens).
    prompt_len = inputs["input_ids"].shape[-1]
    raw = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()

    return _extract_json(raw)
141
+
142
+
143
def validasi_c1(q: str, a: str) -> tuple[bool, str]:
    """Check a C1 (Remembering) question-answer pair.

    The question must begin (case-insensitively) with one of the C1 question
    words; this is a plain prefix match, so e.g. "apakah" also passes. The
    answer must contain between 2 and 20 whitespace-separated words.
    NOTE(review): the C1 prompt asks for at most 15 words while this check
    accepts up to 20 — confirm the slack is intended.

    Args:
        q: Question text; None and surrounding whitespace are tolerated.
        a: Answer text; None and surrounding whitespace are tolerated.

    Returns:
        ``(True, "OK")`` when valid, otherwise ``(False, reason)``.
    """
    q = str(q or "").strip()
    a = str(a or "").strip()

    starters = ("apa", "siapa", "kapan", "di mana", "dimana", "berapa")
    if not q.lower().startswith(starters):
        return False, f"Tidak diawali kata tanya C1 (mulai: '{q[:30]}')"

    n_words = len(a.split())
    if n_words > 20:
        return False, f"Jawaban C1 terlalu panjang ({n_words} kata)"
    if n_words < 2:
        return False, "Jawaban terlalu pendek"
    return True, "OK"
152
+
153
+
154
def validasi_c2(q: str, a: str) -> tuple[bool, str]:
    """Check a C2 (Understanding) question-answer pair.

    The question must begin (case-insensitively) with "mengapa" or
    "bagaimana", the answer must have at least 20 whitespace-separated words,
    and the answer must contain at least one causal marker as a substring.

    Args:
        q: Question text; None and surrounding whitespace are tolerated.
        a: Answer text; None and surrounding whitespace are tolerated.

    Returns:
        ``(True, "OK")`` when valid, otherwise ``(False, reason)``.
    """
    q = str(q or "").strip()
    a = str(a or "").strip()

    if not q.lower().startswith(("mengapa", "bagaimana")):
        return False, f"Tidak diawali 'Mengapa'/'Bagaimana' (mulai: '{q[:30]}')"

    n_words = len(a.split())
    if n_words < 20:
        return False, f"Jawaban terlalu pendek ({n_words} kata, min 20)"

    kausal = ('karena', 'sehingga', 'mengakibatkan', 'berdampak',
              'akibatnya', 'dampaknya', 'disebabkan', 'mendorong', 'menyebabkan')
    jawaban = a.lower()  # lower-case once for all marker checks
    if not any(k in jawaban for k in kausal):
        return False, "Tidak ada penanda kausal"
    return True, "OK"
164
+
165
+
166
def _proses_level(chunk_id, uid, konteks, data, level, label, answer_type, validator):
    """Convert one parsed model reply into (valid_rows, error_rows) for *level*."""
    key = level.lower()
    valid = []
    errors = []

    def tolak(alasan, q="", a=""):
        # Error rows keep at most 100 characters of the question and answer.
        errors.append({"chunk_id": chunk_id, "level": level,
                       "alasan": alasan, "q": q[:100], "a": a[:100]})

    if not data or key not in data:
        tolak(f"Gagal parse JSON atau key '{key}' tidak ada")
        return valid, errors

    items = data[key]
    if not isinstance(items, list):
        tolak(f"Nilai key '{key}' bukan list")
        return valid, errors

    for item in items:
        if not isinstance(item, dict):
            # The reply is model-generated, so items may be malformed.
            tolak("Item bukan objek JSON", q=str(item))
            continue
        # `or ""` also covers JSON null, which .get() returns as None.
        q = str(item.get("question") or "").strip()
        a = str(item.get("answer") or "").strip()
        ok, alasan = validator(q, a)
        if not ok:
            tolak(alasan, q, a)
            continue
        valid.append({
            # Sequence suffix keeps ids unique when several rows of the same
            # level come from one chunk.
            "id": f"BSE-SOS-12-{chunk_id}-{uid}-{level}-{len(valid) + 1}",
            "chunk_id": chunk_id,
            "bloom_level": level,
            "bloom_label": label,
            "answer_type": answer_type,
            "question": q,
            "answer": a,
            "answer_words": len(a.split()),
            "context": konteks,
        })
    return valid, errors


def proses_chunk(chunk_id: str, konteks: str) -> tuple[list, list]:
    """Generate and validate C1 and C2 QA pairs for one chunk.

    For each level the model is prompted once; every returned item is
    validated and routed to either the valid rows or the error rows. C1
    results precede C2 results in both lists.

    Args:
        chunk_id: Identifier of the chunk, copied into every row.
        konteks: Chunk text used as the prompt passage and stored per row.

    Returns:
        ``(valid_rows, error_rows)`` as two lists of dicts.
    """
    # Short content hash used as part of each row id.
    uid = hashlib.md5(konteks.encode()).hexdigest()[:8]
    valid = []
    errors = []

    levels = (
        ("C1", "Mengingat (Remembering)", "extractive",
         prompt_c1, N_C1_PER_CHUNK, validasi_c1),
        ("C2", "Memahami (Understanding)", "abstractive",
         prompt_c2, N_C2_PER_CHUNK, validasi_c2),
    )
    for level, label, answer_type, buat_prompt, jumlah, validator in levels:
        data = generate_json(buat_prompt(konteks, jumlah))
        baris_valid, baris_error = _proses_level(
            chunk_id, uid, konteks, data, level, label, answer_type, validator
        )
        valid.extend(baris_valid)
        errors.extend(baris_error)

    return valid, errors
228
+
229
+
230
+ # ══════════════════════════════════════════════════════════════════════════════
231
+ # BACA INPUT & RESUME
232
+ # ══════════════════════════════════════════════════════════════════════════════
233
+
234
# Fail early with a real exception; `assert` is stripped under `python -O`.
if not os.path.exists(FILE_PATH):
    raise FileNotFoundError(f"File tidak ditemukan: {FILE_PATH}")
df_input = pd.read_csv(FILE_PATH)

# The main loop reads these two columns from every row.
_kolom_hilang = {"chunk_id", "context"} - set(df_input.columns)
if _kolom_hilang:
    raise KeyError(f"Kolom wajib tidak ada di CSV input: {sorted(_kolom_hilang)}")
print(f"\n✅ {len(df_input)} chunk dimuat dari {os.path.basename(FILE_PATH)}")
237
+
238
+ # Resume
239
def muat_output(path: str) -> tuple[set, list]:
    """Load an earlier output CSV so an interrupted run can resume.

    Args:
        path: Location of the output CSV from a previous run.

    Returns:
        ``(chunk_ids, rows)``: the set of chunk ids already present (as
        strings) and the stored rows as a list of dicts. Both are empty when
        the file is missing, empty, unreadable, or has no ``chunk_id`` column.
    """
    if not os.path.exists(path) or os.path.getsize(path) == 0:
        return set(), []

    try:
        # The script writes its output with utf-8-sig, so read it the same way.
        df = pd.read_csv(path, encoding="utf-8-sig")
    except (pd.errors.EmptyDataError, pd.errors.ParserError,
            UnicodeDecodeError, OSError) as err:
        # Best-effort resume: report the problem and start from scratch.
        print(f"⚠️ Gagal membaca {path} ({err}); mulai dari awal.")
        return set(), []

    if df.empty or "chunk_id" not in df.columns:
        return set(), []

    # read_csv may infer a numeric dtype; normalise to str so membership
    # tests against string chunk ids work.
    df["chunk_id"] = df["chunk_id"].astype(str)
    ids = set(df["chunk_id"])
    print(f"♻️ Resume: {len(ids)} chunk sudah ada.")
    return ids, df.to_dict("records")
251
+
252
# Rows and chunk ids carried over from a previous run (empty on a fresh run).
processed_ids, all_rows = muat_output(OUTPUT_FILE)
error_rows = []

# ══════════════════════════════════════════════════════════════════════════════
# MAIN LOOP
# ══════════════════════════════════════════════════════════════════════════════

total = len(df_input)
# Clamp at zero in case the resumed output holds ids absent from this input.
sisa = max(total - len(processed_ids), 0)
print(f"⏱️ Estimasi: ~{sisa * 30 // 60} menit untuk {sisa} chunk "
      f"(~30s/chunk di GPU T4)")
print("\n" + "=" * 60)
print("🚀 Memulai generate QA (lokal, tanpa API key)")
print("=" * 60)

# enumerate() gives a positional counter that does not depend on the index.
for nomor, (_, row) in enumerate(df_input.iterrows(), start=1):
    chunk_id = str(row["chunk_id"])

    if chunk_id in processed_ids:
        print(f"[{nomor}/{total}] {chunk_id} — dilewati.")
        continue

    # A missing context would otherwise be sent to the model as "nan".
    if pd.isna(row["context"]) or not str(row["context"]).strip():
        print(f"[{nomor}/{total}] {chunk_id} — konteks kosong, dilewati.")
        error_rows.append({"chunk_id": chunk_id, "level": "-",
                           "alasan": "Konteks kosong", "q": "", "a": ""})
        continue
    konteks = str(row["context"])

    print(f"\n[{nomor}/{total}] {chunk_id}...")
    t0 = time.time()

    valid, errors = proses_chunk(chunk_id, konteks)
    elapsed = time.time() - t0

    all_rows.extend(valid)
    error_rows.extend(errors)
    processed_ids.add(chunk_id)

    # Per-chunk summary
    c1_ok = sum(1 for r in valid if r["bloom_level"] == "C1")
    c2_ok = sum(1 for r in valid if r["bloom_level"] == "C2")
    c1_err = sum(1 for e in errors if e["level"] == "C1")
    c2_err = sum(1 for e in errors if e["level"] == "C2")

    print(f" ✅ C1: {c1_ok} valid, {c1_err} ditolak | "
          f"C2: {c2_ok} valid, {c2_err} ditolak | {elapsed:.0f}s")

    for r in valid:
        print(f" [{r['bloom_level']}] Q: {r['question'][:80]}")
        print(f" A: {r['answer'][:80]}{'...' if len(r['answer'])>80 else ''}")

    # Checkpoint every 5 processed chunks so an interruption loses little.
    if len(processed_ids) % 5 == 0:
        pd.DataFrame(all_rows).to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")
        if error_rows:
            pd.DataFrame(error_rows).to_csv(ERROR_FILE, index=False, encoding="utf-8-sig")
        print(f" 💾 Checkpoint: {len(all_rows)} QA disimpan.")
302
+ # ══════════════════════════════════════════════════════════════════════════════
303
+ # SIMPAN & LAPORAN AKHIR
304
+ # ══════════════════════════════════════════════════════════════════════════════
305
+
306
# Write the final output: accepted rows always, the error file only when
# something was rejected during this run.
df_out = pd.DataFrame(all_rows)
df_out.to_csv(OUTPUT_FILE, index=False, encoding="utf-8-sig")

if error_rows:
    pd.DataFrame(error_rows).to_csv(ERROR_FILE, index=False, encoding="utf-8-sig")

print("\n" + "=" * 60)
print("✅ SELESAI!")
print(f" Total QA valid : {len(all_rows)}")
if not df_out.empty:
    # Per-level statistics need the columns, so skip them when no rows exist.
    c1_total = len(df_out[df_out["bloom_level"] == "C1"])
    c2_total = len(df_out[df_out["bloom_level"] == "C2"])
    print(f" • C1 (Mengingat) : {c1_total}")
    print(f" • C2 (Memahami) : {c2_total}")
    print(f" Rata-rata jawaban : {df_out['answer_words'].mean():.1f} kata")
# error_rows starts empty each run, so this counts only the current session.
print(f" Error/ditolak : {len(error_rows)}")
print(f" Output : {OUTPUT_FILE}")
print("=" * 60)