Spaces:
Running
Running
| { | |
| "course_id": "course-eval-five-pdfs-001", | |
| "documents": [ | |
| { | |
| "path": "doc/samples/Dokumen Uji RAG Hub Sevima.pdf" | |
| }, | |
| { | |
| "path": "doc/samples/Dokumen Uji_RAG_SEVIMA_Kebijakan_Akademik.pdf" | |
| }, | |
| { | |
| "path": "doc/samples/Sop Implementasi Tenant Rag Hub Sevima.pdf" | |
| }, | |
| { | |
| "path": "doc/samples/Dokumen Rag Mata Kuliah Pemrograman Web Struktur Data Dasar Pemrograman.pdf" | |
| }, | |
| { | |
| "path": "doc/samples/Transformer Architecture.pdf" | |
| } | |
| ], | |
| "questions": [ | |
| { | |
| "question": "Apa nama platform yang dijelaskan dalam dokumen spesifikasi RAG Hub SEVIMA?", | |
| "expected_all": [ | |
| "RAG Hub SEVIMA" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa versi dokumen spesifikasi dan operasional RAG Hub SEVIMA?", | |
| "expected_all": [ | |
| "1.4.2" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Kapan tanggal rilis dokumen RAG Hub SEVIMA?", | |
| "expected_all": [ | |
| "17 April 2026" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Siapa pemilik dokumen RAG Hub SEVIMA?", | |
| "expected_all": [ | |
| "Tim Platform Knowledge SEVIMA" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa email kontak eskalasi untuk RAG Hub SEVIMA?", | |
| "expected_all": [ | |
| "rag-platform@sevima.test" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Berapa target keberhasilan fase pertama RAG Hub dalam menjawab pertanyaan uji berbasis dokumen?", | |
| "expected_all": [ | |
| "78%" | |
| ], | |
| "expected_any": [ | |
| "minimal" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Sebutkan tiga kelompok pengguna utama pada fase pertama RAG Hub.", | |
| "expected_all": [ | |
| "Tim Implementasi Kampus", | |
| "Tim Customer Support", | |
| "Tim Product dan Engineering" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa vector database utama pada RAG Hub versi 1.4.2?", | |
| "expected_all": [ | |
| "Qdrant" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa keyword search yang digunakan RAG Hub?", | |
| "expected_all": [ | |
| "PostgreSQL Full-Text Search" | |
| ], | |
| "expected_any": [ | |
| "PostgreSQL" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa ukuran maksimum file yang dijelaskan pada batasan file RAG Hub?", | |
| "expected_all": [ | |
| "40 MB" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Berapa jumlah halaman maksimum per dokumen PDF pada RAG Hub?", | |
| "expected_all": [ | |
| "300" | |
| ], | |
| "expected_any": [ | |
| "halaman" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Kapan registrasi ulang mahasiswa wajib dilakukan?", | |
| "expected_all": [ | |
| "awal semester ganjil dan genap" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa akibat jika mahasiswa gagal melakukan registrasi ulang?", | |
| "expected_all": [ | |
| "non-aktif" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Kapan batas maksimal pembayaran UKT untuk semester ganjil dan genap?", | |
| "expected_all": [ | |
| "minggu kedua", | |
| "Agustus", | |
| "Februari" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa syarat mutlak sebelum perkuliahan dimulai terkait KRS?", | |
| "expected_all": [ | |
| "Validasi KRS", | |
| "Dosen Wali" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Berapa rentang nilai angka untuk nilai huruf AB?", | |
| "expected_all": [ | |
| "76", | |
| "85" | |
| ], | |
| "expected_any": [ | |
| "AB" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Berapa bobot SKS untuk nilai huruf C?", | |
| "expected_all": [ | |
| "2.00" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Berapa biaya administrasi cuti akademik?", | |
| "expected_all": [ | |
| "25%", | |
| "UKT" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Berapa durasi maksimal cuti akademik yang diizinkan?", | |
| "expected_all": [ | |
| "4 semester" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa system_id pada blok metadata API referensi?", | |
| "expected_all": [ | |
| "SIAKAD-CORE-099" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa data_domain pada blok metadata API referensi?", | |
| "expected_all": [ | |
| "academic_policies" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa last_sync pada blok metadata API referensi?", | |
| "expected_all": [ | |
| "2026-05-01T08:00:00Z" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Siapa nama rektor yang menandatangani dokumen ini?", | |
| "expected_all": [], | |
| "expected_any": [], | |
| "forbidden": [], | |
| "expect_abstain": true | |
| }, | |
| { | |
| "question": "Berapa nomor telepon helpdesk yang tercantum dalam dokumen?", | |
| "expected_all": [], | |
| "expected_any": [], | |
| "forbidden": [], | |
| "expect_abstain": true | |
| }, | |
| { | |
| "question": "Apa alamat kantor fisik SEVIMA yang disebutkan dalam dokumen?", | |
| "expected_all": [], | |
| "expected_any": [], | |
| "forbidden": [], | |
| "expect_abstain": true | |
| }, | |
| { | |
| "question": "Apa versi SOP Implementasi Tenant RAG Hub SEVIMA?", | |
| "expected_all": [ | |
| "2.3.0" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Kapan tanggal efektif SOP Implementasi Tenant RAG Hub SEVIMA?", | |
| "expected_all": [ | |
| "28 April 2026" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Siapa pemilik dokumen SOP Implementasi Tenant RAG Hub SEVIMA?", | |
| "expected_all": [ | |
| "Tim Implementation Enablement SEVIMA" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa kontak eskalasi untuk SOP Implementasi Tenant RAG Hub SEVIMA?", | |
| "expected_all": [ | |
| "implementation-enablement@sevima.test" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Berapa target durasi normal implementasi tenant baru sejak kickoff?", | |
| "expected_all": [ | |
| "15 hari kerja" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Kapan durasi implementasi tenant dapat diperpanjang menjadi 25 hari kerja?", | |
| "expected_all": [ | |
| "lebih dari 500 dokumen" | |
| ], | |
| "expected_any": [ | |
| "25 hari kerja" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Kickoff implementasi dilakukan maksimal berapa hari kerja setelah kontrak aktif?", | |
| "expected_all": [ | |
| "3 hari kerja" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa format Tenant ID yang benar menurut SOP implementasi tenant?", | |
| "expected_all": [ | |
| "T-<KODE-INSTITUSI>-<NOMOR-URUT>" | |
| ], | |
| "expected_any": [ | |
| "T-UCD-001" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa nilai wajib untuk tenant_isolation_mode?", | |
| "expected_all": [ | |
| "strict" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa nilai default no_answer_threshold pada konfigurasi wajib tenant?", | |
| "expected_all": [ | |
| "0.42" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apakah Campus Support boleh menghapus dokumen atau mengubah metadata?", | |
| "expected_all": [ | |
| "tidak boleh" | |
| ], | |
| "expected_any": [ | |
| "Campus Support" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Kapan ingestion batch pertama boleh dilakukan?", | |
| "expected_all": [ | |
| "80%", | |
| "dokumen P0" | |
| ], | |
| "expected_any": [ | |
| "lolos validasi" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Berapa persen sampel chunk dari batch pertama yang wajib diperiksa SEVIMA Knowledge Admin?", | |
| "expected_all": [ | |
| "10%" | |
| ], | |
| "expected_any": [ | |
| "sampel chunk" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Kapan batch ingestion dianggap gagal berdasarkan sampel chunk?", | |
| "expected_all": [ | |
| "12%", | |
| "masalah struktur berat" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Berapa jumlah minimal pertanyaan uji tenant sebelum go-live?", | |
| "expected_all": [ | |
| "40" | |
| ], | |
| "expected_any": [ | |
| "pertanyaan uji" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Sebutkan ambang kelulusan Answer Accuracy, Citation Correctness, No-Answer Precision, dan Access Control Accuracy sebelum go-live.", | |
| "expected_all": [ | |
| "80%", | |
| "88%", | |
| "90%", | |
| "100%" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Berapa lama hypercare berlangsung setelah tenant aktif?", | |
| "expected_all": [ | |
| "10 hari kerja" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Berapa durasi minimal pelatihan Campus Admin sebelum go-live?", | |
| "expected_all": [ | |
| "90 menit" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa syarat Support Lead tidak boleh menerima handover?", | |
| "expected_all": [ | |
| "issue P1 terbuka" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apakah role Security Reviewer boleh digabung dengan Knowledge Admin Kampus?", | |
| "expected_all": [ | |
| "tidak boleh" | |
| ], | |
| "expected_any": [ | |
| "Security Reviewer" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa status yang harus diberikan pada dokumen lama yang digantikan dokumen baru tetapi masih relevan historis?", | |
| "expected_all": [ | |
| "Deprecated" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa kode dan bobot SKS mata kuliah Dasar Pemrograman?", | |
| "expected_all": [ | |
| "IF101", | |
| "4 SKS" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa prasyarat mata kuliah Struktur Data?", | |
| "expected_all": [ | |
| "Dasar Pemrograman" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa kode dan bobot SKS mata kuliah Pemrograman Web?", | |
| "expected_all": [ | |
| "IF305", | |
| "3 SKS" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa prasyarat mata kuliah Pemrograman Web?", | |
| "expected_all": [ | |
| "Dasar Pemrograman", | |
| "Basis Data" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Pada minggu ke berapa Binary Search Tree diajarkan dalam mata kuliah Struktur Data?", | |
| "expected_all": [ | |
| "10", | |
| "Binary Search Tree" | |
| ], | |
| "expected_any": [ | |
| "minggu" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Berapa bobot final project Pemrograman Web?", | |
| "expected_all": [ | |
| "35%" | |
| ], | |
| "expected_any": [ | |
| "Final project" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Sebutkan fitur wajib final project Pemrograman Web.", | |
| "expected_all": [ | |
| "CRUD", | |
| "Autentikasi pengguna", | |
| "Session login dan logout", | |
| "Validasi input", | |
| "database relasional" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Berapa minimal kehadiran untuk mengikuti ujian akhir pada ketiga mata kuliah?", | |
| "expected_all": [ | |
| "75%" | |
| ], | |
| "expected_any": [ | |
| "kehadiran" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa aturan keterlambatan pengumpulan tugas?", | |
| "expected_all": [ | |
| "10% per hari" | |
| ], | |
| "expected_any": [ | |
| "lebih dari 3 hari", | |
| "ditolak" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apakah password boleh disimpan plaintext pada proyek web?", | |
| "expected_all": [ | |
| "plaintext" | |
| ], | |
| "expected_any": [ | |
| "tidak boleh", | |
| "tidak" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Apa fokus utama mata kuliah Struktur Data?", | |
| "expected_all": [ | |
| "penyimpanan", | |
| "pengolahan data", | |
| "efisien" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Penggunaan AI seperti apa yang diperbolehkan pada tugas pemrograman?", | |
| "expected_all": [ | |
| "alat bantu belajar" | |
| ], | |
| "expected_any": [ | |
| "Meminta penjelasan konsep", | |
| "Membantu menemukan bug", | |
| "Membantu menyusun dokumentasi awal" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Siapa nama CEO yang menyetujui SOP Implementasi Tenant RAG Hub SEVIMA?", | |
| "expected_all": [], | |
| "expected_any": [], | |
| "forbidden": [], | |
| "expect_abstain": true | |
| }, | |
| { | |
| "question": "Berapa nomor telepon darurat tim Implementation Enablement?", | |
| "expected_all": [], | |
| "expected_any": [], | |
| "forbidden": [], | |
| "expect_abstain": true | |
| }, | |
| { | |
| "question": "What are the document ID, version, and publication month for the Transformer Architecture technical reference?", | |
| "expected_all": [ | |
| "AI-TR-2024-047", | |
| "3.0", | |
| "March 2024" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "What three compounding problems did sequential RNN processing create for long sequences?", | |
| "expected_all": [ | |
| "Parallelism bottleneck", | |
| "Vanishing/exploding gradients", | |
| "Memory compression" | |
| ], | |
| "expected_any": [], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Why does scaled dot-product attention divide by sqrt(dk)?", | |
| "expected_all": [ | |
| "softmax" | |
| ], | |
| "expected_any": [ | |
| "near-zero gradients", | |
| "saturated regions", | |
| "reasonable range" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "In the Transformer base model, how many attention heads are used and what is the per-head dimension?", | |
| "expected_all": [ | |
| "8", | |
| "64" | |
| ], | |
| "expected_any": [ | |
| "heads", | |
| "dimensions" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "Why has Pre-LN become dominant in large Transformer models?", | |
| "expected_all": [ | |
| "stable gradients" | |
| ], | |
| "expected_any": [ | |
| "eliminates the need for learning rate warmup", | |
| "learning rate warmup" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "What positional encoding scheme does LLaMA use, and what does it do to query and key vectors?", | |
| "expected_all": [ | |
| "RoPE", | |
| "query", | |
| "key" | |
| ], | |
| "expected_any": [ | |
| "Rotary Positional Embeddings", | |
| "rotating query and key vectors", | |
| "complex space" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "How many parameters does BERT-Large have, and what pre-training objective does it use?", | |
| "expected_all": [ | |
| "340M" | |
| ], | |
| "expected_any": [ | |
| "Masked Language Modeling", | |
| "MLM" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "In T5 span corruption, what is the average span length and what percentage of tokens are masked?", | |
| "expected_all": [ | |
| "3", | |
| "15%" | |
| ], | |
| "expected_any": [ | |
| "span length", | |
| "tokens" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "What is QLoRA and how much GPU memory does it need to fine-tune a 65B model compared with full fine-tuning?", | |
| "expected_all": [ | |
| "4-bit", | |
| "NF4", | |
| "48 GB", | |
| "780 GB" | |
| ], | |
| "expected_any": [ | |
| "LoRA adapters", | |
| "frozen base weights" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "For a 7B LLaMA model at sequence length 4096, how much KV cache memory is required per request?", | |
| "expected_all": [ | |
| "2 GB", | |
| "4096" | |
| ], | |
| "expected_any": [ | |
| "KV cache" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "How does FlashAttention improve attention computation and what speedup does it achieve?", | |
| "expected_all": [ | |
| "IO-aware", | |
| "SRAM", | |
| "speedup" | |
| ], | |
| "expected_any": [ | |
| "2", | |
| "4" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| }, | |
| { | |
| "question": "How does speculative decoding use a draft model, and what throughput improvement can it achieve?", | |
| "expected_all": [ | |
| "draft model", | |
| "2", | |
| "3" | |
| ], | |
| "expected_any": [ | |
| "candidate tokens", | |
| "verifies", | |
| "single forward pass" | |
| ], | |
| "forbidden": [ | |
| "i don't know" | |
| ], | |
| "expect_abstain": false | |
| } | |
| ] | |
| } | |