Buckets:
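| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Penerapan Inferensi yang Dioptimalkan&quot;,&quot;local&quot;:&quot;penerapan-inferensi-yang-dioptimalkan&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Panduan Pemilihan Kerangka Kerja&quot;,&quot;local&quot;:&quot;panduan-pemilihan-kerangka-kerja&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Manajemen Memori dan Performa&quot;,&quot;local&quot;:&quot;manajemen-memori-dan-performa&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Penerapan dan Integrasi&quot;,&quot;local&quot;:&quot;penerapan-dan-integrasi&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Memulai Penggunaan&quot;,&quot;local&quot;:&quot;memulai-penggunaan&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Instalasi dan Pengaturan Dasar&quot;,&quot;local&quot;:&quot;instalasi-dan-pengaturan-dasar&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Generasi Teks Dasar&quot;,&quot;local&quot;:&quot;generasi-teks-dasar&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Kontrol Generasi Lanjutan&quot;,&quot;local&quot;:&quot;kontrol-generasi-lanjutan&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Pemilihan Token dan Sampling&quot;,&quot;local&quot;:&quot;pemilihan-token-dan-sampling&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Mengontrol Pengulangan&quot;,&quot;local&quot;:&quot;mengontrol-pengulangan&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Kontrol Panjang dan Stop Sequences&quot;,&quot;local&quot;:&quot;kontrol-panjang-dan-stop-sequences&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Pengelolaan Memori&quot;,&quot;local&quot;:&quot;pengelolaan-memori&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Sumber&quot;,&quot;local&quot;:&quot;sumber&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"> | |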
| <link href="/docs/course/pr_1054/id/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet"> | |
| <link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/entry/start.4f92af03.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/scheduler.36a0863c.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/singletons.7dc7b9a4.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/index.733708bb.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/paths.cf097d06.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/entry/app.19cef1b6.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/index.156fee99.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/nodes/0.1203e4a0.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/nodes/21.a65c210c.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/Tip.8a648467.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/CodeBlock.4cf998e6.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/getInferenceSnippets.472bc46d.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1054/id/_app/immutable/chunks/stores.8f4efe8a.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Penerapan Inferensi yang Dioptimalkan","local":"penerapan-inferensi-yang-dioptimalkan","sections":[{"title":"Panduan Pemilihan Kerangka Kerja","local":"panduan-pemilihan-kerangka-kerja","sections":[{"title":"Manajemen Memori dan Performa","local":"manajemen-memori-dan-performa","sections":[],"depth":3},{"title":"Penerapan dan Integrasi","local":"penerapan-dan-integrasi","sections":[],"depth":3}],"depth":2},{"title":"Memulai Penggunaan","local":"memulai-penggunaan","sections":[{"title":"Instalasi dan Pengaturan Dasar","local":"instalasi-dan-pengaturan-dasar","sections":[],"depth":3},{"title":"Generasi Teks Dasar","local":"generasi-teks-dasar","sections":[],"depth":3}],"depth":2},{"title":"Kontrol Generasi Lanjutan","local":"kontrol-generasi-lanjutan","sections":[{"title":"Pemilihan Token dan Sampling","local":"pemilihan-token-dan-sampling","sections":[],"depth":3},{"title":"Mengontrol Pengulangan","local":"mengontrol-pengulangan","sections":[],"depth":3},{"title":"Kontrol Panjang dan Stop Sequences","local":"kontrol-panjang-dan-stop-sequences","sections":[],"depth":3}],"depth":2},{"title":"Pengelolaan Memori","local":"pengelolaan-memori","sections":[],"depth":2},{"title":"Sumber","local":"sumber","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="penerapan-inferensi-yang-dioptimalkan" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#penerapan-inferensi-yang-dioptimalkan"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 
88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Penerapan Inferensi yang Dioptimalkan</span></h1> <p data-svelte-h="svelte-1wtwj5n">Dalam bagian ini, kita akan mengeksplorasi kerangka kerja lanjutan untuk mengoptimalkan penerapan LLM: Text Generation Inference (TGI), vLLM, dan llama.cpp. Aplikasi-aplikasi ini terutama digunakan di lingkungan produksi untuk menyajikan LLM kepada pengguna. Bagian ini berfokus pada cara menerapkan kerangka kerja tersebut di produksi, bukan cara menggunakannya untuk inferensi di mesin tunggal.</p> <p data-svelte-h="svelte-1evar45">Kami akan membahas bagaimana alat-alat ini memaksimalkan efisiensi inferensi dan menyederhanakan penerapan Large Language Models di lingkungan produksi.</p> <h2 class="relative group"><a id="panduan-pemilihan-kerangka-kerja" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#panduan-pemilihan-kerangka-kerja"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 
1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Panduan Pemilihan Kerangka Kerja</span></h2> <p data-svelte-h="svelte-1vlo0fn">TGI, vLLM, dan llama.cpp memiliki tujuan serupa namun memiliki karakteristik yang berbeda, sehingga lebih cocok untuk berbagai kasus penggunaan. Mari kita lihat perbedaan utama antara mereka, dengan fokus pada kinerja dan integrasi.</p> <h3 class="relative group"><a id="manajemen-memori-dan-performa" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#manajemen-memori-dan-performa"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Manajemen Memori dan Performa</span></h3> <p data-svelte-h="svelte-yis6u"><strong>TGI</strong> dirancang agar stabil dan dapat diprediksi di lingkungan produksi, menggunakan panjang urutan tetap untuk menjaga penggunaan memori yang konsisten. TGI mengelola memori menggunakan Flash Attention 2 dan teknik continuous batching. 
Ini berarti ia dapat memproses perhitungan attention secara sangat efisien dan menjaga GPU tetap sibuk dengan terus memberikan pekerjaan. Sistem ini dapat memindahkan bagian model antara CPU dan GPU sesuai kebutuhan, yang membantu menangani model yang lebih besar.</p> <p data-svelte-h="svelte-1459ciq"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/flash-attn.png" alt="Flash Attention"></p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400">Flash Attention adalah teknik yang mengoptimalkan mekanisme attention dalam model transformer dengan mengatasi hambatan bandwidth memori. Seperti yang dibahas dalam <a href="/course/chapter1/8">Bab 1.8</a>, mekanisme attention memiliki kompleksitas kuadrat dan penggunaan memori yang tinggi, sehingga tidak efisien untuk urutan panjang. | |
| <p data-svelte-h="svelte-jfygbi">Inovasi utama terletak pada cara mengelola transfer memori antara High Bandwidth Memory (HBM) dan cache SRAM yang lebih cepat. Attention tradisional berulang kali mentransfer data antara HBM dan SRAM, menciptakan hambatan dan membuat GPU tidak aktif. Flash Attention memuat data sekali ke SRAM dan melakukan semua perhitungan di sana, meminimalkan transfer memori yang mahal.</p> <p data-svelte-h="svelte-w41pqe">Meskipun manfaatnya paling besar saat pelatihan, penggunaan VRAM yang lebih rendah dan efisiensi Flash Attention juga sangat berguna saat inferensi, memungkinkan penyajian LLM yang lebih cepat dan skalabel.</p></div> <p data-svelte-h="svelte-tn6cam"><strong>vLLM</strong> menggunakan pendekatan berbeda dengan teknik bernama PagedAttention. Seperti sistem operasi mengelola memori dalam bentuk halaman, vLLM membagi memori model menjadi blok-blok kecil. Sistem cerdas ini memungkinkan penanganan permintaan berukuran berbeda secara fleksibel dan menghindari pemborosan memori. vLLM sangat efisien dalam berbagi memori antar permintaan dan mengurangi fragmentasi memori, menjadikan sistem lebih efisien secara keseluruhan.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400">PagedAttention adalah teknik yang mengatasi hambatan kritis dalam inferensi LLM: manajemen memori KV cache. Seperti yang dijelaskan di <a href="/course/chapter1/8">Bab 1.8</a>, saat menghasilkan teks, model menyimpan key dan value attention (KV cache) untuk setiap token yang dihasilkan agar menghindari perhitungan ulang. KV cache dapat menjadi sangat besar, terutama untuk urutan panjang atau banyak permintaan secara bersamaan. | |
| <p data-svelte-h="svelte-xmmut3">Inovasi utama vLLM adalah:</p> <ol data-svelte-h="svelte-ol7u5q"><li><strong>Memory Paging</strong>: KV cache dibagi menjadi “halaman” berukuran tetap (seperti virtual memory).</li> <li><strong>Penyimpanan Non-Kontigu</strong>: Halaman tidak harus tersimpan secara berurutan di memori GPU.</li> <li><strong>Manajemen Tabel Halaman</strong>: Tabel halaman melacak halaman mana milik urutan mana, memungkinkan akses efisien.</li> <li><strong>Berbagi Memori</strong>: Untuk operasi seperti sampling paralel, halaman KV cache dapat dibagi antara banyak urutan.</li></ol> <p data-svelte-h="svelte-121342p">PagedAttention meningkatkan throughput hingga 24x dibandingkan metode tradisional — sangat signifikan untuk produksi. Untuk detail lebih lanjut, lihat <a href="https://docs.vllm.ai/en/latest/design/kernel/paged_attention.html" rel="nofollow">panduan vLLM</a>.</p></div> <p data-svelte-h="svelte-13ejjx8"><strong>llama.cpp</strong> adalah implementasi C/C++ yang sangat dioptimalkan, awalnya dirancang untuk menjalankan model LLaMA di perangkat konsumen. Fokusnya pada efisiensi CPU dengan dukungan opsional untuk akselerasi GPU, ideal untuk lingkungan dengan sumber daya terbatas. llama.cpp menggunakan teknik kuantisasi untuk mengurangi ukuran dan kebutuhan memori model sambil mempertahankan kinerja yang baik. Ia juga menyediakan kernel yang dioptimalkan untuk berbagai arsitektur CPU dan manajemen KV cache dasar untuk menghasilkan token secara efisien.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400">Kuantisasi di llama.cpp menurunkan presisi bobot model dari 32-bit atau 16-bit float menjadi format yang lebih rendah seperti INT8 (8-bit), 4-bit, atau bahkan lebih kecil. 
Ini secara signifikan mengurangi penggunaan memori dan mempercepat inferensi dengan penurunan kualitas yang minimal. | |
| <p data-svelte-h="svelte-n3qf0l">Fitur kuantisasi utama di llama.cpp:</p> <ol data-svelte-h="svelte-oix722"><li><strong>Banyak Tingkat Kuantisasi</strong>: Mendukung kuantisasi 8-bit, 4-bit, 3-bit, hingga 2-bit</li> <li><strong>Format GGML/GGUF</strong>: Format tensor khusus yang dioptimalkan untuk inferensi kuantisasi</li> <li><strong>Presisi Campuran</strong>: Bisa menerapkan tingkat kuantisasi berbeda di bagian berbeda dari model</li> <li><strong>Optimasi Perangkat Keras</strong>: Kode dioptimalkan untuk berbagai arsitektur CPU (AVX2, AVX-512, NEON)</li></ol> <p data-svelte-h="svelte-1jcow5m">Pendekatan ini memungkinkan model dengan miliaran parameter dijalankan di perangkat konsumen dengan memori terbatas — sangat cocok untuk deployment lokal dan perangkat edge.</p></div> <h3 class="relative group"><a id="penerapan-dan-integrasi" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#penerapan-dan-integrasi"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Penerapan dan Integrasi</span></h3> <p data-svelte-h="svelte-qz8pxd">Sekarang mari kita bahas perbedaan penerapan dan integrasi antar kerangka kerja.</p> <p 
data-svelte-h="svelte-rv3kgi"><strong>TGI</strong> unggul dalam penerapan tingkat enterprise dengan fitur siap-produksi. Ini memiliki dukungan bawaan untuk Kubernetes, pemantauan dengan Prometheus dan Grafana, penskalaan otomatis, serta fitur keamanan lengkap. Logging tingkat enterprise dan fitur seperti penyaringan konten serta pembatasan laju (rate limiting) membuatnya aman dan stabil.</p> <p data-svelte-h="svelte-1inp4hu"><strong>vLLM</strong> menawarkan pendekatan yang lebih fleksibel dan ramah pengembang. Dibangun dengan Python sebagai inti, vLLM bisa dengan mudah menggantikan API OpenAI di aplikasi yang sudah ada. Cocok digunakan dengan Ray untuk mengelola klaster, sangat ideal saat Anda membutuhkan performa tinggi dan kemampuan kustomisasi.</p> <p data-svelte-h="svelte-h34c74"><strong>llama.cpp</strong> mengutamakan kesederhanaan dan portabilitas. Implementasi servernya ringan dan dapat dijalankan di berbagai perangkat — dari server bertenaga tinggi hingga laptop konsumen dan beberapa perangkat seluler kelas atas. Dengan ketergantungan minimal dan inti C/C++ sederhana, sangat mudah diterapkan di lingkungan yang sulit menginstal framework Python. 
Server ini kompatibel dengan API OpenAI sambil tetap memiliki jejak sumber daya yang jauh lebih kecil dibanding solusi lain.</p> <h2 class="relative group"><a id="memulai-penggunaan" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#memulai-penggunaan"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Memulai Penggunaan</span></h2> <p data-svelte-h="svelte-kazfin">Mari kita eksplorasi cara menggunakan kerangka kerja ini untuk menerapkan LLM, dimulai dari instalasi dan pengaturan dasar.</p> <h3 class="relative group"><a id="instalasi-dan-pengaturan-dasar" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#instalasi-dan-pengaturan-dasar"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 
28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Instalasi dan Pengaturan Dasar</span></h3> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select"><hfoption value="tgi" label="TGI"> | |
| <p data-svelte-h="svelte-1wo0s27">TGI mudah dipasang dan digunakan, dengan integrasi mendalam ke ekosistem Hugging Face.</p> <p data-svelte-h="svelte-l3djch">Pertama, jalankan server TGI menggunakan Docker:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->docker run --gpus all \ | |
| --shm-size 1g \ | |
| -p 8080:80 \ | |
| -v ~/.cache/huggingface:/data \ | |
| ghcr.io/huggingface/text-generation-inference:latest \ | |
| --model-id HuggingFaceTB/SmolLM2-360M-Instruct<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1l3gnvv">Lalu berinteraksi dengannya menggunakan <code>InferenceClient</code> dari Hugging Face:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient | |
| client = InferenceClient( | |
| model=<span class="hljs-string">"http://localhost:8080"</span>, <span class="hljs-comment"># URL untuk server TGI</span> | |
| ) | |
| response = client.text_generation( | |
| <span class="hljs-string">"Ceritakan sebuah kisah"</span>, | |
| max_new_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| details=<span class="hljs-literal">True</span>, | |
| stop_sequences=[], | |
| ) | |
| <span class="hljs-built_in">print</span>(response.generated_text) | |
| response = client.chat_completion( | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Kamu adalah asisten yang membantu."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Ceritakan sebuah kisah"</span>}, | |
| ], | |
| max_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-17ivk84">Alternatif lain, gunakan klien OpenAI:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI | |
| client = OpenAI( | |
| base_url=<span class="hljs-string">"http://localhost:8080/v1"</span>, <span class="hljs-comment"># Pastikan menyertakan /v1</span> | |
| api_key=<span class="hljs-string">"not-needed"</span>, <span class="hljs-comment"># Secara default, TGI tidak memerlukan API key</span> | |
| ) | |
| response = client.chat.completions.create( | |
| model=<span class="hljs-string">"HuggingFaceTB/SmolLM2-360M-Instruct"</span>, | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Kamu adalah asisten yang membantu."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Ceritakan sebuah kisah"</span>}, | |
| ], | |
| max_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="llama.cpp" label="llama.cpp"> | |
| <p data-svelte-h="svelte-1bx6ztg">llama.cpp mudah dipasang dan digunakan, hanya membutuhkan dependensi minimal dan mendukung inferensi via CPU dan GPU.</p> <p data-svelte-h="svelte-rsihho">Pertama, instal dan bangun <code>llama.cpp</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Klon repositori</span> | |
| git <span class="hljs-built_in">clone</span> https://github.com/ggerganov/llama.cpp | |
| <span class="hljs-built_in">cd</span> llama.cpp | |
| <span class="hljs-comment"># Bangun proyek</span> | |
| make | |
| <span class="hljs-comment"># Unduh model SmolLM2-1.7B-Instruct-GGUF</span> | |
| curl -L -O https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF/resolve/main/smollm2-1.7b-instruct.Q4_K_M.gguf<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ltsiwy">Lalu jalankan server-nya:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Menjalankan server</span> | |
| ./server \ | |
| -m smollm2-1.7b-instruct.Q4_K_M.gguf \ | |
| --host 0.0.0.0 \ | |
| --port 8080 \ | |
| -c 4096 \ | |
| --n-gpu-layers 0 <span class="hljs-comment"># Atur ke angka lebih tinggi untuk menggunakan GPU</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-bqx66v">Gunakan <code>InferenceClient</code> untuk mengaksesnya:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient | |
| <span class="hljs-comment"># Inisialisasi klien yang mengarah ke server llama.cpp</span> | |
| client = InferenceClient( | |
| model=<span class="hljs-string">"http://localhost:8080/v1"</span>, <span class="hljs-comment"># URL untuk server llama.cpp</span> | |
| token=<span class="hljs-string">"sk-no-key-required"</span>, <span class="hljs-comment"># server llama.cpp memerlukan placeholder ini</span> | |
| ) | |
| <span class="hljs-comment"># Pembuatan teks</span> | |
| response = client.text_generation( | |
| <span class="hljs-string">"Ceritakan sebuah kisah"</span>, | |
| max_new_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| details=<span class="hljs-literal">True</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.generated_text) | |
| <span class="hljs-comment"># Untuk format percakapan</span> | |
| response = client.chat_completion( | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Kamu adalah asisten yang membantu."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Ceritakan sebuah kisah"</span>}, | |
| ], | |
| max_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1t7u5oh">Atau gunakan klien OpenAI:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI | |
| <span class="hljs-comment"># Inisialisasi klien yang menunjuk ke server llama.cpp</span> | |
| client = OpenAI( | |
| base_url=<span class="hljs-string">"http://localhost:8080/v1"</span>, | |
| api_key=<span class="hljs-string">"sk-no-key-required"</span>, <span class="hljs-comment"># server llama.cpp memerlukan placeholder ini</span> | |
| ) | |
| response = client.chat.completions.create( | |
| model=<span class="hljs-string">"smollm2-1.7b-instruct"</span>, <span class="hljs-comment"># Pengenal model bisa apa saja karena server hanya memuat satu model</span> | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Kamu adalah asisten yang membantu."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Ceritakan sebuah kisah"</span>}, | |
| ], | |
| max_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="vllm" label="vLLM"> | |
| <p data-svelte-h="svelte-15eyqaq">vLLM mudah dipasang, dengan kompatibilitas API OpenAI dan antarmuka Python bawaan.</p> <p data-svelte-h="svelte-2eicno">Jalankan server API vLLM:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->python -m vllm.entrypoints.openai.api_server \ | |
| --model HuggingFaceTB/SmolLM2-360M-Instruct \ | |
| --host 0.0.0.0 \ | |
| --port 8000<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ltue7k">Lalu berinteraksilah dengannya menggunakan <code>InferenceClient</code> dari Hugging Face:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient | |
| <span class="hljs-comment"># Inisialisasi klien yang menunjuk ke endpoint vLLM</span> | |
| client = InferenceClient( | |
| model=<span class="hljs-string">"http://localhost:8000/v1"</span>, <span class="hljs-comment"># URL untuk server vLLM</span> | |
| ) | |
| <span class="hljs-comment"># Pembuatan teks</span> | |
| response = client.text_generation( | |
| <span class="hljs-string">"Ceritakan sebuah kisah"</span>, | |
| max_new_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| details=<span class="hljs-literal">True</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.generated_text) | |
| <span class="hljs-comment"># Untuk format percakapan</span> | |
| response = client.chat_completion( | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Kamu adalah asisten yang membantu."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Ceritakan sebuah kisah"</span>}, | |
| ], | |
| max_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1g7zqpn">Atau dengan klien OpenAI:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI | |
| <span class="hljs-comment"># Inisialisasi klien yang mengarah ke endpoint vLLM</span> | |
| client = OpenAI( | |
| base_url=<span class="hljs-string">"http://localhost:8000/v1"</span>, | |
| api_key=<span class="hljs-string">"not-needed"</span>, <span class="hljs-comment"># Secara bawaan, vLLM tidak memerlukan API key</span> | |
| ) | |
| response = client.chat.completions.create( | |
| model=<span class="hljs-string">"HuggingFaceTB/SmolLM2-360M-Instruct"</span>, | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Kamu adalah asisten yang membantu."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Ceritakan sebuah kisah"</span>}, | |
| ], | |
| max_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption></div> <h3 class="relative group"><a id="generasi-teks-dasar" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#generasi-teks-dasar"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Generasi Teks Dasar</span></h3> <p data-svelte-h="svelte-5t2ln2">Mari kita lihat contoh-contoh generasi teks dengan berbagai framework:</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select"><hfoption value="tgi" label="TGI"> | |
| <p data-svelte-h="svelte-1js88tn">Pertama, jalankan TGI dengan parameter lanjutan:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->docker run --gpus all \ | |
| --shm-size 1g \ | |
| -p 8080:80 \ | |
| -v ~/.cache/huggingface:/data \ | |
| ghcr.io/huggingface/text-generation-inference:latest \ | |
| --model-id HuggingFaceTB/SmolLM2-360M-Instruct \ | |
| --max-total-tokens 4096 \ | |
| --max-input-length 3072 \ | |
| --max-batch-total-tokens 8192 \ | |
| --waiting-served-ratio 1.2<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-133q38m">Gunakan <code>InferenceClient</code> untuk generasi teks yang fleksibel:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient | |
| client = InferenceClient(model=<span class="hljs-string">"http://localhost:8080"</span>) | |
| <span class="hljs-comment"># Contoh parameter lanjutan</span> | |
| response = client.chat_completion( | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Kamu adalah seorang pendongeng kreatif."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Tulis sebuah cerita kreatif"</span>}, | |
| ], | |
| temperature=<span class="hljs-number">0.8</span>, | |
| max_tokens=<span class="hljs-number">200</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content) | |
| <span class="hljs-comment"># Generasi teks mentah</span> | |
| response = client.text_generation( | |
| <span class="hljs-string">"Tulis cerita kreatif tentang eksplorasi luar angkasa"</span>, | |
| max_new_tokens=<span class="hljs-number">200</span>, | |
| temperature=<span class="hljs-number">0.8</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| repetition_penalty=<span class="hljs-number">1.1</span>, | |
| do_sample=<span class="hljs-literal">True</span>, | |
| details=<span class="hljs-literal">True</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.generated_text)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1t7u5oh">Atau gunakan klien OpenAI:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI | |
| client = OpenAI(base_url=<span class="hljs-string">"http://localhost:8080/v1"</span>, api_key=<span class="hljs-string">"not-needed"</span>) | |
| <span class="hljs-comment"># Contoh parameter lanjutan</span> | |
| response = client.chat.completions.create( | |
| model=<span class="hljs-string">"HuggingFaceTB/SmolLM2-360M-Instruct"</span>, | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Kamu adalah seorang pendongeng kreatif."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Tulis sebuah cerita kreatif"</span>}, | |
| ], | |
| temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Lebih tinggi untuk hasil yang lebih kreatif</span> | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="llama.cpp" label="llama.cpp"> | |
| <p data-svelte-h="svelte-13oenko">Untuk <code>llama.cpp</code>, Anda dapat menetapkan parameter lanjutan saat menjalankan server:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->./server \ | |
| -m smollm2-1.7b-instruct.Q4_K_M.gguf \ | |
| --host 0.0.0.0 \ | |
| --port 8080 \ | |
| -c 4096 \ | |
| --threads 8 \ | |
| --batch-size 512 \ | |
| --n-gpu-layers 0 <span class="hljs-comment"># -c: ukuran konteks; --threads: jumlah thread CPU yang digunakan; --batch-size: ukuran batch evaluasi prompt; --n-gpu-layers: lapisan GPU (0 = hanya CPU)</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1nk0za2">Gunakan <code>InferenceClient</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient | |
| client = InferenceClient(model=<span class="hljs-string">"http://localhost:8080/v1"</span>, token=<span class="hljs-string">"sk-no-key-required"</span>) | |
| <span class="hljs-comment"># Contoh parameter lanjutan</span> | |
| response = client.chat_completion( | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Kamu adalah seorang pendongeng kreatif."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Tulis sebuah cerita kreatif"</span>}, | |
| ], | |
| temperature=<span class="hljs-number">0.8</span>, | |
| max_tokens=<span class="hljs-number">200</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content) | |
| <span class="hljs-comment"># Untuk generasi teks langsung</span> | |
| response = client.text_generation( | |
| <span class="hljs-string">"Tulis cerita kreatif tentang eksplorasi luar angkasa"</span>, | |
| max_new_tokens=<span class="hljs-number">200</span>, | |
| temperature=<span class="hljs-number">0.8</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| repetition_penalty=<span class="hljs-number">1.1</span>, | |
| details=<span class="hljs-literal">True</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.generated_text)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-seg517">Atau gunakan klien OpenAI untuk mengontrol parameter sampling:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI | |
| client = OpenAI(base_url=<span class="hljs-string">"http://localhost:8080/v1"</span>, api_key=<span class="hljs-string">"sk-no-key-required"</span>) | |
| <span class="hljs-comment"># Contoh parameter lanjutan</span> | |
| response = client.chat.completions.create( | |
| model=<span class="hljs-string">"smollm2-1.7b-instruct"</span>, | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Kamu adalah seorang pendongeng kreatif."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Tulis sebuah cerita kreatif"</span>}, | |
| ], | |
| temperature=<span class="hljs-number">0.8</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| frequency_penalty=<span class="hljs-number">0.5</span>, | |
| presence_penalty=<span class="hljs-number">0.5</span>, | |
| max_tokens=<span class="hljs-number">200</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1qwmquu">Anda juga dapat menggunakan modul native <code>llama.cpp</code> untuk kontrol yang lebih mendetail:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Menggunakan paket llama-cpp-python untuk akses langsung ke model</span> | |
| <span class="hljs-keyword">from</span> llama_cpp <span class="hljs-keyword">import</span> Llama | |
| <span class="hljs-comment"># Memuat model</span> | |
| llm = Llama( | |
| model_path=<span class="hljs-string">"smollm2-1.7b-instruct.Q4_K_M.gguf"</span>, | |
| n_ctx=<span class="hljs-number">4096</span>, | |
| n_threads=<span class="hljs-number">8</span>, | |
| n_gpu_layers=<span class="hljs-number">0</span>, | |
| ) | |
| <span class="hljs-comment"># Susun prompt sesuai format yang diharapkan oleh model</span> | |
| prompt = <span class="hljs-string">"""<|im_start|>system | |
| Kamu adalah seorang pendongeng kreatif. | |
| <|im_end|> | |
| <|im_start|>user | |
| Tulis sebuah cerita kreatif | |
| <|im_end|> | |
| <|im_start|>assistant | |
| """</span> | |
| <span class="hljs-comment"># Menghasilkan respons dengan kontrol parameter yang presisi</span> | |
| output = llm( | |
| prompt, | |
| max_tokens=<span class="hljs-number">200</span>, | |
| temperature=<span class="hljs-number">0.8</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| frequency_penalty=<span class="hljs-number">0.5</span>, | |
| presence_penalty=<span class="hljs-number">0.5</span>, | |
| stop=[<span class="hljs-string">"<|im_end|>"</span>], | |
| ) | |
| <span class="hljs-built_in">print</span>(output[<span class="hljs-string">"choices"</span>][<span class="hljs-number">0</span>][<span class="hljs-string">"text"</span>])<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="vllm" label="vLLM"> | |
| <p data-svelte-h="svelte-1lq8je2">Untuk penggunaan lanjutan dengan <code>vLLM</code>, Anda dapat menggunakan <code>InferenceClient</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient | |
| client = InferenceClient(model=<span class="hljs-string">"http://localhost:8000/v1"</span>) | |
| <span class="hljs-comment"># Contoh parameter lanjutan</span> | |
| response = client.chat_completion( | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Kamu adalah seorang pendongeng kreatif."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Tulis sebuah cerita kreatif"</span>}, | |
| ], | |
| temperature=<span class="hljs-number">0.8</span>, | |
| max_tokens=<span class="hljs-number">200</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content) | |
| <span class="hljs-comment"># Untuk generasi teks langsung</span> | |
| response = client.text_generation( | |
| <span class="hljs-string">"Tulis cerita kreatif tentang eksplorasi luar angkasa"</span>, | |
| max_new_tokens=<span class="hljs-number">200</span>, | |
| temperature=<span class="hljs-number">0.8</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| details=<span class="hljs-literal">True</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.generated_text)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-xn2mvu">Anda juga dapat menggunakan klien OpenAI:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI | |
| client = OpenAI(base_url=<span class="hljs-string">"http://localhost:8000/v1"</span>, api_key=<span class="hljs-string">"not-needed"</span>) | |
| <span class="hljs-comment"># Contoh parameter lanjutan</span> | |
| response = client.chat.completions.create( | |
| model=<span class="hljs-string">"HuggingFaceTB/SmolLM2-360M-Instruct"</span>, | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Kamu adalah seorang pendongeng kreatif."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Tulis sebuah cerita kreatif"</span>}, | |
| ], | |
| temperature=<span class="hljs-number">0.8</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| max_tokens=<span class="hljs-number">200</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1k3elb6"><code>vLLM</code> juga menyediakan antarmuka Python native dengan kontrol yang lebih rinci:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> vllm <span class="hljs-keyword">import</span> LLM, SamplingParams | |
| <span class="hljs-comment"># Inisialisasi model dengan parameter lanjutan</span> | |
| llm = LLM( | |
| model=<span class="hljs-string">"HuggingFaceTB/SmolLM2-360M-Instruct"</span>, | |
| gpu_memory_utilization=<span class="hljs-number">0.85</span>, | |
| max_num_batched_tokens=<span class="hljs-number">8192</span>, | |
| max_num_seqs=<span class="hljs-number">256</span>, | |
| block_size=<span class="hljs-number">16</span>, | |
| ) | |
| <span class="hljs-comment"># Konfigurasi parameter sampling</span> | |
| sampling_params = SamplingParams( | |
| temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Semakin tinggi, semakin kreatif</span> | |
| top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># Pertimbangkan 95% dari massa probabilitas teratas</span> | |
| max_tokens=<span class="hljs-number">100</span>, <span class="hljs-comment"># Panjang maksimum</span> | |
| presence_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Kurangi pengulangan</span> | |
| frequency_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Kurangi pengulangan</span> | |
| stop=[<span class="hljs-string">"\n\n"</span>, <span class="hljs-string">"###"</span>], <span class="hljs-comment"># Urutan berhenti</span> | |
| ) | |
| <span class="hljs-comment"># Menghasilkan teks</span> | |
| prompt = <span class="hljs-string">"Tulis sebuah cerita kreatif"</span> | |
| outputs = llm.generate(prompt, sampling_params) | |
| <span class="hljs-built_in">print</span>(outputs[<span class="hljs-number">0</span>].outputs[<span class="hljs-number">0</span>].text) | |
| <span class="hljs-comment"># Untuk interaksi gaya chat</span> | |
| chat_prompt = [ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Kamu adalah seorang pendongeng kreatif."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Tulis sebuah cerita kreatif"</span>}, | |
| ] | |
| formatted_prompt = llm.get_chat_template()(chat_prompt) | |
| outputs = llm.generate(formatted_prompt, sampling_params) | |
| <span class="hljs-built_in">print</span>(outputs[<span class="hljs-number">0</span>].outputs[<span class="hljs-number">0</span>].text)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption></div> <h2 class="relative group"><a id="kontrol-generasi-lanjutan" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#kontrol-generasi-lanjutan"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Kontrol Generasi Lanjutan</span></h2> <h3 class="relative group"><a id="pemilihan-token-dan-sampling" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#pemilihan-token-dan-sampling"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 
56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Pemilihan Token dan Sampling</span></h3> <p data-svelte-h="svelte-1xgb8sl">Proses generasi teks melibatkan pemilihan token berikutnya di setiap langkah. Proses ini bisa dikendalikan melalui berbagai parameter:</p> <ol data-svelte-h="svelte-ujr52"><li><strong>Logit Mentah</strong>: Skor keluaran awal model untuk setiap token, sebelum diubah menjadi probabilitas</li> <li><strong>Temperature</strong>: Mengatur tingkat kreativitas (semakin tinggi, semakin acak)</li> <li><strong>Top-p (Nucleus Sampling)</strong>: Hanya mempertahankan token-token teratas yang probabilitas kumulatifnya mencapai X%</li> <li><strong>Top-k Filtering</strong>: Membatasi pilihan ke k token dengan kemungkinan tertinggi</li></ol> <p data-svelte-h="svelte-j38dlc">Berikut cara mengonfigurasinya:</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select"><hfoption value="tgi" label="TGI"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->client.generate( | |
| <span class="hljs-string">"Tulis sebuah cerita kreatif"</span>, | |
| temperature=<span class="hljs-number">0.8</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| top_k=<span class="hljs-number">50</span>, | |
| max_new_tokens=<span class="hljs-number">100</span>, | |
| repetition_penalty=<span class="hljs-number">1.1</span>, | |
| )<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="llama.cpp" label="llama.cpp"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Melalui API OpenAI</span> | |
| response = client.completions.create( | |
| model=<span class="hljs-string">"smollm2-1.7b-instruct"</span>, <span class="hljs-comment"># Nama model (bisa berupa string apa saja untuk server llama.cpp)</span> | |
| prompt=<span class="hljs-string">"Tulis sebuah cerita kreatif"</span>, | |
| temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Semakin tinggi, semakin kreatif</span> | |
| top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># Pertimbangkan 95% dari massa probabilitas teratas</span> | |
| frequency_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Kurangi pengulangan</span> | |
| presence_penalty=<span class="hljs-number">0.1</span>, <span class="hljs-comment"># Kurangi pengulangan</span> | |
| max_tokens=<span class="hljs-number">100</span>, <span class="hljs-comment"># Panjang maksimum</span> | |
| ) | |
| <span class="hljs-comment"># Melalui akses langsung llama-cpp-python</span> | |
| output = llm( | |
| <span class="hljs-string">"Tulis sebuah cerita kreatif"</span>, | |
| temperature=<span class="hljs-number">0.8</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| top_k=<span class="hljs-number">50</span>, | |
| max_tokens=<span class="hljs-number">100</span>, | |
| repeat_penalty=<span class="hljs-number">1.1</span>, | |
| )<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="vllm" label="vLLM"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->params = SamplingParams( | |
| temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Semakin tinggi, semakin kreatif</span> | |
| top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># Pertimbangkan 95% dari massa probabilitas teratas</span> | |
| top_k=<span class="hljs-number">50</span>, <span class="hljs-comment"># Pertimbangkan 50 token teratas</span> | |
| max_tokens=<span class="hljs-number">100</span>, <span class="hljs-comment"># Panjang maksimum</span> | |
| presence_penalty=<span class="hljs-number">0.1</span>, <span class="hljs-comment"># Kurangi pengulangan</span> | |
| ) | |
| llm.generate(<span class="hljs-string">"Tulis sebuah cerita kreatif"</span>, sampling_params=params)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption></div> <h3 class="relative group"><a id="mengontrol-pengulangan" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#mengontrol-pengulangan"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Mengontrol Pengulangan</span></h3> <p data-svelte-h="svelte-v2tann">Semua framework menyediakan cara untuk mencegah generasi teks yang berulang:</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select"><hfoption value="tgi" label="TGI"> | |
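| <div class="code-block relative "><pre class=""><!-- HTML_TAG_START -->client.generate( | |
| <span class="hljs-string">"Tulis teks yang bervariasi"</span>, | |
| repetition_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Penalti untuk token yang diulang</span> | |
| no_repeat_ngram_size=<span class="hljs-number">3</span>, <span class="hljs-comment"># Cegah pengulangan 3-gram</span> | |
| )<!-- HTML_TAG_END --></pre></div> | |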
| </hfoption> | |
| <hfoption value="llama.cpp" label="llama.cpp"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Melalui API OpenAI</span> | |
| response = client.completions.create( | |
| model=<span class="hljs-string">"smollm2-1.7b-instruct"</span>, | |
| prompt=<span class="hljs-string">"Tulis teks yang bervariasi"</span>, | |
| frequency_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Penalti untuk token yang sering muncul</span> | |
| presence_penalty=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Penalti untuk token yang sudah muncul</span> | |
| ) | |
| <span class="hljs-comment"># Melalui modul langsung</span> | |
| output = llm( | |
| <span class="hljs-string">"Tulis teks yang bervariasi"</span>, | |
| repeat_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Penalti untuk token yang diulang</span> | |
| frequency_penalty=<span class="hljs-number">0.5</span>, <span class="hljs-comment"># Penalti tambahan untuk frekuensi tinggi</span> | |
| presence_penalty=<span class="hljs-number">0.5</span>, <span class="hljs-comment"># Penalti tambahan untuk token yang sudah muncul</span> | |
| )<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="vllm" label="vLLM"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->params = SamplingParams( | |
| presence_penalty=<span class="hljs-number">0.1</span>, <span class="hljs-comment"># Penalti untuk keberadaan token</span> | |
| frequency_penalty=<span class="hljs-number">0.1</span>, <span class="hljs-comment"># Penalti untuk frekuensi token</span> | |
| )<!-- HTML_TAG_END --></pre></div> | |
| </hfoption></div> <h3 class="relative group"><a id="kontrol-panjang-dan-stop-sequences" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#kontrol-panjang-dan-stop-sequences"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Kontrol Panjang dan Stop Sequences</span></h3> <p data-svelte-h="svelte-tnmnza">Anda bisa mengontrol panjang generasi dan menetapkan titik berhenti:</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select"><hfoption value="tgi" label="TGI"> | |
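| <div class="code-block relative "><pre class=""><!-- HTML_TAG_START -->client.generate( | |
| <span class="hljs-string">"Hasilkan paragraf singkat"</span>, | |
| max_new_tokens=<span class="hljs-number">100</span>, | |
| min_new_tokens=<span class="hljs-number">10</span>, | |
| stop_sequences=[<span class="hljs-string">"\n\n"</span>, <span class="hljs-string">"###"</span>], | |
| )<!-- HTML_TAG_END --></pre></div> | |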
| </hfoption> | |
| <hfoption value="llama.cpp" label="llama.cpp"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Melalui API OpenAI</span> | |
| response = client.completions.create( | |
| model=<span class="hljs-string">"smollm2-1.7b-instruct"</span>, | |
| prompt=<span class="hljs-string">"Hasilkan paragraf singkat"</span>, | |
| max_tokens=<span class="hljs-number">100</span>, | |
| stop=[<span class="hljs-string">"\n\n"</span>, <span class="hljs-string">"###"</span>], | |
| ) | |
| <span class="hljs-comment"># Melalui modul langsung</span> | |
| output = llm(<span class="hljs-string">"Hasilkan paragraf singkat"</span>, max_tokens=<span class="hljs-number">100</span>, stop=[<span class="hljs-string">"\n\n"</span>, <span class="hljs-string">"###"</span>])<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="vllm" label="vLLM"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->params = SamplingParams( | |
| max_tokens=<span class="hljs-number">100</span>, | |
| min_tokens=<span class="hljs-number">10</span>, | |
| stop=[<span class="hljs-string">"###"</span>, <span class="hljs-string">"\n\n"</span>], | |
| ignore_eos=<span class="hljs-literal">False</span>, | |
| skip_special_tokens=<span class="hljs-literal">True</span>, | |
| )<!-- HTML_TAG_END --></pre></div> | |
| </hfoption></div> <h2 class="relative group"><a id="pengelolaan-memori" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#pengelolaan-memori"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Pengelolaan Memori</span></h2> <p data-svelte-h="svelte-4f5155">Ketiga kerangka kerja menerapkan teknik manajemen memori canggih untuk efisiensi inferensi.</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select"><hfoption value="tgi" label="TGI"> | |
| <p data-svelte-h="svelte-iqkcj8">TGI menggunakan Flash Attention 2 dan continuous batching:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->docker run --gpus all -p 8080:80 \ | |
| --shm-size 1g \ | |
| ghcr.io/huggingface/text-generation-inference:latest \ | |
| --model-id HuggingFaceTB/SmolLM2-1.7B-Instruct \ | |
| --max-batch-total-tokens 8192 \ | |
| --max-input-length 4096<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="llama.cpp" label="llama.cpp"> | |
| <p data-svelte-h="svelte-x52dpz">llama.cpp menggunakan kuantisasi dan tata letak memori yang dioptimalkan:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->./server \ | |
| -m smollm2-1.7b-instruct.Q4_K_M.gguf \ | |
| --host 0.0.0.0 \ | |
| --port 8080 \ | |
| -c 2048 \ <span class="hljs-comment"># Ukuran konteks</span> | |
| --threads 4 \ <span class="hljs-comment"># Thread CPU</span> | |
| --n-gpu-layers 32 \ <span class="hljs-comment"># Gunakan lebih banyak lapisan GPU untuk model yang lebih besar</span> | |
| --mlock \ <span class="hljs-comment"># Kunci memori untuk mencegah swapping</span> | |
| --cont-batching <span class="hljs-comment"># Aktifkan batching kontinu</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1lh932n">Untuk model yang terlalu besar bagi GPU Anda, gunakan offloading ke CPU:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->./server \ | |
| -m smollm2-1.7b-instruct.Q4_K_M.gguf \ | |
| --n-gpu-layers 20 \ <span class="hljs-comment"># Simpan 20 lapisan pertama di GPU</span> | |
| --threads 8 <span class="hljs-comment"># Gunakan lebih banyak thread CPU untuk lapisan CPU</span><!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="vllm" label="vLLM"> | |
| <p data-svelte-h="svelte-v17cca">vLLM menggunakan PagedAttention untuk manajemen memori optimal:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> vllm.engine.arg_utils <span class="hljs-keyword">import</span> AsyncEngineArgs | |
| <span class="hljs-keyword">from</span> vllm <span class="hljs-keyword">import</span> AsyncLLMEngine | |
| engine_args = AsyncEngineArgs( | |
| model=<span class="hljs-string">"HuggingFaceTB/SmolLM2-1.7B-Instruct"</span>, | |
| gpu_memory_utilization=<span class="hljs-number">0.85</span>, | |
| max_num_batched_tokens=<span class="hljs-number">8192</span>, | |
| block_size=<span class="hljs-number">16</span>, | |
| ) | |
| llm = AsyncLLMEngine.from_engine_args(engine_args)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption></div> <h2 class="relative group"><a id="sumber" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#sumber"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Sumber</span></h2> <ul data-svelte-h="svelte-1eojzuk"><li><a href="https://huggingface.co/docs/text-generation-inference" rel="nofollow">Dokumentasi Text Generation Inference</a></li> <li><a href="https://github.com/huggingface/text-generation-inference" rel="nofollow">Repositori GitHub TGI</a></li> <li><a href="https://vllm.readthedocs.io/" rel="nofollow">Dokumentasi vLLM</a></li> <li><a href="https://github.com/vllm-project/vllm" rel="nofollow">Repositori GitHub vLLM</a></li> <li><a href="https://arxiv.org/abs/2309.06180" rel="nofollow">Makalah PagedAttention</a></li> <li><a href="https://github.com/ggerganov/llama.cpp" rel="nofollow">Repositori llama.cpp</a></li> <li><a href="https://github.com/abetlen/llama-cpp-python" rel="nofollow">Repositori llama-cpp-python</a></li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/id/chapter2/8.mdx" target="_blank"><span 
data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
// Client bootstrap for this page: publish the path configuration, load the
// runtime and app entry modules, then start the app on this script's parent element.
// The surrounding braces form a plain block, so the `const` bindings stay local to it.
| { | |
// Assigned without a declaration, so in this classic (non-module) script it
// becomes a global. Presumably the imported runtime reads it; confirm in start.js.
| __sveltekit_ojy514 = { | |
| assets: "/docs/course/pr_1054/id", | |
| base: "/docs/course/pr_1054/id", | |
| env: {} | |
| }; | |
// document.currentScript is only set while this script executes synchronously,
// so the mount element is captured here rather than inside the promise callback.
| const element = document.currentScript.parentElement; | |
// Two null entries, the same count as `node_ids` below (presumably one slot per
// node, with no preloaded data; confirm against the runtime).
| const data = [null,null]; | |
// Both entry modules are fetched in parallel; nothing starts until both resolve.
| Promise.all([ | |
| import("/docs/course/pr_1054/id/_app/immutable/entry/start.4f92af03.js"), | |
| import("/docs/course/pr_1054/id/_app/immutable/entry/app.19cef1b6.js") | |
| ]).then(([kit, app]) => { | |
// Hand the app module, the mount element and the start options to the runtime.
| kit.start(app, element, { | |
| node_ids: [0, 21], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 105 kB
- Xet hash:
- 5d417586805d6784b8f377abd0ed59776a2eb894eab3735c4cacc83f2e96d671
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.