Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Optimization လုပ်ထားသော Inference Deployment","local":"optimized-inference-deployment","sections":[{"title":"Framework ရွေးချယ်မှု လမ်းညွှန်","local":"framework-selection-guide","sections":[{"title":"Memory Management နှင့် Performance","local":"memory-management-and-performance","sections":[],"depth":3},{"title":"Deployment နှင့် Integration","local":"deployment-and-integration","sections":[],"depth":3}],"depth":2},{"title":"စတင်ခြင်း","local":"getting-started","sections":[{"title":"Installation နှင့် Basic Setup","local":"installation-and-basic-setup","sections":[],"depth":3},{"title":"Basic Text Generation","local":"basic-text-generation","sections":[],"depth":3}],"depth":2},{"title":"Advanced Generation Control","local":"advanced-generation-control","sections":[{"title":"Token Selection နှင့် Sampling","local":"token-selection-and-sampling","sections":[],"depth":3},{"title":"ထပ်ခါတလဲလဲ ဖြစ်မှုကို ထိန်းချုပ်ခြင်း","local":"controlling-repetition","sections":[],"depth":3},{"title":"အရှည် ထိန်းချုပ်ခြင်းနှင့် ရပ်တန့်ခြင်း Sequences များ","local":"length-control-and-stop-sequences","sections":[],"depth":3}],"depth":2},{"title":"Memory Management","local":"memory-management","sections":[],"depth":2},{"title":"အရင်းအမြစ်များ","local":"resources","sections":[],"depth":2},{"title":"ဝေါဟာရ ရှင်းလင်းချက် (Glossary)","local":"ဝဟရ-ရငလငခက-glossary","sections":[],"depth":2}],"depth":1}"> | |
| <link href="/docs/course/pr_1107/my/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/course/pr_1107/my/_app/immutable/entry/start.5c6233a8.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1107/my/_app/immutable/chunks/scheduler.0835143d.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1107/my/_app/immutable/chunks/singletons.c8b11329.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1107/my/_app/immutable/chunks/index.1bab75e2.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1107/my/_app/immutable/chunks/paths.e4a366ea.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1107/my/_app/immutable/entry/app.55586789.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1107/my/_app/immutable/chunks/preload-helper.5f7c8393.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1107/my/_app/immutable/chunks/index.3d7efe79.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1107/my/_app/immutable/nodes/0.0cec3d6c.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1107/my/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1107/my/_app/immutable/nodes/21.21542875.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1107/my/_app/immutable/chunks/Tip.55488f5f.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1107/my/_app/immutable/chunks/CodeBlock.116ed840.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1107/my/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.0b02b772.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1107/my/_app/immutable/chunks/stores.6af6d5ae.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Optimization လုပ်ထားသော Inference Deployment","local":"optimized-inference-deployment","sections":[{"title":"Framework ရွေးချယ်မှု လမ်းညွှန်","local":"framework-selection-guide","sections":[{"title":"Memory Management နှင့် Performance","local":"memory-management-and-performance","sections":[],"depth":3},{"title":"Deployment နှင့် Integration","local":"deployment-and-integration","sections":[],"depth":3}],"depth":2},{"title":"စတင်ခြင်း","local":"getting-started","sections":[{"title":"Installation နှင့် Basic Setup","local":"installation-and-basic-setup","sections":[],"depth":3},{"title":"Basic Text Generation","local":"basic-text-generation","sections":[],"depth":3}],"depth":2},{"title":"Advanced Generation Control","local":"advanced-generation-control","sections":[{"title":"Token Selection နှင့် Sampling","local":"token-selection-and-sampling","sections":[],"depth":3},{"title":"ထပ်ခါတလဲလဲ ဖြစ်မှုကို ထိန်းချုပ်ခြင်း","local":"controlling-repetition","sections":[],"depth":3},{"title":"အရှည် ထိန်းချုပ်ခြင်းနှင့် ရပ်တန့်ခြင်း Sequences များ","local":"length-control-and-stop-sequences","sections":[],"depth":3}],"depth":2},{"title":"Memory Management","local":"memory-management","sections":[],"depth":2},{"title":"အရင်းအမြစ်များ","local":"resources","sections":[],"depth":2},{"title":"ဝေါဟာရ ရှင်းလင်းချက် (Glossary)","local":"ဝဟရ-ရငလငခက-glossary","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="optimized-inference-deployment" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#optimized-inference-deployment"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Optimization လုပ်ထားသော Inference Deployment</span></h1> <p data-svelte-h="svelte-1b8n5la">ဒီအပိုင်းမှာတော့ LLM deployments တွေကို optimization လုပ်ဖို့အတွက် အဆင့်မြင့် frameworks တွေဖြစ်တဲ့ Text Generation Inference (TGI), vLLM, နဲ့ llama.cpp တို့ကို လေ့လာသွားပါမယ်။ ဒီ application တွေက အဓိကအားဖြင့် ထုတ်လုပ်မှု ပတ်ဝန်းကျင် (production environments) တွေမှာ LLM တွေကို သုံးစွဲသူများဆီသို့ ဝန်ဆောင်မှုပေးဖို့ အသုံးပြုကြပါတယ်။ ဒီအပိုင်းက ဒီ frameworks တွေကို production မှာ ဘယ်လို deploy လုပ်ရမယ်ဆိုတာကို အဓိကထားပြီး၊ single machine တစ်ခုပေါ်မှာ inference အတွက် ဘယ်လိုအသုံးပြုရမယ်ဆိုတာကို အာရုံစိုက်ထားခြင်း မရှိပါဘူး။</p> <p data-svelte-h="svelte-1ysm94f">ဒီ tools တွေက inference efficiency ကို ဘယ်လိုအမြင့်ဆုံးမြှင့်တင်ပြီး Large Language Models တွေကို production deployments တွေကို ဘယ်လို ရိုးရှင်းအောင် လုပ်ဆောင်တယ်ဆိုတာကို ကျွန်တော်တို့ ဖော်ပြပေးပါမယ်။</p> <h2 class="relative group"><a id="framework-selection-guide" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#framework-selection-guide"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Framework ရွေးချယ်မှု လမ်းညွှန်</span></h2> <p data-svelte-h="svelte-1lnmlju">TGI, vLLM, နဲ့ llama.cpp တို့ဟာ ရည်ရွယ်ချက်ချင်း တူညီပေမယ့်၊ မတူညီတဲ့ အသုံးပြုမှုပုံစံတွေအတွက် ပိုမိုသင့်လျော်စေတဲ့ ထူးခြားတဲ့ အင်္ဂါရပ်တွေ ရှိပါတယ်။ ၎င်းတို့ကြားက အဓိက ကွာခြားချက်တွေကို စွမ်းဆောင်ရည် (performance) နဲ့ ပေါင်းစပ်မှု (integration) ကို အာရုံစိုက်ပြီး ကြည့်ရအောင်။</p> <h3 class="relative group"><a id="memory-management-and-performance" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#memory-management-and-performance"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Memory Management နှင့် Performance</span></h3> <p data-svelte-h="svelte-134fu0m"><strong>TGI</strong> ကို production မှာ တည်ငြိမ်ပြီး ခန့်မှန်းနိုင်စေဖို့ ဒီဇိုင်းထုတ်ထားပြီး၊ memory အသုံးပြုမှုကို တသမတ်တည်း ထိန်းထားဖို့အတွက် fixed sequence lengths တွေကို အသုံးပြုပါတယ်။ TGI က Flash Attention 2 နဲ့ continuous batching နည်းစနစ်တွေကို အသုံးပြုပြီး memory ကို စီမံခန့်ခွဲပါတယ်။ ဒါက ၎င်းသည် attention calculations တွေကို အလွန်ထိထိရောက်ရောက် လုပ်ဆောင်နိုင်ပြီး GPU ကို အလုပ်တွေ အဆက်မပြတ် ပေးခြင်းဖြင့် အလုပ်များနေအောင် ထိန်းထားနိုင်တယ်လို့ ဆိုလိုပါတယ်။ လိုအပ်တဲ့အခါ စနစ်က model ရဲ့ အစိတ်အပိုင်းတွေကို CPU နဲ့ GPU ကြား ရွှေ့ပြောင်းနိုင်တာကြောင့် ပိုကြီးတဲ့ model တွေကို ကိုင်တွယ်ရာမှာ အထောက်အကူ ဖြစ်စေပါတယ်။</p> <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/tgi/flash-attn.png" alt="Flash Attention"> <blockquote class="tip"><p data-svelte-h="svelte-eyijjm">Flash Attention ဆိုတာ transformer models တွေမှာ attention mechanism ကို memory bandwidth bottlenecks တွေကို ဖြေရှင်းပေးခြင်းဖြင့် optimization လုပ်တဲ့ နည်းပညာတစ်ခုပါ။ <a href="/course/chapter1/8">Chapter 1.8</a> မှာ ယခင်က ဆွေးနွေးခဲ့သလိုပဲ၊ attention mechanism မှာ quadratic complexity နဲ့ memory usage ရှိတာကြောင့် ရှည်လျားတဲ့ sequences တွေအတွက် ထိရောက်မှု မရှိပါဘူး။</p> <p data-svelte-h="svelte-l0mzhw">အဓိက တီထွင်မှုကတော့ High Bandwidth Memory (HBM) နဲ့ ပိုမြန်တဲ့ SRAM cache ကြား memory transfers တွေကို ဘယ်လို စီမံခန့်ခွဲလဲဆိုတဲ့ အချက်မှာပါပဲ။ ရိုးရာ attention နည်းလမ်းက HBM နဲ့ SRAM ကြား ဒေတာတွေကို အကြိမ်ကြိမ် transfer လုပ်တာကြောင့် GPU ကို အလုပ်မရှိဘဲ ထားခြင်းဖြင့် bottlenecks တွေ ဖြစ်စေပါတယ်။ Flash Attention က ဒေတာတွေကို SRAM ထဲကို တစ်ကြိမ်တည်း load လုပ်ပြီး အဲဒီမှာပဲ calculations တွေအားလုံးကို လုပ်ဆောင်တာကြောင့် ကုန်ကျစရိတ်များတဲ့ memory transfers တွေကို လျှော့ချပေးပါတယ်။</p> <p data-svelte-h="svelte-1atvhen">အကျိုးကျေးဇူးတွေက training လုပ်နေစဉ်မှာ အရေးအပါဆုံး ဖြစ်ပေမယ့်၊ Flash Attention ရဲ့ လျှော့ချထားတဲ့ VRAM အသုံးပြုမှုနဲ့ တိုးတက်လာတဲ့ efficiency က inference အတွက်ပါ အဖိုးတန်စေပြီး၊ ပိုမိုမြန်ဆန်ပြီး ပိုမို scalable ဖြစ်တဲ့ LLM serving ကို ဖြစ်ပေါ်စေပါတယ်။</p></blockquote> <p data-svelte-h="svelte-1ve6hmj"><strong>vLLM</strong> က PagedAttention ကို အသုံးပြုပြီး မတူညီတဲ့ နည်းလမ်းတစ်ခုကို အသုံးပြုပါတယ်။ ကွန်ပျူတာက memory ကို pages တွေနဲ့ စီမံခန့်ခွဲသလိုပဲ၊ vLLM က model ရဲ့ memory ကို ပိုသေးငယ်တဲ့ blocks တွေအဖြစ် ပိုင်းခြားပါတယ်။ ဒီ clever system ကြောင့် ၎င်းသည် မတူညီတဲ့ အရွယ်အစားရှိတဲ့ requests တွေကို ပိုမိုပြောင်းလွယ်ပြင်လွယ် ကိုင်တွယ်နိုင်ပြီး memory space ကို မဖြုန်းတီးပါဘူး။ ဒါက မတူညီတဲ့ requests တွေကြား memory ကို မျှဝေရာမှာ အထူးကောင်းမွန်ပြီး memory fragmentation ကို လျှော့ချပေးတာကြောင့် စနစ်တစ်ခုလုံးကို ပိုမိုထိရောက်စေပါတယ်။</p> <blockquote class="tip"><p data-svelte-h="svelte-4vcn0k">PagedAttention ဆိုတာ LLM inference မှာ နောက်ထပ် အရေးကြီးတဲ့ bottleneck တစ်ခုဖြစ်တဲ့ KV cache memory management ကို ဖြေရှင်းပေးတဲ့ နည်းပညာတစ်ခုပါ။ <a href="/course/chapter1/8">Chapter 1.8</a> မှာ ဆွေးနွေးခဲ့သလိုပဲ၊ text generation လုပ်နေစဉ်မှာ model က attention keys နဲ့ values (KV cache) တွေကို ထုတ်လုပ်လိုက်တဲ့ token တစ်ခုစီအတွက် သိမ်းဆည်းထားပြီး ထပ်ခါတလဲလဲ တွက်ချက်မှုတွေကို လျှော့ချပါတယ်။ KV cache က အထူးသဖြင့် ရှည်လျားတဲ့ sequences တွေ ဒါမှမဟုတ် concurrent requests များစွာနဲ့ဆိုရင် အလွန်ကြီးမားလာနိုင်ပါတယ်။</p> <p data-svelte-h="svelte-ivgi6v">vLLM ရဲ့ အဓိက တီထွင်မှုကတော့ ဒီ cache ကို ဘယ်လို စီမံခန့်ခွဲလဲဆိုတဲ့ အချက်မှာပါပဲ-</p> <ol data-svelte-h="svelte-127i6kz"><li><strong>Memory Paging</strong>: KV cache ကို ကြီးမားတဲ့ block တစ်ခုအဖြစ် မမှတ်ယူဘဲ၊ ၎င်းကို fixed-size “pages” တွေအဖြစ် ပိုင်းခြားထားပါတယ် (operating systems တွေမှာ virtual memory နဲ့ ဆင်တူပါတယ်)။</li> <li><strong>Non-contiguous Storage</strong>: Pages တွေကို GPU memory မှာ ဆက်တိုက် သိမ်းဆည်းထားဖို့ မလိုအပ်တာကြောင့် ပိုမိုပြောင်းလွယ်ပြင်လွယ်ရှိတဲ့ memory allocation ကို ဖြစ်ပေါ်စေပါတယ်။</li> <li><strong>Page Table Management</strong>: Page table တစ်ခုက ဘယ် pages တွေက ဘယ် sequence နဲ့ သက်ဆိုင်တယ်ဆိုတာကို ခြေရာခံပြီး၊ ထိရောက်တဲ့ lookup နဲ့ access ကို ဖြစ်ပေါ်စေပါတယ်။</li> <li><strong>Memory Sharing</strong>: parallel sampling လို လုပ်ငန်းတွေအတွက်၊ prompt အတွက် KV cache ကို သိမ်းဆည်းထားတဲ့ pages တွေကို sequences များစွာမှာ မျှဝေအသုံးပြုနိုင်ပါတယ်။</li></ol> <p data-svelte-h="svelte-1vcmkmw">PagedAttention နည်းလမ်းက ရိုးရာနည်းလမ်းတွေနဲ့ နှိုင်းယှဉ်ရင် throughput ကို ၂၄ ဆအထိ ပိုမိုမြင့်မားစေနိုင်တာကြောင့် production LLM deployments တွေအတွက် game-changer တစ်ခု ဖြစ်ပါတယ်။ PagedAttention ဘယ်လိုအလုပ်လုပ်တယ်ဆိုတာကို တကယ်နက်နက်နဲနဲ လေ့လာချင်တယ်ဆိုရင် <a href="https://docs.vllm.ai/en/latest/design/kernel/paged_attention.html" rel="nofollow">vLLM documentation ရဲ့ လမ်းညွှန်</a> ကို ဖတ်ရှုနိုင်ပါတယ်။</p></blockquote> <p data-svelte-h="svelte-jyinoy"><strong>llama.cpp</strong> ဟာ မူလက LLaMA models တွေကို consumer hardware တွေမှာ run ဖို့ ဒီဇိုင်းထုတ်ထားတဲ့ highly optimized C/C++ implementation တစ်ခုပါ။ ဒါက optional GPU acceleration ပါဝင်တဲ့ CPU efficiency ကို အာရုံစိုက်ပြီး၊ resource-constrained environments တွေအတွက် အကောင်းဆုံးပါပဲ။ llama.cpp က model size နဲ့ memory requirements တွေကို လျှော့ချဖို့အတွက် quantization နည်းစနစ်တွေကို အသုံးပြုပြီး ကောင်းမွန်တဲ့ performance ကို ထိန်းသိမ်းထားပါတယ်။ ဒါက အမျိုးမျိုးသော CPU architectures တွေအတွက် optimized kernels တွေကို implement လုပ်ထားပြီး၊ ထိရောက်တဲ့ token generation အတွက် basic KV cache management ကို ထောက်ပံ့ပေးပါတယ်။</p> <blockquote class="tip"><p data-svelte-h="svelte-1mah3u8">llama.cpp မှာ Quantization ဆိုတာ model weights တွေရဲ့ precision ကို 32-bit ဒါမှမဟုတ် 16-bit floating point ကနေ 8-bit integers (INT8)၊ 4-bit ဒါမှမဟုတ် ပိုနိမ့်တဲ့ precision formats တွေအဖြစ် လျှော့ချတာပါ။ ဒါက memory အသုံးပြုမှုကို သိသိသာသာ လျှော့ချပေးပြီး အနည်းဆုံး အရည်အသွေး ဆုံးရှုံးမှုနဲ့အတူ inference speed ကို မြှင့်တင်ပေးပါတယ်။</p> <p data-svelte-h="svelte-1o55l49">llama.cpp မှာ အဓိက quantization features တွေကတော့-</p> <ol data-svelte-h="svelte-hudfu2"><li><strong>Multiple Quantization Levels</strong>: 8-bit, 4-bit, 3-bit, နဲ့ 2-bit quantization ကိုပါ ထောက်ပံ့ပေးပါတယ်။</li> <li><strong>GGML/GGUF Format</strong>: Quantized inference အတွက် optimization လုပ်ထားတဲ့ custom tensor formats တွေကို အသုံးပြုပါတယ်။</li> <li><strong>Mixed Precision</strong>: Model ရဲ့ မတူညီတဲ့ အစိတ်အပိုင်းတွေမှာ မတူညီတဲ့ quantization levels တွေကို အသုံးပြုနိုင်ပါတယ်။</li> <li><strong>Hardware-Specific Optimizations</strong>: အမျိုးမျိုးသော CPU architectures တွေ (AVX2, AVX-512, NEON) အတွက် optimized code paths တွေ ပါဝင်ပါတယ်။</li></ol> <p data-svelte-h="svelte-1nz7s7s">ဒီနည်းလမ်းက limited memory ရှိတဲ့ consumer hardware တွေမှာ billion-parameter models တွေကို run နိုင်စေပြီး၊ local deployments နဲ့ edge devices တွေအတွက် အကောင်းဆုံး ဖြစ်စေပါတယ်။</p></blockquote> <h3 class="relative group"><a id="deployment-and-integration" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#deployment-and-integration"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Deployment နှင့် Integration</span></h3> <p data-svelte-h="svelte-dp4rs">frameworks တွေကြားက deployment နဲ့ integration ကွာခြားချက်တွေကို ဆက်သွားရအောင်။</p> <p data-svelte-h="svelte-1ezs85m"><strong>TGI</strong> က သူ့ရဲ့ production-ready features တွေနဲ့ enterprise-level deployment တွေမှာ ထူးချွန်ပါတယ်။ ဒါက built-in Kubernetes support နဲ့ Prometheus နဲ့ Grafana ကနေတဆင့် monitoring လုပ်ခြင်း၊ automatic scaling, နဲ့ ပြည့်စုံတဲ့ safety features တွေလို production မှာ run ဖို့ လိုအပ်တဲ့ အရာအားလုံး ပါဝင်ပါတယ်။ စနစ်က enterprise-grade logging နဲ့ content filtering နဲ့ rate limiting လိုမျိုး အမျိုးမျိုးသော ကာကွယ်မှု နည်းလမ်းတွေပါ ပါဝင်တာကြောင့် သင့် deployment ကို လုံခြုံပြီး တည်ငြိမ်အောင် ထိန်းထားနိုင်ပါတယ်။</p> <p data-svelte-h="svelte-t8xpqw"><strong>vLLM</strong> က deployment အတွက် ပိုမိုပြောင်းလွယ်ပြင်လွယ်ရှိပြီး developer-friendly ဖြစ်တဲ့ ချဉ်းကပ်မှုကို အသုံးပြုပါတယ်။ ဒါက Python ကို အဓိကထားပြီး တည်ဆောက်ထားတာကြောင့် သင့်ရဲ့ လက်ရှိ application တွေမှာ OpenAI ရဲ့ API ကို အလွယ်တကူ အစားထိုးနိုင်ပါတယ်။ framework က raw performance ကို ပေးစွမ်းဖို့ အာရုံစိုက်ပြီး၊ သင့်ရဲ့ သီးခြားလိုအပ်ချက်တွေနဲ့ ကိုက်ညီအောင် စိတ်ကြိုက်ပြင်ဆင်နိုင်ပါတယ်။ ဒါက clusters တွေကို စီမံခန့်ခွဲဖို့အတွက် Ray နဲ့ အထူးကောင်းမွန်စွာ အလုပ်လုပ်တာကြောင့် high performance နဲ့ adaptability လိုအပ်တဲ့အခါ အကောင်းဆုံး ရွေးချယ်မှုတစ်ခု ဖြစ်ပါတယ်။</p> <p data-svelte-h="svelte-1mp95i6"><strong>llama.cpp</strong> က ရိုးရှင်းမှုနဲ့ portability ကို ဦးစားပေးပါတယ်။ သူ့ရဲ့ server implementation က ပေါ့ပါးပြီး hardware အမျိုးမျိုး (powerful servers တွေကနေ consumer laptops တွေနဲ့ အချို့ high-end mobile devices တွေအထိ) မှာ run နိုင်ပါတယ်။ အနည်းဆုံး dependencies တွေနဲ့ ရိုးရှင်းတဲ့ C/C++ core နဲ့ဆိုရင်၊ Python frameworks တွေ install လုပ်ဖို့ ခက်ခဲတဲ့ environments တွေမှာ deploy လုပ်ဖို့ လွယ်ကူပါတယ်။ server က OpenAI-compatible API ကို ပံ့ပိုးပေးထားပြီး အခြား solution တွေထက် resource အသုံးပြုမှုက အများကြီး သေးငယ်ပါတယ်။</p> <h2 class="relative group"><a id="getting-started" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#getting-started"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>စတင်ခြင်း</span></h2> <p data-svelte-h="svelte-g23p2e">LLMs တွေကို deploy လုပ်ဖို့ ဒီ frameworks တွေကို ဘယ်လိုအသုံးပြုရမလဲဆိုတာကို လေ့လာကြည့်ရအောင်။ installation နဲ့ basic setup ကနေ စတင်ပါမယ်။</p> <h3 class="relative group"><a id="installation-and-basic-setup" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#installation-and-basic-setup"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Installation နှင့် Basic Setup</span></h3> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select"><hfoption value="tgi" label="TGI"> | |
| <p data-svelte-h="svelte-z9my9k">TGI က install လုပ်ဖို့နဲ့ အသုံးပြုဖို့ လွယ်ကူပြီး၊ Hugging Face ecosystem ထဲမှာ နက်ရှိုင်းစွာ ပေါင်းစပ်ထားပါတယ်။</p> <p data-svelte-h="svelte-1i00898">ပထမဆုံး၊ Docker ကို အသုံးပြုပြီး TGI server ကို launch လုပ်ပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->docker run --gpus all \ | |
| --shm-size 1g \ | |
| -p 8080:80 \ | |
| -v ~/.cache/huggingface:/data \ | |
| ghcr.io/huggingface/text-generation-inference:latest \ | |
| --model-id HuggingFaceTB/SmolLM2-360M-Instruct<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-14je8eg">အဲဒီနောက် Hugging Face ရဲ့ InferenceClient ကို အသုံးပြုပြီး အပြန်အလှန်ဆက်သွယ်ပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient | |
| <span class="hljs-comment"># TGI endpoint ကို ညွှန်ပြပြီး client ကို Initialize လုပ်ပါ။</span> | |
| client = InferenceClient( | |
| model=<span class="hljs-string">"http://localhost:8080"</span>, <span class="hljs-comment"># URL to the TGI server</span> | |
| ) | |
| <span class="hljs-comment"># Text generation</span> | |
| response = client.text_generation( | |
| <span class="hljs-string">"Tell me a story"</span>, | |
| max_new_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| details=<span class="hljs-literal">True</span>, | |
| stop_sequences=[], | |
| ) | |
| <span class="hljs-built_in">print</span>(response.generated_text) | |
| <span class="hljs-comment"># For chat format</span> | |
| response = client.chat_completion( | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a helpful assistant."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Tell me a story"</span>}, | |
| ], | |
| max_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1otxgnc">တနည်းအားဖြင့် OpenAI client ကို အသုံးပြုနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI | |
| <span class="hljs-comment"># Initialize client pointing to TGI endpoint</span> | |
| client = OpenAI( | |
| base_url=<span class="hljs-string">"http://localhost:8080/v1"</span>, <span class="hljs-comment"># /v1 ကို ထည့်သွင်းဖို့ သေချာပါစေ။</span> | |
| api_key=<span class="hljs-string">"not-needed"</span>, <span class="hljs-comment"># TGI က default အားဖြင့် API key မလိုအပ်ပါဘူး။</span> | |
| ) | |
| <span class="hljs-comment"># Chat completion</span> | |
| response = client.chat.completions.create( | |
| model=<span class="hljs-string">"HuggingFaceTB/SmolLM2-360M-Instruct"</span>, | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a helpful assistant."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Tell me a story"</span>}, | |
| ], | |
| max_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="llama.cpp" label="llama.cpp"> | |
| <p data-svelte-h="svelte-jpfqtr">llama.cpp က install လုပ်ဖို့နဲ့ အသုံးပြုဖို့ လွယ်ကူပြီး၊ အနည်းဆုံး dependencies တွေပဲ လိုအပ်ကာ CPU နဲ့ GPU inference နှစ်ခုလုံးကို ထောက်ပံ့ပါတယ်။</p> <p data-svelte-h="svelte-nshxsv">ပထမဆုံး၊ llama.cpp ကို install လုပ်ပြီး build လုပ်ပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Repository ကို clone လုပ်ပါ။</span> | |
| git <span class="hljs-built_in">clone</span> https://github.com/ggerganov/llama.cpp | |
| <span class="hljs-built_in">cd</span> llama.cpp | |
| <span class="hljs-comment"># Project ကို build လုပ်ပါ။</span> | |
| make | |
| <span class="hljs-comment"># SmolLM2-1.7B-Instruct-GGUF model ကို download လုပ်ပါ။</span> | |
| curl -L -O https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct-GGUF/resolve/main/smollm2-1.7b-instruct.Q4_K_M.gguf<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-fg3dt9">အဲဒီနောက် server ကို launch လုပ်ပါ (OpenAI API compatibility နဲ့)။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Start the server</span> | |
| ./server \ | |
| -m smollm2-1.7b-instruct.Q4_K_M.gguf \ | |
| --host 0.0.0.0 \ | |
| --port 8080 \ | |
| -c 4096 \ | |
| --n-gpu-layers 0 <span class="hljs-comment"># GPU ကို အသုံးပြုရန်အတွက် ပိုမိုမြင့်မားသော နံပါတ်ကို သတ်မှတ်ပါ။</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1alc7yw">Hugging Face ရဲ့ InferenceClient ကို အသုံးပြုပြီး server နဲ့ အပြန်အလှန်ဆက်သွယ်ပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient | |
| <span class="hljs-comment"># llama.cpp server ကို ညွှန်ပြပြီး client ကို Initialize လုပ်ပါ။</span> | |
| client = InferenceClient( | |
| model=<span class="hljs-string">"http://localhost:8080/v1"</span>, <span class="hljs-comment"># llama.cpp server ရဲ့ URL</span> | |
| token=<span class="hljs-string">"sk-no-key-required"</span>, <span class="hljs-comment"># llama.cpp server က ဒီ placeholder ကို လိုအပ်ပါတယ်။</span> | |
| ) | |
| <span class="hljs-comment"># Text generation</span> | |
| response = client.text_generation( | |
| <span class="hljs-string">"Tell me a story"</span>, | |
| max_new_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| details=<span class="hljs-literal">True</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.generated_text) | |
| <span class="hljs-comment"># Chat format အတွက်</span> | |
| response = client.chat_completion( | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a helpful assistant."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Tell me a story"</span>}, | |
| ], | |
| max_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1otxgnc">တနည်းအားဖြင့် OpenAI client ကို အသုံးပြုနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI | |
| <span class="hljs-comment"># llama.cpp server ကို ညွှန်ပြပြီး client ကို Initialize လုပ်ပါ။</span> | |
| client = OpenAI( | |
| base_url=<span class="hljs-string">"http://localhost:8080/v1"</span>, | |
| api_key=<span class="hljs-string">"sk-no-key-required"</span>, <span class="hljs-comment"># llama.cpp server က ဒီ placeholder ကို လိုအပ်ပါတယ်။</span> | |
| ) | |
| <span class="hljs-comment"># Chat completion</span> | |
| response = client.chat.completions.create( | |
| model=<span class="hljs-string">"smollm2-1.7b-instruct"</span>, <span class="hljs-comment"># Server က model တစ်ခုတည်းသာ load လုပ်တာကြောင့် model identifier က ဘာပဲဖြစ်ဖြစ် ရပါတယ်။</span> | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a helpful assistant."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Tell me a story"</span>}, | |
| ], | |
| max_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="vllm" label="vLLM"> | |
| <p data-svelte-h="svelte-1nk2xsy">vLLM က install လုပ်ဖို့နဲ့ အသုံးပြုဖို့ လွယ်ကူပြီး၊ OpenAI API compatibility နဲ့ native Python interface နှစ်ခုလုံး ပါဝင်ပါတယ်။</p> <p data-svelte-h="svelte-ch3xxx">ပထမဆုံး၊ vLLM OpenAI-compatible server ကို launch လုပ်ပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->python -m vllm.entrypoints.openai.api_server \ | |
| --model HuggingFaceTB/SmolLM2-360M-Instruct \ | |
| --host 0.0.0.0 \ | |
| --port 8000<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-14je8eg">အဲဒီနောက် Hugging Face ရဲ့ InferenceClient ကို အသုံးပြုပြီး အပြန်အလှန်ဆက်သွယ်ပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient | |
| <span class="hljs-comment"># vLLM endpoint ကို ညွှန်ပြပြီး client ကို Initialize လုပ်ပါ။</span> | |
| client = InferenceClient( | |
| model=<span class="hljs-string">"http://localhost:8000/v1"</span>, <span class="hljs-comment"># vLLM server ရဲ့ URL</span> | |
| ) | |
| <span class="hljs-comment"># Text generation</span> | |
| response = client.text_generation( | |
| <span class="hljs-string">"Tell me a story"</span>, | |
| max_new_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| details=<span class="hljs-literal">True</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.generated_text) | |
| <span class="hljs-comment"># Chat format အတွက်</span> | |
| response = client.chat_completion( | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a helpful assistant."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Tell me a story"</span>}, | |
| ], | |
| max_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1otxgnc">တနည်းအားဖြင့် OpenAI client ကို အသုံးပြုနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI | |
| <span class="hljs-comment"># vLLM endpoint ကို ညွှန်ပြပြီး client ကို Initialize လုပ်ပါ။</span> | |
| client = OpenAI( | |
| base_url=<span class="hljs-string">"http://localhost:8000/v1"</span>, | |
| api_key=<span class="hljs-string">"not-needed"</span>, <span class="hljs-comment"># vLLM က default အားဖြင့် API key မလိုအပ်ပါဘူး။</span> | |
| ) | |
| <span class="hljs-comment"># Chat completion</span> | |
| response = client.chat.completions.create( | |
| model=<span class="hljs-string">"HuggingFaceTB/SmolLM2-360M-Instruct"</span>, | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a helpful assistant."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Tell me a story"</span>}, | |
| ], | |
| max_tokens=<span class="hljs-number">100</span>, | |
| temperature=<span class="hljs-number">0.7</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption></div> <h3 class="relative group"><a id="basic-text-generation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#basic-text-generation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Basic Text Generation</span></h3> <p data-svelte-h="svelte-1l1f00x">frameworks တွေနဲ့ text generation ရဲ့ ဥပမာတွေကို ကြည့်ရအောင်။</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select"><hfoption value="tgi" label="TGI"> | |
| <p data-svelte-h="svelte-d6y7vd">ပထမဆုံး၊ TGI ကို အဆင့်မြင့် parameters တွေနဲ့ deploy လုပ်ပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->docker run --gpus all \ | |
| --shm-size 1g \ | |
| -p 8080:80 \ | |
| -v ~/.cache/huggingface:/data \ | |
| ghcr.io/huggingface/text-generation-inference:latest \ | |
| --model-id HuggingFaceTB/SmolLM2-360M-Instruct \ | |
| --max-total-tokens 4096 \ | |
| --max-input-length 3072 \ | |
| --max-batch-total-tokens 8192 \ | |
| --waiting-served-ratio 1.2<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-rh7jsm">ပြောင်းလွယ်ပြင်လွယ်ရှိတဲ့ text generation အတွက် InferenceClient ကို အသုံးပြုပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient | |
| client = InferenceClient(model=<span class="hljs-string">"http://localhost:8080"</span>) | |
| <span class="hljs-comment"># အဆင့်မြင့် parameters ဥပမာ</span> | |
| response = client.chat_completion( | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a creative storyteller."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Write a creative story"</span>}, | |
| ], | |
| temperature=<span class="hljs-number">0.8</span>, | |
| max_tokens=<span class="hljs-number">200</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content) | |
| <span class="hljs-comment"># Raw text generation</span> | |
| response = client.text_generation( | |
| <span class="hljs-string">"Write a creative story about space exploration"</span>, | |
| max_new_tokens=<span class="hljs-number">200</span>, | |
| temperature=<span class="hljs-number">0.8</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| repetition_penalty=<span class="hljs-number">1.1</span>, | |
| do_sample=<span class="hljs-literal">True</span>, | |
| details=<span class="hljs-literal">True</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.generated_text)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-8r7bzz">ဒါမှမဟုတ် OpenAI client ကို အသုံးပြုပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI | |
| client = OpenAI(base_url=<span class="hljs-string">"http://localhost:8080/v1"</span>, api_key=<span class="hljs-string">"not-needed"</span>) | |
| <span class="hljs-comment"># အဆင့်မြင့် parameters ဥပမာ</span> | |
| response = client.chat.completions.create( | |
| model=<span class="hljs-string">"HuggingFaceTB/SmolLM2-360M-Instruct"</span>, | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a creative storyteller."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Write a creative story"</span>}, | |
| ], | |
| temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># Higher for more creativity</span> | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="llama.cpp" label="llama.cpp"> | |
| <p data-svelte-h="svelte-1rlgcza">llama.cpp အတွက် အဆင့်မြင့် parameters တွေကို server ကို launch လုပ်တဲ့အခါ သတ်မှတ်နိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->./server \ | |
| -m smollm2-1.7b-instruct.Q4_K_M.gguf \ | |
| --host 0.0.0.0 \ | |
| --port 8080 \ | |
| -c 4096 \ <span class="hljs-comment"># Context size</span> | |
| --threads 8 \ <span class="hljs-comment"># CPU threads to use</span> | |
| --batch-size 512 \ <span class="hljs-comment"># Batch size for prompt evaluation</span> | |
| --n-gpu-layers 0 <span class="hljs-comment"># GPU layers (0 = CPU only)</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1b0ha1x">InferenceClient ကို အသုံးပြုပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient | |
| client = InferenceClient(model=<span class="hljs-string">"http://localhost:8080/v1"</span>, token=<span class="hljs-string">"sk-no-key-required"</span>) | |
| <span class="hljs-comment"># Advanced parameters example</span> | |
| response = client.chat_completion( | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a creative storyteller."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Write a creative story"</span>}, | |
| ], | |
| temperature=<span class="hljs-number">0.8</span>, | |
| max_tokens=<span class="hljs-number">200</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content) | |
| <span class="hljs-comment"># တိုက်ရိုက် text generation အတွက်</span> | |
| response = client.text_generation( | |
| <span class="hljs-string">"Write a creative story about space exploration"</span>, | |
| max_new_tokens=<span class="hljs-number">200</span>, | |
| temperature=<span class="hljs-number">0.8</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| repetition_penalty=<span class="hljs-number">1.1</span>, | |
| details=<span class="hljs-literal">True</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.generated_text)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-5cj0js">ဒါမှမဟုတ် sampling parameters တွေကို ထိန်းချုပ်ပြီး generation လုပ်ဖို့ OpenAI client ကို အသုံးပြုပါ။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI | |
| client = OpenAI(base_url=<span class="hljs-string">"http://localhost:8080/v1"</span>, api_key=<span class="hljs-string">"sk-no-key-required"</span>) | |
| <span class="hljs-comment"># အဆင့်မြင့် parameters ဥပမာ</span> | |
| response = client.chat.completions.create( | |
| model=<span class="hljs-string">"smollm2-1.7b-instruct"</span>, | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a creative storyteller."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Write a creative story"</span>}, | |
| ], | |
| temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># ပိုမိုဖန်တီးမှုရှိရန် ပိုမိုမြင့်မားသော တန်ဖိုး</span> | |
| top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># Nucleus sampling probability</span> | |
| frequency_penalty=<span class="hljs-number">0.5</span>, <span class="hljs-comment"># မကြာခဏ ပေါ်လာသော tokens များကို ထပ်ခါတလဲလဲ မဖြစ်အောင် လျှော့ချပါ။</span> | |
| presence_penalty=<span class="hljs-number">0.5</span>, <span class="hljs-comment"># ရှိပြီးသား tokens များကို ပြန်လည်ပေါ်ထွက်မှုကို လျှော့ချပါ။</span> | |
| max_tokens=<span class="hljs-number">200</span>, <span class="hljs-comment"># အများဆုံး generation length</span> | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-n7p3ha">llama.cpp ရဲ့ native library ကိုလည်း ပိုမိုထိန်းချုပ်နိုင်ရန် အသုံးပြုနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># တိုက်ရိုက် model access အတွက် llama-cpp-python package ကို အသုံးပြုခြင်း</span> | |
| <span class="hljs-keyword">from</span> llama_cpp <span class="hljs-keyword">import</span> Llama | |
| <span class="hljs-comment"># Model ကို load လုပ်ပါ။</span> | |
| llm = Llama( | |
| model_path=<span class="hljs-string">"smollm2-1.7b-instruct.Q4_K_M.gguf"</span>, | |
| n_ctx=<span class="hljs-number">4096</span>, <span class="hljs-comment"># Context window size</span> | |
| n_threads=<span class="hljs-number">8</span>, <span class="hljs-comment"># CPU threads</span> | |
| n_gpu_layers=<span class="hljs-number">0</span>, <span class="hljs-comment"># GPU layers (0 = CPU only)</span> | |
| ) | |
| <span class="hljs-comment"># Model ရဲ့ မျှော်လင့်ထားတဲ့ format အတိုင်း prompt ကို format လုပ်ပါ။</span> | |
| prompt = <span class="hljs-string">"""<|im_start|>system | |
| You are a creative storyteller. | |
| <|im_end|> | |
| <|im_start|>user | |
| Write a creative story | |
| <|im_end|> | |
| <|im_start|>assistant | |
| """</span> | |
| <span class="hljs-comment"># တိကျတဲ့ parameter ထိန်းချုပ်မှုနဲ့ response ကို generate လုပ်ပါ။</span> | |
| output = llm( | |
| prompt, | |
| max_tokens=<span class="hljs-number">200</span>, | |
| temperature=<span class="hljs-number">0.8</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| frequency_penalty=<span class="hljs-number">0.5</span>, | |
| presence_penalty=<span class="hljs-number">0.5</span>, | |
| stop=[<span class="hljs-string">"<|im_end|>"</span>], | |
| ) | |
| <span class="hljs-built_in">print</span>(output[<span class="hljs-string">"choices"</span>][<span class="hljs-number">0</span>][<span class="hljs-string">"text"</span>])<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="vllm" label="vLLM"> | |
| <p data-svelte-h="svelte-qroqx3">vLLM နဲ့ အဆင့်မြင့် အသုံးပြုမှုအတွက် InferenceClient ကို အသုံးပြုနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> InferenceClient | |
| client = InferenceClient(model=<span class="hljs-string">"http://localhost:8000/v1"</span>) | |
| <span class="hljs-comment"># အဆင့်မြင့် parameters ဥပမာ</span> | |
| response = client.chat_completion( | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a creative storyteller."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Write a creative story"</span>}, | |
| ], | |
| temperature=<span class="hljs-number">0.8</span>, | |
| max_tokens=<span class="hljs-number">200</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content) | |
| <span class="hljs-comment"># တိုက်ရိုက် text generation အတွက်</span> | |
| response = client.text_generation( | |
| <span class="hljs-string">"Write a creative story about space exploration"</span>, | |
| max_new_tokens=<span class="hljs-number">200</span>, | |
| temperature=<span class="hljs-number">0.8</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| details=<span class="hljs-literal">True</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.generated_text)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1hbm25i">OpenAI client ကိုလည်း အသုံးပြုနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> openai <span class="hljs-keyword">import</span> OpenAI | |
| client = OpenAI(base_url=<span class="hljs-string">"http://localhost:8000/v1"</span>, api_key=<span class="hljs-string">"not-needed"</span>) | |
| <span class="hljs-comment"># အဆင့်မြင့် parameters ဥပမာ</span> | |
| response = client.chat.completions.create( | |
| model=<span class="hljs-string">"HuggingFaceTB/SmolLM2-360M-Instruct"</span>, | |
| messages=[ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a creative storyteller."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Write a creative story"</span>}, | |
| ], | |
| temperature=<span class="hljs-number">0.8</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| max_tokens=<span class="hljs-number">200</span>, | |
| ) | |
| <span class="hljs-built_in">print</span>(response.choices[<span class="hljs-number">0</span>].message.content)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ws02fz">vLLM က fine-grained control ပါဝင်တဲ့ native Python interface ကိုလည်း ပံ့ပိုးပေးပါတယ်-</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> vllm <span class="hljs-keyword">import</span> LLM, SamplingParams | |
| <span class="hljs-comment"># အဆင့်မြင့် parameters တွေနဲ့ model ကို Initialize လုပ်ပါ။</span> | |
| llm = LLM( | |
| model=<span class="hljs-string">"HuggingFaceTB/SmolLM2-360M-Instruct"</span>, | |
| gpu_memory_utilization=<span class="hljs-number">0.85</span>, | |
| max_num_batched_tokens=<span class="hljs-number">8192</span>, | |
| max_num_seqs=<span class="hljs-number">256</span>, | |
| block_size=<span class="hljs-number">16</span>, | |
| ) | |
| <span class="hljs-comment"># Sampling parameters တွေကို Configure လုပ်ပါ။</span> | |
| sampling_params = SamplingParams( | |
| temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># ပိုမိုဖန်တီးမှုရှိရန် ပိုမိုမြင့်မားသော တန်ဖိုး</span> | |
| top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># 95% ဖြစ်နိုင်ခြေအများဆုံး tokens များကို ထည့်သွင်းစဉ်းစားပါ။</span> | |
| max_tokens=<span class="hljs-number">100</span>, <span class="hljs-comment"># အများဆုံး အရှည်</span> | |
| presence_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># ထပ်ခါတလဲလဲ မဖြစ်အောင် လျှော့ချပါ။</span> | |
| frequency_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># ထပ်ခါတလဲလဲ မဖြစ်အောင် လျှော့ချပါ။</span> | |
| stop=[<span class="hljs-string">"\n\n"</span>, <span class="hljs-string">"###"</span>], <span class="hljs-comment"># Stop sequences</span> | |
| ) | |
| <span class="hljs-comment"># Text generate လုပ်ပါ။</span> | |
| prompt = <span class="hljs-string">"Write a creative story"</span> | |
| outputs = llm.generate(prompt, sampling_params) | |
| <span class="hljs-built_in">print</span>(outputs[<span class="hljs-number">0</span>].outputs[<span class="hljs-number">0</span>].text) | |
| <span class="hljs-comment"># Chat-style interactions အတွက်</span> | |
| chat_prompt = [ | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"system"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"You are a creative storyteller."</span>}, | |
| {<span class="hljs-string">"role"</span>: <span class="hljs-string">"user"</span>, <span class="hljs-string">"content"</span>: <span class="hljs-string">"Write a creative story"</span>}, | |
| ] | |
| formatted_prompt = llm.get_chat_template()(chat_prompt) <span class="hljs-comment"># Uses model's chat template</span> | |
| outputs = llm.generate(formatted_prompt, sampling_params) | |
| <span class="hljs-built_in">print</span>(outputs[<span class="hljs-number">0</span>].outputs[<span class="hljs-number">0</span>].text)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption></div> <h2 class="relative group"><a id="advanced-generation-control" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#advanced-generation-control"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Advanced Generation Control</span></h2> <h3 class="relative group"><a id="token-selection-and-sampling" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#token-selection-and-sampling"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Token Selection နှင့် Sampling</span></h3> <p data-svelte-h="svelte-s71e1i">text ကို generate လုပ်တဲ့ လုပ်ငန်းစဉ်မှာ အဆင့်တိုင်းမှာ နောက်ထပ် token ကို ရွေးချယ်တာ ပါဝင်ပါတယ်။ ဒီရွေးချယ်မှု လုပ်ငန်းစဉ်ကို parameters အမျိုးမျိုးကနေတစ်ဆင့် ထိန်းချုပ်နိုင်ပါတယ် -</p> <ol data-svelte-h="svelte-1qpbbdy"><li><strong>Raw Logits</strong>: token တစ်ခုစီအတွက် မူရင်း output probabilities များ။</li> <li><strong>Temperature</strong>: ရွေးချယ်မှုမှာရှိတဲ့ ကျပန်းဆန်မှုကို ထိန်းချုပ်ပါတယ် (ပိုမြင့်ရင် ပိုမိုဖန်တီးမှုရှိပါတယ်)။</li> <li><strong>Top-p (Nucleus) Sampling</strong>: ဖြစ်နိုင်ခြေပမာဏ X% ကို ဖွဲ့စည်းထားတဲ့ ထိပ်ဆုံး tokens တွေကို စစ်ထုတ်ပါတယ်။</li> <li><strong>Top-k Filtering</strong>: ဖြစ်နိုင်ခြေအများဆုံး tokens k ခုအထိ ရွေးချယ်မှုကို ကန့်သတ်ပါတယ်။</li></ol> <p data-svelte-h="svelte-1a8bfw8">ဒီ parameters တွေကို ဘယ်လို configure လုပ်ရမလဲဆိုတာကတော့…</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select"><hfoption value="tgi" label="TGI"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->client.generate( | |
| <span class="hljs-string">"Write a creative story"</span>, | |
| temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># ပိုမိုဖန်တီးမှုရှိရန် ပိုမိုမြင့်မားသော တန်ဖိုး</span> | |
| top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># 95% ဖြစ်နိုင်ခြေအများဆုံး tokens များကို ထည့်သွင်းစဉ်းစားပါ။</span> | |
| top_k=<span class="hljs-number">50</span>, <span class="hljs-comment"># ထိပ်ဆုံး 50 tokens များကို ထည့်သွင်းစဉ်းစားပါ။</span> | |
| max_new_tokens=<span class="hljs-number">100</span>, <span class="hljs-comment"># အများဆုံး အရှည်</span> | |
| repetition_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># ထပ်ခါတလဲလဲ မဖြစ်အောင် လျှော့ချပါ။</span> | |
| )<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="llama.cpp" label="llama.cpp"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># OpenAI API compatibility မှတစ်ဆင့်</span> | |
| response = client.completions.create( | |
| model=<span class="hljs-string">"smollm2-1.7b-instruct"</span>, <span class="hljs-comment"># Model name (llama.cpp server အတွက် မည်သည့် string မဆို ဖြစ်နိုင်သည်)</span> | |
| prompt=<span class="hljs-string">"ဖန်တီးမှုရှိတဲ့ ပုံပြင်တစ်ပုဒ် ရေးပေးပါ။"</span>, | |
| temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># ပိုမိုဖန်တီးမှုရှိရန် ပိုမိုမြင့်မားသော တန်ဖိုး</span> | |
| top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># 95% ဖြစ်နိုင်ခြေအများဆုံး tokens များကို ထည့်သွင်းစဉ်းစားပါ။</span> | |
| frequency_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># မကြာခဏ ပေါ်လာသော tokens များကို ထပ်ခါတလဲလဲ မဖြစ်အောင် လျှော့ချပါ။</span> | |
| presence_penalty=<span class="hljs-number">0.1</span>, <span class="hljs-comment"># ရှိပြီးသား tokens များကို ထပ်ခါတလဲလဲ မဖြစ်အောင် လျှော့ချပါ။</span> | |
| max_tokens=<span class="hljs-number">100</span>, <span class="hljs-comment"># အများဆုံး အရှည်</span> | |
| ) | |
| <span class="hljs-comment"># llama-cpp-python တိုက်ရိုက် access မှတစ်ဆင့်</span> | |
| output = llm( | |
| <span class="hljs-string">"Write a creative story"</span>, | |
| temperature=<span class="hljs-number">0.8</span>, | |
| top_p=<span class="hljs-number">0.95</span>, | |
| top_k=<span class="hljs-number">50</span>, | |
| max_tokens=<span class="hljs-number">100</span>, | |
| repeat_penalty=<span class="hljs-number">1.1</span>, | |
| )<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="vllm" label="vLLM"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->params = SamplingParams( | |
| temperature=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># ပိုမိုဖန်တီးမှုရှိရန် ပိုမိုမြင့်မားသော တန်ဖိုး</span> | |
| top_p=<span class="hljs-number">0.95</span>, <span class="hljs-comment"># 95% ဖြစ်နိုင်ခြေအများဆုံး tokens များကို ထည့်သွင်းစဉ်းစားပါ။</span> | |
| top_k=<span class="hljs-number">50</span>, <span class="hljs-comment"># ထိပ်ဆုံး 50 tokens များကို ထည့်သွင်းစဉ်းစားပါ။</span> | |
| max_tokens=<span class="hljs-number">100</span>, <span class="hljs-comment"># အများဆုံး အရှည်</span> | |
| presence_penalty=<span class="hljs-number">0.1</span>, <span class="hljs-comment"># ထပ်ခါတလဲလဲ မဖြစ်အောင် လျှော့ချပါ။</span> | |
| ) | |
| llm.generate(<span class="hljs-string">"Write a creative story"</span>, sampling_params=params)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption></div> <h3 class="relative group"><a id="controlling-repetition" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#controlling-repetition"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ထပ်ခါတလဲလဲ ဖြစ်မှုကို ထိန်းချုပ်ခြင်း</span></h3> <p data-svelte-h="svelte-1wbj1k1">frameworks နှစ်ခုလုံးက ထပ်ခါတလဲလဲ text generation ကို ကာကွယ်ဖို့ နည်းလမ်းတွေ ပံ့ပိုးပေးပါတယ်။</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select"><hfoption value="tgi" label="TGI"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->client.generate( | |
| <span class="hljs-string">"Write a varied text"</span>, | |
| repetition_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># ထပ်ခါတလဲလဲ ဖြစ်သော tokens များကို ဒဏ်ခတ်ပါ။</span> | |
| no_repeat_ngram_size=<span class="hljs-number">3</span>, <span class="hljs-comment"># 3-gram ထပ်ခါတလဲလဲ ဖြစ်မှုကို ကာကွယ်ပါ။</span> | |
| )<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="llama.cpp" label="llama.cpp"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># OpenAI API မှတစ်ဆင့်</span> | |
| response = client.completions.create( | |
| model=<span class="hljs-string">"smollm2-1.7b-instruct"</span>, | |
| prompt=<span class="hljs-string">"Write a varied text"</span>, | |
| frequency_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># မကြာခဏ ပေါ်လာသော tokens များကို ဒဏ်ခတ်ပါ။</span> | |
| presence_penalty=<span class="hljs-number">0.8</span>, <span class="hljs-comment"># ရှိပြီးသား tokens များကို ဒဏ်ခတ်ပါ။</span> | |
| ) | |
| <span class="hljs-comment"># တိုက်ရိုက် library မှတစ်ဆင့်</span> | |
| output = llm( | |
| <span class="hljs-string">"Write a varied text"</span>, | |
| repeat_penalty=<span class="hljs-number">1.1</span>, <span class="hljs-comment"># Penalize repeated tokens</span> | |
| frequency_penalty=<span class="hljs-number">0.5</span>, <span class="hljs-comment"># အပို frequency penalty</span> | |
| presence_penalty=<span class="hljs-number">0.5</span>, <span class="hljs-comment"># အပို presence penalty</span> | |
| )<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="vllm" label="vLLM"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->params = SamplingParams( | |
| presence_penalty=<span class="hljs-number">0.1</span>, <span class="hljs-comment"># Token ရှိခြင်းအတွက် ဒဏ်ခတ်ပါ။</span> | |
| frequency_penalty=<span class="hljs-number">0.1</span>, <span class="hljs-comment"># Token မကြာခဏ ပေါ်လာခြင်းအတွက် ဒဏ်ခတ်ပါ။</span> | |
| )<!-- HTML_TAG_END --></pre></div> | |
| </hfoption></div> <h3 class="relative group"><a id="length-control-and-stop-sequences" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#length-control-and-stop-sequences"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>အရှည် ထိန်းချုပ်ခြင်းနှင့် ရပ်တန့်ခြင်း Sequences များ</span></h3> <p data-svelte-h="svelte-l475cx">generation length ကို ထိန်းချုပ်နိုင်ပြီး ဘယ်အချိန်မှာ ရပ်တန့်ရမယ်ဆိုတာ သတ်မှတ်နိုင်ပါတယ်။</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select"><hfoption value="tgi" label="TGI"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->client.generate( | |
| <span class="hljs-string">"Generate a short paragraph"</span>, | |
| max_new_tokens=<span class="hljs-number">100</span>, | |
| min_new_tokens=<span class="hljs-number">10</span>, | |
| stop_sequences=[<span class="hljs-string">"\n\n"</span>, <span class="hljs-string">"###"</span>], | |
| )<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="llama.cpp" label="llama.cpp"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># OpenAI API မှတစ်ဆင့်</span> | |
| response = client.completions.create( | |
| model=<span class="hljs-string">"smollm2-1.7b-instruct"</span>, | |
| prompt=<span class="hljs-string">"Generate a short paragraph"</span>, | |
| max_tokens=<span class="hljs-number">100</span>, | |
| stop=[<span class="hljs-string">"\n\n"</span>, <span class="hljs-string">"###"</span>], | |
| ) | |
| <span class="hljs-comment"># တိုက်ရိုက် library မှတစ်ဆင့်</span> | |
| output = llm(<span class="hljs-string">"Generate a short paragraph"</span>, max_tokens=<span class="hljs-number">100</span>, stop=[<span class="hljs-string">"\n\n"</span>, <span class="hljs-string">"###"</span>])<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="vllm" label="vLLM"> | |
| <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->params = SamplingParams( | |
| max_tokens=<span class="hljs-number">100</span>, | |
| min_tokens=<span class="hljs-number">10</span>, | |
| stop=[<span class="hljs-string">"###"</span>, <span class="hljs-string">"\n\n"</span>], | |
| ignore_eos=<span class="hljs-literal">False</span>, | |
| skip_special_tokens=<span class="hljs-literal">True</span>, | |
| )<!-- HTML_TAG_END --></pre></div> | |
| </hfoption></div> <h2 class="relative group"><a id="memory-management" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#memory-management"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Memory Management</span></h2> <p data-svelte-h="svelte-m5e0wm">frameworks နှစ်ခုလုံးက ထိရောက်တဲ့ inference အတွက် အဆင့်မြင့် memory management နည်းစနစ်တွေကို implement လုပ်ထားပါတယ်။</p> <div class="flex space-x-2 items-center my-1.5 mr-8 h-7 !pl-0 -mx-3 md:mx-0"></div> <div class="language-select"><hfoption value="tgi" label="TGI"> | |
| <p data-svelte-h="svelte-6ldqyv">TGI က Flash Attention 2 နဲ့ continuous batching ကို အသုံးပြုပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Memory optimization ပါဝင်တဲ့ Docker deployment</span> | |
| docker run --gpus all -p 8080:80 \ | |
| --shm-size 1g \ | |
| ghcr.io/huggingface/text-generation-inference:latest \ | |
| --model-id HuggingFaceTB/SmolLM2-1.7B-Instruct \ | |
| --max-batch-total-tokens 8192 \ | |
| --max-input-length 4096<!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="llama.cpp" label="llama.cpp"> | |
| <p data-svelte-h="svelte-bj0aul">llama.cpp က quantization နဲ့ optimized memory layout ကို အသုံးပြုပါတယ်-</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Memory optimizations ပါဝင်တဲ့ Server</span> | |
| ./server \ | |
| -m smollm2-1.7b-instruct.Q4_K_M.gguf \ | |
| --host 0.0.0.0 \ | |
| --port 8080 \ | |
| -c 2048 \ <span class="hljs-comment"># Context size</span> | |
| --threads 4 \ <span class="hljs-comment"># CPU threads</span> | |
| --n-gpu-layers 32 \ <span class="hljs-comment"># ပိုကြီးတဲ့ models တွေအတွက် GPU layers များများ အသုံးပြုပါ</span> | |
| --mlock \ <span class="hljs-comment"># Swapping မဖြစ်စေရန် memory ကို lock လုပ်ပါ</span> | |
| --cont-batching <span class="hljs-comment"># Continuous batching ကို ဖွင့်ပါ။</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1jz5arz">သင့် GPU အတွက် အရမ်းကြီးတဲ့ models တွေအတွက် CPU offloading ကို အသုံးပြုနိုင်ပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->./server \ | |
| -m smollm2-1.7b-instruct.Q4_K_M.gguf \ | |
| --n-gpu-layers 20 \ <span class="hljs-comment"># ပထမဆုံး 20 layers ကို GPU မှာ ထားပါ</span> | |
| --threads 8 <span class="hljs-comment"># CPU layers တွေအတွက် CPU threads များများ အသုံးပြုပါ</span><!-- HTML_TAG_END --></pre></div> | |
| </hfoption> | |
| <hfoption value="vllm" label="vLLM"> | |
| <p data-svelte-h="svelte-1gl74a4">vLLM က အကောင်းဆုံး memory management အတွက် PagedAttention ကို အသုံးပြုပါတယ်။</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> vllm.engine.arg_utils <span class="hljs-keyword">import</span> AsyncEngineArgs | |
| engine_args = AsyncEngineArgs( | |
| model=<span class="hljs-string">"HuggingFaceTB/SmolLM2-1.7B-Instruct"</span>, | |
| gpu_memory_utilization=<span class="hljs-number">0.85</span>, | |
| max_num_batched_tokens=<span class="hljs-number">8192</span>, | |
| block_size=<span class="hljs-number">16</span>, | |
| ) | |
| llm = LLM(engine_args=engine_args)<!-- HTML_TAG_END --></pre></div> | |
| </hfoption></div> <h2 class="relative group"><a id="resources" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#resources"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>အရင်းအမြစ်များ</span></h2> <ul data-svelte-h="svelte-15h1dzu"><li><a href="https://huggingface.co/docs/text-generation-inference" rel="nofollow">Text Generation Inference Documentation</a></li> <li><a href="https://github.com/huggingface/text-generation-inference" rel="nofollow">TGI GitHub Repository</a></li> <li><a href="https://vllm.readthedocs.io/" rel="nofollow">vLLM Documentation</a></li> <li><a href="https://github.com/vllm-project/vllm" rel="nofollow">vLLM GitHub Repository</a></li> <li><a href="https://arxiv.org/abs/2309.06180" rel="nofollow">PagedAttention Paper</a></li> <li><a href="https://github.com/ggerganov/llama.cpp" rel="nofollow">llama.cpp GitHub Repository</a></li> <li><a href="https://github.com/abetlen/llama-cpp-python" rel="nofollow">llama-cpp-python Repository</a></li></ul> <h2 class="relative group"><a id="ဝဟရ-ရငလငခက-glossary" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#ဝဟရ-ရငလငခက-glossary"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>ဝေါဟာရ ရှင်းလင်းချက် (Glossary)</span></h2> <ul data-svelte-h="svelte-1sn3sib"><li><strong>Optimized Inference Deployment</strong>: AI မော်ဒယ်များကို အသုံးပြုသူများထံသို့ ထိရောက်စွာနှင့် လျင်မြန်စွာ ဝန်ဆောင်မှုပေးနိုင်ရန် အကောင်းဆုံးဖြစ်အောင် ပြုလုပ်ထားသော လုပ်ငန်းစဉ်။</li> <li><strong>LLM (Large Language Model)</strong>: လူသားဘာသာစကားကို နားလည်ပြီး ထုတ်လုပ်ပေးနိုင်တဲ့ အလွန်ကြီးမားတဲ့ Artificial Intelligence (AI) မော်ဒယ်တွေ ဖြစ်ပါတယ်။</li> <li><strong>Text Generation Inference (TGI)</strong>: Hugging Face မှ LLM များအတွက် မြန်နှုန်းမြင့် text generation ကို အထူးပြုထားသော framework တစ်ခု။</li> <li><strong>vLLM</strong>: မြန်နှုန်းမြင့် LLM inference အတွက် ဒီဇိုင်းထုတ်ထားသော library တစ်ခုဖြစ်ပြီး PagedAttention ကို အသုံးပြုသည်။</li> <li><strong>llama.cpp</strong>: LLaMA models များကို consumer hardware ပေါ်တွင် run နိုင်ရန် အဓိကထားသော C/C++ implementation တစ်ခု။</li> <li><strong>Production Environments</strong>: ဆော့ဖ်ဝဲလ် သို့မဟုတ် မော်ဒယ်များကို အစစ်အမှန် အသုံးပြုသူများထံသို့ ဝန်ဆောင်မှုပေးသည့် ပတ်ဝန်းကျင်။</li> <li><strong>Inference Efficiency</strong>: AI မော်ဒယ်တစ်ခုက input data မှ output ကို ထုတ်လုပ်ရာတွင် အချိန်နှင့် အရင်းအမြစ်များကို မည်မျှ ထိရောက်စွာ အသုံးပြုနိုင်မှု။</li> <li><strong>Framework Selection Guide</strong>: မတူညီသော အသုံးပြုမှုပုံစံများအတွက် သင့်လျော်သော framework ကို ရွေးချယ်ရန် လမ်းညွှန်။</li> <li><strong>Memory Management</strong>: ကွန်ပျူတာ၏ memory ကို ထိထိရောက်ရောက် စီမံခန့်ခွဲခြင်း။</li> <li><strong>Performance</strong>: စနစ်တစ်ခု၏ အလုပ်လုပ်နိုင်စွမ်း သို့မဟုတ် အရှိန်အဟုန်။</li> <li><strong>Flash Attention 2</strong>: Transformer models များတွင် attention mechanism ကို memory bandwidth bottlenecks များကို ဖြေရှင်းပေးခြင်းဖြင့် optimization လုပ်သော နည်းပညာ။</li> <li><strong>Continuous Batching</strong>: GPU ကို အလုပ်များနေအောင် ထိန်းထားနိုင်ရန် requests များကို အဆက်မပြတ် batch လုပ်ပြီး ပေးပို့သော နည်းလမ်း။</li> <li><strong>GPU (Graphics Processing Unit)</strong>: AI/ML လုပ်ငန်းများတွင် အရှိန်မြှင့်ရန် အသုံးပြုသော processor။</li> <li><strong>CPU (Central Processing Unit)</strong>: ကွန်ပျူတာ၏ အဓိက processor။</li> <li><strong>VRAM (Video RAM)</strong>: GPU တွင် အသုံးပြုသော RAM အမျိုးအစား။</li> <li><strong>PagedAttention</strong>: LLM inference တွင် KV cache memory management ကို optimization လုပ်သော နည်းပညာ။</li> <li><strong>KV Cache</strong>: Text generation လုပ်နေစဉ်အတွင်း Transformer model က သိမ်းဆည်းထားသော attention keys နှင့် values များ။</li> <li><strong>Memory Paging</strong>: Memory ကို fixed-size “pages” များအဖြစ် ပိုင်းခြားခြင်း။</li> <li><strong>Memory Fragmentation</strong>: Memory ကို ထိရောက်စွာ အသုံးမပြုနိုင်ဘဲ အပိုင်းအစများအဖြစ် ပြန့်ကျဲနေခြင်း။</li> <li><strong>Throughput</strong>: အချိန်ယူနစ်တစ်ခုအတွင်း စနစ်တစ်ခုက လုပ်ဆောင်နိုင်သော လုပ်ငန်းပမာဏ။</li> <li><strong>Quantization</strong>: Model weights တွေရဲ့ precision ကို လျှော့ချခြင်းဖြင့် model size နဲ့ memory requirements တွေကို လျှော့ချသော နည်းလမ်း။</li> <li><strong>INT8 (8-bit integers)</strong>: 8-bit integers ဖြင့် ကိုယ်စားပြုသော ကိန်းဂဏန်းများ။</li> <li><strong>GGML/GGUF Format</strong>: llama.cpp မှ quantized inference အတွက် အကောင်းဆုံးဖြစ်အောင် ပြုလုပ်ထားသော custom tensor formats များ။</li> <li><strong>Mixed Precision</strong>: Model ၏ မတူညီသော အစိတ်အပိုင်းများတွင် မတူညီသော quantization levels များကို အသုံးပြုခြင်း။</li> <li><strong>CPU Architectures</strong>: CPU အမျိုးအစားများ (ဥပမာ - AVX2, AVX-512, NEON)။</li> <li><strong>Local Deployments</strong>: မော်ဒယ်များကို သုံးစွဲသူ၏ ကွန်ပျူတာ သို့မဟုတ် local server ပေါ်တွင် တပ်ဆင်အသုံးပြုခြင်း။</li> <li><strong>Edge Devices</strong>: ကွန်ပျူတာကွန်ရက်၏ အစွန်းပိုင်း (ဥပမာ - mobile devices, IoT devices) တွင် အလုပ်လုပ်သော devices များ။</li> <li><strong>Enterprise-level Deployment</strong>: လုပ်ငန်းကြီးများအတွက် ဒီဇိုင်းထုတ်ထားသော deployment ပုံစံ။</li> <li><strong>Kubernetes Support</strong>: Containerized application များကို automate လုပ်ပြီး deploy, scale လုပ်ရန်အတွက် Kubernetes platform ကို ထောက်ပံ့ခြင်း။</li> <li><strong>Prometheus</strong>: Monitoring system တစ်ခု။</li> <li><strong>Grafana</strong>: Data visualization tool တစ်ခု။</li> <li><strong>Automatic Scaling</strong>: requests များ၏ ပမာဏအပေါ် မူတည်ပြီး resources များကို အလိုအလျောက် ချိန်ညှိခြင်း။</li> <li><strong>Content Filtering</strong>: မသင့်လျော်သော သို့မဟုတ် အန္တရာယ်ရှိသော အကြောင်းအရာများကို စစ်ထုတ်ခြင်း။</li> <li><strong>Rate Limiting</strong>: အချိန်အတိုင်းအတာတစ်ခုအတွင်း requests အရေအတွက်ကို ကန့်သတ်ခြင်း။</li> <li><strong>Developer-friendly Approach</strong>: Developers များအတွက် အသုံးပြုရလွယ်ကူသော ချဉ်းကပ်မှု။</li> <li><strong>OpenAI API Compatibility</strong>: OpenAI ၏ API နှင့် တွဲဖက်အသုံးပြုနိုင်ခြင်း။</li> <li><strong>Ray</strong>: Distributed computing အတွက် Python framework တစ်ခု။</li> <li><strong>Portability</strong>: ဆော့ဖ်ဝဲလ်တစ်ခုကို မတူညီသော platform များ သို့မဟုတ် environments များသို့ အလွယ်တကူ ရွှေ့ပြောင်းအသုံးပြုနိုင်ခြင်း။</li> <li><strong><code>docker run --gpus all</code></strong>: Docker container ကို GPU အားလုံးကို အသုံးပြုပြီး run ရန် command။</li> <li><strong><code>--shm-size 1g</code></strong>: Shared memory size ကို 1GB အဖြစ် သတ်မှတ်ခြင်း။</li> <li><strong><code>InferenceClient</code></strong>: Hugging Face Hub မှ inference endpoint များနှင့် အပြန်အလှန်ဆက်သွယ်ရန် Python client။</li> <li><strong><code>openai</code></strong>: OpenAI API ကို အသုံးပြုရန်အတွက် Python client library။</li> <li><strong><code>git clone</code></strong>: Git repository ကို download လုပ်ရန် command။</li> <li><strong><code>make</code></strong>: Source code ကို executable file အဖြစ် build လုပ်ရန် command။</li> <li><strong><code>curl -L -O</code></strong>: URL မှ file တစ်ခုကို download လုပ်ရန် command။</li> <li><strong><code>--host</code>, <code>--port</code></strong>: Server ကို listen လုပ်မည့် host address နှင့် port နံပါတ်။</li> <li><strong><code>--n-gpu-layers</code></strong>: GPU တွင် ထားရှိမည့် model layers အရေအတွက်။</li> <li><strong>Context Size (<code>-c</code>)</strong>: Model က တစ်ကြိမ်တည်း လုပ်ဆောင်နိုင်သော tokens အရေအတွက် အများဆုံး။</li> <li><strong>CPU Threads (<code>--threads</code>)</strong>: CPU တွင် အသုံးပြုမည့် threads အရေအတွက်။</li> <li><strong>Batch Size (<code>--batch-size</code>)</strong>: Prompt evaluation အတွက် batch အရွယ်အစား။</li> <li><strong><code>llama_cpp</code></strong>: llama.cpp C++ library အတွက် Python bindings။</li> <li><strong><code>Llama</code> Class</strong>: llama-cpp-python library မှ LLaMA model ကို load လုပ်ရန် class။</li> <li><strong><code>n_ctx</code></strong>: Model ၏ context window size။</li> <li><strong><code>n_threads</code></strong>: CPU threads အရေအတွက်။</li> <li><strong><code>n_gpu_layers</code></strong>: GPU ပေါ်တွင် ထားရှိမည့် layers အရေအတွက်။</li> <li><strong><code>SamplingParams</code></strong>: vLLM တွင် text generation အတွက် sampling parameters များကို သတ်မှတ်ရန် class။</li> <li><strong>Temperature</strong>: generated text ၏ randomness သို့မဟုတ် creativity ကို ထိန်းချုပ်သော parameter။</li> <li><strong>Top-p (Nucleus) Sampling</strong>: ဖြစ်နိုင်ခြေအများဆုံး tokens အချို့ကို ရွေးချယ်ပြီး ၎င်းတို့၏ စုစုပေါင်း ဖြစ်နိုင်ခြေသည် သတ်မှတ်ထားသော တန်ဖိုး (ဥပမာ - 0.95) ထက် မကျော်လွန်စေရ။</li> <li><strong>Top-k Filtering</strong>: ဖြစ်နိုင်ခြေအများဆုံး tokens <code>k</code> ခုကိုသာ ရွေးချယ်ပြီး ကျန်များကို လျစ်လျူရှုသည်။</li> <li><strong><code>max_new_tokens</code> / <code>max_tokens</code></strong>: Generate လုပ်မည့် tokens အရေအတွက် အများဆုံး။</li> <li><strong><code>repetition_penalty</code></strong>: ထပ်ခါတလဲလဲ ဖြစ်သော tokens များကို ဒဏ်ခတ်ရန် parameter။</li> <li><strong><code>do_sample</code></strong>: True ဖြစ်ပါက sampling ကို အသုံးပြုပြီး၊ False ဖြစ်ပါက greedy decoding ကို အသုံးပြုသည်။</li> <li><strong><code>frequency_penalty</code></strong>: မကြာခဏ ပေါ်လာသော tokens များကို ထပ်ခါတလဲလဲ မဖြစ်အောင် လျှော့ချရန် parameter။</li> <li><strong><code>presence_penalty</code></strong>: ရှိပြီးသား tokens များကို ထပ်ခါတလဲလဲ မဖြစ်အောင် လျှော့ချရန် parameter။</li> <li><strong><code>min_new_tokens</code> / <code>min_tokens</code></strong>: Generate လုပ်မည့် tokens အရေအတွက် အနည်းဆုံး။</li> <li><strong><code>stop_sequences</code></strong>: Generated text ကို ရပ်တန့်ရန်အတွက် သတ်မှတ်ထားသော sequence များ။</li> <li><strong><code>ignore_eos</code></strong>: End-of-sequence token ကို လျစ်လျူရှုရန်။</li> <li><strong><code>skip_special_tokens</code></strong>: Generated text မှ special tokens များကို ဖယ်ရှားရန်။</li> <li><strong>CPU Offloading</strong>: Model ၏ အစိတ်အပိုင်းအချို့ကို GPU မှ CPU သို့ ရွှေ့ပြောင်းပြီး လုပ်ဆောင်ခြင်း။</li> <li><strong><code>--mlock</code></strong>: Memory ကို lock လုပ်ပြီး swapping မဖြစ်စေရန် ကာကွယ်ခြင်း။</li></ul> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/my/chapter2/8.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { | |
| __sveltekit_dep9rk = { | |
| assets: "/docs/course/pr_1107/my", | |
| base: "/docs/course/pr_1107/my", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/course/pr_1107/my/_app/immutable/entry/start.5c6233a8.js"), | |
| import("/docs/course/pr_1107/my/_app/immutable/entry/app.55586789.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 21], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 145 kB
- Xet hash:
- def1b9ac955cc19a50371444f65fddfc7fe768e5d4ddc926fafa216ffc07b614
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.