Build llama.cpp with CUDA support:

```bash
# Clone the repository
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp

# Create the build directory
mkdir build
cd build

# Configure for CUDA 13.0 and the A100 (sm_80).
# -DGGML_CUDA=ON enables GPU acceleration.
cmake .. -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=80

# Build the binaries, using all CPU cores
cmake --build . --config Release -j $(nproc)
```

To reconfigure from scratch (e.g. after changing CMake flags), clear the build directory and rerun the steps above:

```bash
cd ~/llama.cpp/build
rm -rf *   # wipes all cached build artifacts; make sure you are inside the build directory
```

Serve the TranslateGemma 27B model, pinned to GPU 2 (`CUDA_DEVICE_ORDER=PCI_BUS_ID` keeps device numbering consistent with `nvidia-smi`), with all layers offloaded to the GPU (999 is simply more layers than the model has) and flash attention enabled:

```bash
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=2 ~/llama.cpp/build/bin/llama-server \
    -m /home/mshahidul/readctrl/models/translategemma-27b-it-Q8_0.gguf \
    --n-gpu-layers 999 \
    --flash-attn
```

Alternatively, serve Meta-Llama-3.1-70B-Instruct (Q4_K_M) on all interfaces at port 8080; `-ngl` and `-fa` are the short forms of `--n-gpu-layers` and `--flash-attn`:

```bash
~/llama.cpp/build/bin/llama-server \
    -m ~/models/Meta-Llama-3.1-70B-Instruct-Q4_K_M.gguf \
    -ngl 99 \
    -fa \
    --port 8080 \
    --host 0.0.0.0
```

With a server running, apply the correction script to the first 80 records (`--start 0 --end 80`) of each translated file, one run per language pair (en→zh, en→vi, en→hi):

```bash
python /home/mshahidul/readctrl/script/translate_correction_gpt5.py \
    --input "/home/mshahidul/readctrl/data/translated_data/translation_wo_judge/multiclinsum_gs_train_en2zh_gemma(0_200).json" \
    --start 0 --end 80

python /home/mshahidul/readctrl/script/translate_correction_gpt5.py \
    --input "/home/mshahidul/readctrl/data/translated_data/translation_wo_judge/multiclinsum_gs_train_en2vi_gemma(0_200).json" \
    --start 0 --end 80

python /home/mshahidul/readctrl/script/translate_correction_gpt5.py \
    --input "/home/mshahidul/readctrl/data/translated_data/translation_wo_judge/multiclinsum_gs_train_en2hi_gemma(0_200).json" \
    --start 0 --end 80
```
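As a sanity check on the configure step above, it can be worth confirming that the CUDA toolkit and the target GPU match what `-DCMAKE_CUDA_ARCHITECTURES=80` expects. A minimal sketch (not part of the original steps); `compute_cap` should report `8.0` on an A100:

```bash
# Confirm the installed CUDA toolkit version (13.0 here)
nvcc --version

# List GPUs with their compute capability; the A100 should show 8.0
nvidia-smi --query-gpu=index,name,compute_cap --format=csv
```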
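Before launching the correction runs, it is also worth confirming the server is actually answering. llama-server exposes a `/health` endpoint and an OpenAI-compatible `/v1/chat/completions` endpoint; the sketch below assumes the second launch above (host `0.0.0.0`, port 8080) and sends a throwaway prompt:

```bash
# Liveness check: returns {"status":"ok"} once the model has loaded
curl http://localhost:8080/health

# One-off request through the OpenAI-compatible chat endpoint
curl http://localhost:8080/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
          "messages": [{"role": "user", "content": "Say hello in Vietnamese."}],
          "max_tokens": 32
        }'
```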
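Finally, the three correction commands differ only in the target-language code, so they can be collapsed into a loop. An equivalent sketch, assuming the `en2<lang>` filename pattern holds for all three files:

```bash
# Run the correction pass for each language pair in turn
for lang in zh vi hi; do
    python /home/mshahidul/readctrl/script/translate_correction_gpt5.py \
        --input "/home/mshahidul/readctrl/data/translated_data/translation_wo_judge/multiclinsum_gs_train_en2${lang}_gemma(0_200).json" \
        --start 0 --end 80
done
```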