import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# System instruction prepended to every prompt (Qwen ChatML format).
SYSTEM_PROMPT = "You are an expert coding assistant. Write clean, professional code."

# 1. Download the coding model (Qwen2.5-Coder 7B).
# The GGUF Q4_K_M quantization fits in free-tier RAM and runs fast on CPU.
print("Sedang mendownload model... Mohon tunggu.")
model_path = hf_hub_download(
    repo_id="Qwen/Qwen2.5-Coder-7B-Instruct-GGUF",
    filename="qwen2.5-coder-7b-instruct-q4_k_m.gguf",
)

# 2. Load the model into memory (CPU inference).
llm = Llama(
    model_path=model_path,
    n_ctx=8192,    # large context window for long code
    n_threads=2,   # use 2 CPU cores
    verbose=False,
)


def _build_prompt(message, history):
    """Build the Qwen ChatML prompt string from the new message and history.

    Accepts history either as (user, assistant) pairs (Gradio's legacy
    "tuples" format) or as {"role": ..., "content": ...} dicts (the
    "messages" format newer Gradio versions pass), so the app keeps
    working across Gradio versions.
    """
    parts = [f"<|im_start|>system\n{SYSTEM_PROMPT}<|im_end|>\n"]
    for turn in history:
        if isinstance(turn, dict):
            # "messages" format: one dict per role/turn.
            parts.append(f"<|im_start|>{turn['role']}\n{turn['content']}<|im_end|>\n")
        else:
            # "tuples" format: (user_input, ai_output) pair per exchange.
            user_input, ai_output = turn
            parts.append(
                f"<|im_start|>user\n{user_input}<|im_end|>\n"
                f"<|im_start|>assistant\n{ai_output}<|im_end|>\n"
            )
    parts.append(f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n")
    return "".join(parts)


# 3. Chat callback for gr.ChatInterface.
def chat_response(message, history):
    """Yield the growing assistant reply (streaming keeps the UI responsive)."""
    output_stream = llm(
        _build_prompt(message, history),
        max_tokens=2048,
        stop=["<|im_end|>"],   # stop at the ChatML end-of-turn marker
        stream=True,
    )
    partial_message = ""
    for chunk in output_stream:
        partial_message += chunk["choices"][0]["text"]
        # Yield the accumulated text so Gradio renders tokens as they arrive.
        yield partial_message


# 4. Web UI (mobile friendly); the CSS hides the Gradio footer.
custom_css = """
footer {visibility: hidden}
"""

demo = gr.ChatInterface(
    fn=chat_response,
    title="🤖 Qwen Android Coder",
    description="Asisten Koding Profesional untuk Android. Copy kode & jalankan di Termux/Acode.",
    theme=gr.themes.Soft(),
    css=custom_css,
)

if __name__ == "__main__":
    demo.launch()