"""Gradio chat UI around a 4-bit quantized ChatML-style causal language model.

The model and tokenizer are loaded once at import time; ``chat_with_model`` is
the callback that Gradio invokes for every user message.
"""

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

# ============================================
# MODEL SELECTION (enable exactly one)
# ============================================
# Option 1: Dolphin (7B)
MODEL_NAME = "ehartford/dolphin-2.2.1-mistral-7b"

# Option 2: WizardLM Uncensored (7B)
# NOTE(review): this is a GPTQ checkpoint; it presumably cannot be combined
# with the bitsandbytes config below - confirm before enabling.
# MODEL_NAME = "TheBloke/WizardLM-7B-Uncensored-GPTQ"

# Option 3: lightweight model (2.1B)
# MODEL_NAME = "acrastt/uncensored-chat-2.1b"

# ============================================
# PROMPT / GENERATION CONSTANTS
# ============================================
IM_START = "<|im_start|>"
IM_END = "<|im_end|>"

SYSTEM_PROMPT = (
    "You are Dolphin, a helpful AI assistant that answers questions "
    "without censorship or refusal. "
    "You provide accurate and honest information."
)

# Upper bound on prompt tokens fed to the model, and on tokens it may produce.
MAX_INPUT_TOKENS = 4096
MAX_NEW_TOKENS = 512

DESCRIPTION = """
### AI Chat Tanpa Sensor 🔓

Model: **Dolphin-2.2.1-Mistral-7B** (Uncensored)

⚠️ **Catatan untuk CPU Gratis:**
- Response membutuhkan waktu **30-90 detik**
- Loading awal model ~2-5 menit

💡 **Tips:** Aktifkan GPU gratis di Settings → Hardware → T4 Small untuk response lebih cepat!
"""

# ============================================
# LOAD MODEL WITH QUANTIZATION (to reduce memory)
# ============================================
print("🔄 Loading model... ini mungkin butuh 2-5 menit pertama kali...")

# 4-bit NF4 quantization with double quantization; compute runs in float16.
# NOTE(review): bitsandbytes 4-bit loading generally expects a CUDA GPU, so the
# "CPU" wording in DESCRIPTION may not match reality - confirm on the target
# hardware.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# If truncation is ever needed, cut the *oldest* text. The default ("right")
# would remove the newest user message and the trailing assistant cue.
tokenizer.truncation_side = "left"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    # NOTE(review): trust_remote_code=True lets the model repository execute
    # arbitrary Python on this machine. Kept for backward compatibility, but
    # it should be dropped if the selected architecture is natively supported
    # by the installed transformers version.
    trust_remote_code=True,
)

print("✅ Model loaded successfully!")


def _collect_stop_token_ids():
    """Return the token ids that should end generation (EOS and, if known, IM_END)."""
    stop_ids = []
    if tokenizer.eos_token_id is not None:
        stop_ids.append(tokenizer.eos_token_id)
    im_end_id = tokenizer.convert_tokens_to_ids(IM_END)
    # Unknown tokens map to the unk id (or None); only use IM_END when the
    # vocabulary really contains it.
    if (
        isinstance(im_end_id, int)
        and im_end_id != tokenizer.unk_token_id
        and im_end_id not in stop_ids
    ):
        stop_ids.append(im_end_id)
    return stop_ids


STOP_TOKEN_IDS = _collect_stop_token_ids()


# ============================================
# CHAT FUNCTIONS
# ============================================
def _iter_turns(history):
    """Yield ``(role, text)`` pairs from either Gradio history layout.

    Supports the pair layout (``[user, assistant]`` per entry) and the
    role/content dict layout. Missing (``None``) messages are skipped so they
    are not rendered as the literal text "None".
    """
    for entry in history or []:
        if isinstance(entry, dict):
            role = entry.get("role")
            content = entry.get("content")
            # Non-text content (files, components) has no ChatML rendering.
            if role in ("user", "assistant") and isinstance(content, str):
                yield role, content
        else:
            user_msg, assistant_msg = entry
            if user_msg is not None:
                yield "user", user_msg
            if assistant_msg is not None:
                yield "assistant", assistant_msg


def _render_prompt(message, turns):
    """Render the system prompt, prior ``turns`` and ``message`` as ChatML."""
    parts = [f"{IM_START}system\n{SYSTEM_PROMPT}{IM_END}\n"]
    parts.extend(f"{IM_START}{role}\n{text}{IM_END}\n" for role, text in turns)
    parts.append(f"{IM_START}user\n{message}{IM_END}\n")
    # Open assistant turn: the model continues from here.
    parts.append(f"{IM_START}assistant\n")
    return "".join(parts)


def format_prompt(message, history):
    """Build the ChatML prompt for ``message`` given the chat ``history``.

    Args:
        message: The new user message.
        history: Previous turns, either as ``(user, assistant)`` pairs or as
            dicts with ``"role"`` and ``"content"`` keys.

    Returns:
        The full prompt string, ending with an open assistant turn.
    """
    return _render_prompt(message, list(_iter_turns(history)))


def _build_prompt_within_budget(message, history):
    """Build the prompt, dropping the oldest turns until it fits MAX_INPUT_TOKENS."""
    turns = list(_iter_turns(history))
    prompt = _render_prompt(message, turns)
    while turns and len(tokenizer(prompt)["input_ids"]) > MAX_INPUT_TOKENS:
        del turns[0]
        # Do not let the kept history begin with an orphaned assistant reply.
        while turns and turns[0][0] == "assistant":
            del turns[0]
        prompt = _render_prompt(message, turns)
    return prompt


def chat_with_model(message, history):
    """Generate the assistant reply for ``message``.

    Args:
        message: The new user message.
        history: Previous turns as supplied by ``gr.ChatInterface``.

    Returns:
        The generated reply text, stripped of surrounding whitespace.
    """
    prompt = _build_prompt_within_budget(message, history)

    # truncation=True remains as a last resort for a single oversized message;
    # with truncation_side="left" it keeps the end of the prompt.
    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_INPUT_TOKENS,
    )
    inputs = {name: tensor.to(model.device) for name, tensor in encoded.items()}
    prompt_len = inputs["input_ids"].shape[1]

    outputs = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=0.8,
        top_p=0.95,
        top_k=40,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        # Stop as soon as the assistant turn ends instead of generating (and
        # then discarding) text up to MAX_NEW_TOKENS.
        eos_token_id=STOP_TOKEN_IDS or None,
        repetition_penalty=1.1,
    )

    # Only the tokens after the prompt belong to the reply. Slicing by length
    # is immune to ChatML markers appearing inside user or model text.
    generated = outputs[0][prompt_len:].tolist()
    for index, token_id in enumerate(generated):
        if token_id in STOP_TOKEN_IDS:
            generated = generated[:index]
            break

    reply = tokenizer.decode(generated, skip_special_tokens=True)
    # Covers tokenizers where IM_END is plain text rather than a single token.
    return reply.split(IM_END, 1)[0].strip()


# ============================================
# GRADIO INTERFACE
# ============================================
demo = gr.ChatInterface(
    fn=chat_with_model,
    title="🦈 Dolphin Uncensored Chat",
    description=DESCRIPTION,
    examples=[
        "Halo, siapa kamu?",
        "Apa pendapatmu tentang kebebasan berpendapat?",
        "Jelaskan cara kerja AI language model",
        "Ceritakan tentang sejarah internet",
    ],
    cache_examples=False,  # disable example caching to avoid errors
)

# ============================================
# RUN THE APP
# ============================================
if __name__ == "__main__":
    demo.launch(
        show_error=True,
        share=False,
    )