import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from peft import PeftModel
from threading import Thread

BASE_MODEL = "Qwen/Qwen3-0.6B"
ADAPTER_ID = "Redhanuman/Shadow-0.7B"

# Load the base model and attach the LoRA adapter on top of it.
print("🌑 Loading Shadow Brain...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto",
)
model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
model.eval()


def predict(message, history):
    # System prompt steering the model toward Qwen3-style <think> reasoning.
    system_prompt = (
        "You are Shadow 0.7B, a reasoning AI created by Aman Kumar Pandey. "
        "Use <think> tags to plan logic before answering."
    )

    # Rebuild the conversation; ChatInterface's default (tuple-style) history
    # yields [user_msg, bot_msg] pairs.
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, bot_msg in history:
        messages.append({"role": "user", "content": user_msg})
        messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)

    # Stream tokens back to the UI as they are generated.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,  # required for temperature/top_p to take effect
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
    )

    # generate() blocks, so run it in a background thread
    # and consume the streamer on this one.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial_message = ""
    for new_token in streamer:
        partial_message += new_token
        yield partial_message


# Create the Gradio interface - minimal parameters for compatibility
demo = gr.ChatInterface(
    fn=predict,
    examples=[
        ["Write a Python function to check for palindromes."],
        ["If I have 3 apples and eat one, how many do I have?"],
    ],
)

if __name__ == "__main__":
    demo.queue().launch()