# shadow-0.7b / app.py
# Author: Redhanuman — "Update app.py" (commit 662a774, verified)
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from peft import PeftModel
from threading import Thread
# Base checkpoint and the LoRA adapter layered on top of it.
BASE_MODEL = "Qwen/Qwen3-0.6B"
ADAPTER_ID = "Redhanuman/Shadow-0.7B"
# Module-level load: runs once at Space startup (downloads weights on first run).
print("πŸŒ‘ Loading Shadow Brain...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
base_model = AutoModelForCausalLM.from_pretrained(
BASE_MODEL,
# fp16 on GPU for memory/speed; fp32 on CPU (fp16 CPU inference is poorly supported).
torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
# Let accelerate place the model on the available device(s).
device_map="auto"
)
# Wrap the base model with the fine-tuned LoRA adapter weights.
model = PeftModel.from_pretrained(base_model, ADAPTER_ID)
# Inference only: disable dropout etc.
model.eval()
def predict(message, history):
    """Stream a chat completion for *message* given the prior *history*.

    Generator used as the ``fn`` of ``gr.ChatInterface``: yields the growing
    partial response string as tokens arrive from the model.

    Args:
        message: The latest user message (str).
        history: Prior turns. Accepts both Gradio formats:
            ``type="tuples"`` — list of ``(user, assistant)`` pairs — and
            ``type="messages"`` — list of ``{"role", "content"}`` dicts.

    Yields:
        str: The accumulated assistant response so far.
    """
    system_prompt = (
        "You are Shadow 0.7B, a reasoning AI created by Aman Kumar Pandey. "
        "Use <think> tags to plan logic before answering."
    )
    messages = [{"role": "system", "content": system_prompt}]
    # Normalize history: newer Gradio passes dicts, older passes tuples.
    # The original tuple-only unpacking breaks on dict-style history.
    for turn in history:
        if isinstance(turn, dict):
            messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            user_msg, bot_msg = turn
            messages.append({"role": "user", "content": user_msg})
            if bot_msg is not None:  # last tuple may have no reply yet
                messages.append({"role": "assistant", "content": bot_msg})
    messages.append({"role": "user", "content": message})

    # return_dict=True also yields attention_mask, which generate() needs to
    # avoid the "attention mask is not set" warning / padding ambiguity.
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)

    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        **inputs,
        streamer=streamer,
        max_new_tokens=1024,
        do_sample=True,  # required for temperature/top_p to actually apply
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1,
    )
    # generate() blocks, so run it in a worker thread and consume the streamer here.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    partial_message = ""
    for new_token in streamer:
        partial_message += new_token
        yield partial_message
    thread.join()  # don't leave the generation thread dangling
# Prompts shown as clickable examples under the chat box.
_EXAMPLE_PROMPTS = [
    ["Write a Python function to check for palindromes."],
    ["If I have 3 apples and eat one, how many do I have?"],
]

# Chat UI: deliberately minimal configuration for cross-version compatibility.
demo = gr.ChatInterface(fn=predict, examples=_EXAMPLE_PROMPTS)

if __name__ == "__main__":
    # Queueing is required for generator-based (streaming) handlers.
    demo.queue().launch()