Spaces:
Sleeping
Sleeping
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import gradio as gr

# --------------------
# Model setup
# --------------------
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
LORA_REPO = "nitya001/autotrain-oa5ez-0dtoc"

device = "cuda" if torch.cuda.is_available() else "cpu"
# fp16 only on GPU; many CPU kernels are slow or unsupported in half precision.
dtype = torch.float16 if device == "cuda" else torch.float32

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=dtype,
    device_map="auto" if device == "cuda" else None,
)

print("Loading LoRA adapter:", LORA_REPO)
model = PeftModel.from_pretrained(base_model, LORA_REPO)
# BUG FIX: on CUDA the model was loaded with device_map="auto", so accelerate
# has already placed (and possibly sharded/offloaded) its weights; calling
# .to() on such a model is redundant and can raise. Move manually only when
# device_map was not used (the CPU path).
if device != "cuda":
    model.to(device)
model.eval()

# Generic system prompt prepended to every conversation.
SYSTEM_PROMPT = (
    "You are a helpful AI assistant. "
    "Answer clearly, accurately, and concisely. "
    "If you do not know something, say so honestly."
)
# --------------------
# Generation function
# --------------------
def generate_reply(message: str, history: list) -> str:
    """Generate one assistant reply in the TinyLlama chat format.

    Builds a <|system|>/<|user|>/<|assistant|> prompt from the prior
    turns plus the new user message, samples a continuation from the
    LoRA-adapted model, and returns the decoded text (or a fallback
    sentence if generation produced nothing usable).

    Args:
        message: The latest user message.
        history: Prior turns as {"role": ..., "content": ...} dicts.

    Returns:
        The assistant's reply as plain text.
    """
    # Assemble the prompt as segments, then join once at the end.
    segments = [f"<|system|>{SYSTEM_PROMPT}</s>\n"]
    for turn in history:
        turn_role = turn.get("role", "user")
        turn_text = turn.get("content", "")
        if turn_role == "user":
            segments.append(f"<|user|>{turn_text}</s>\n")
        elif turn_role == "assistant":
            segments.append(f"<|assistant|>{turn_text}</s>\n")
    segments.append(f"<|user|>{message}</s>\n<|assistant|>")
    prompt = "".join(segments)

    encoded = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=2048,
    ).to(device)

    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Drop the prompt tokens; decode only the newly generated tail.
    new_tokens = generated[0][encoded["input_ids"].shape[-1]:]
    reply = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
    return reply or "I'm not sure how to answer that. Could you rephrase?"
# --------------------
# Gradio UI
# --------------------
# BUG FIX: generate_reply parses history entries as {"role", "content"}
# dicts, so the ChatInterface must use the "messages" history format.
# Without type="messages", older Gradio versions default to (user, bot)
# tuple pairs, which would break msg.get(...) inside generate_reply.
demo = gr.ChatInterface(
    fn=generate_reply,
    type="messages",
    title="Custom AI Assistant",
    description="Ask anything.",
)

if __name__ == "__main__":
    demo.launch()