"""Gradio chat app serving TinyLlama-1.1B-Chat with a LoRA adapter.

Intended for Hugging Face Spaces: the model is loaded at import time and a
``gr.ChatInterface`` named ``demo`` is exposed for the Space runtime (or
launched directly when run locally).
"""

import torch
import gradio as gr
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# --------------------
# Model setup
# --------------------
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
LORA_REPO = "nitya001/autotrain-4n1y9-5ekvs"

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=dtype,
    device_map="auto" if device == "cuda" else None,
)

print("Loading LoRA adapter:", LORA_REPO)
model = PeftModel.from_pretrained(base_model, LORA_REPO)
# BUGFIX: when device_map="auto" is in effect, accelerate has already placed
# the weights; an unconditional .to(device) on top of that can move hooked /
# offloaded modules incorrectly. Only move the model when we loaded on CPU
# without a device map.
if device != "cuda":
    model.to(device)
model.eval()

SYSTEM_PROMPT = (
    "You are a helpful assistant fine-tuned for loan journeys and UTR queries. "
    "Answer clearly and concisely. If you don't know some specific account value, "
    "explain what information is needed instead of hallucinating numbers."
)

# --------------------
# Generation function
# --------------------


def _history_to_messages(history: list) -> list:
    """Normalize Gradio chat history into OpenAI-style message dicts.

    With ``type="messages"`` Gradio passes ``[{"role": ..., "content": ...}]``;
    older Gradio versions (tuple mode) pass ``[[user, bot], ...]`` pairs.
    Accept both so the app survives a Gradio version change.
    """
    messages = []
    for item in history or []:
        if isinstance(item, dict):
            role = item.get("role", "user")
            content = item.get("content", "")
            if role in ("user", "assistant") and content:
                messages.append({"role": role, "content": content})
        else:
            # Legacy (user_message, bot_message) pair; either side may be None.
            user_turn, bot_turn = item
            if user_turn:
                messages.append({"role": "user", "content": user_turn})
            if bot_turn:
                messages.append({"role": "assistant", "content": bot_turn})
    return messages


def generate_reply(message: str, history: list):
    """Generate one assistant reply for ``gr.ChatInterface``.

    Parameters
    ----------
    message : str
        The latest user message.
    history : list
        Prior turns; messages-format dicts (or legacy tuple pairs — see
        ``_history_to_messages``).

    Returns
    -------
    str
        The assistant's reply; ChatInterface handles rendering/state.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    # BUGFIX: use the tokenizer's built-in chat template instead of hand-built
    # "<|user|>..." tags. The manual format was missing the </s> turn
    # terminators TinyLlama-Chat's template emits, which degrades generation.
    conversation = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = tokenizer(
        conversation,
        return_tensors="pt",
        truncation=True,
        max_length=2048,
    ).to(device)

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens (strip the echoed prompt).
    generated_ids = output_ids[0][inputs["input_ids"].shape[-1]:]
    answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    if not answer:
        answer = "I'm not sure how to answer that. Could you rephrase your question?"

    return answer


# --------------------
# Gradio UI
# --------------------
demo = gr.ChatInterface(
    fn=generate_reply,
    # BUGFIX: generate_reply expects messages-format history dicts; without
    # type="messages", Gradio versions defaulting to tuple pairs would hand
    # us lists and `msg.get(...)` would raise AttributeError.
    type="messages",
    title="UTR & Loan Assistant (TinyLlama LoRA)",
    description=(
        "Ask things like:\n"
        "- What is my latest UTR?\n"
        "- How is my EMI calculated?\n"
        "- Summarize my repayment schedule.\n"
    ),
    examples=[
        "What is my latest UTR?",
        "Explain my repayment schedule.",
        "How are late payment charges calculated?",
    ],
)

if __name__ == "__main__":
    # Spaces will call `app.py` directly, so this is mainly for local testing
    demo.launch()