| import torch |
| from transformers import AutoTokenizer, AutoModelForCausalLM |
| from peft import PeftModel |
| import gradio as gr |
|
|
| |
| |
| |
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # Hugging Face Hub id of the base causal LM
LORA_REPO = "nitya001/autotrain-4n1y9-5ekvs"  # Hub repo holding the fine-tuned LoRA adapter


# Prefer GPU when available; fp16 only makes sense on CUDA, fall back to fp32 on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32


print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)


print("Loading base model...")
# device_map="auto" delegates GPU placement to accelerate; on CPU we load normally
# (device_map=None) and move the model explicitly below.
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=dtype,
    device_map="auto" if device == "cuda" else None,
)


print("Loading LoRA adapter:", LORA_REPO)
# Wrap the base model with the LoRA adapter weights via PEFT.
model = PeftModel.from_pretrained(base_model, LORA_REPO)
# NOTE(review): on the CUDA path device_map="auto" has already placed the model,
# so this .to() is redundant there; it matters for the CPU path — confirm intended.
model.to(device)
model.eval()  # inference mode: disables dropout etc.


# System prompt prepended to every conversation (TinyLlama/Zephyr chat format).
SYSTEM_PROMPT = (
    "You are a helpful assistant fine-tuned for loan journeys and UTR queries. "
    "Answer clearly and concisely. If you don't know some specific account value, "
    "explain what information is needed instead of hallucinating numbers."
)
|
|
| |
| |
| |
def generate_reply(message: str, history: list) -> str:
    """
    Generate one assistant reply for gr.ChatInterface.

    Args:
        message: latest user message (string).
        history: prior turns as a list of dicts:
            [{"role": "user"/"assistant", "content": "..."}, ...]

    Returns:
        The assistant's reply as a plain string; ChatInterface handles
        rendering it into the messages-format UI.
    """
    # Rebuild the TinyLlama/Zephyr chat-format prompt from scratch each turn.
    conversation = f"<|system|>{SYSTEM_PROMPT}</s>\n"

    for msg in history or []:
        role = msg.get("role", "user")
        content = msg.get("content", "")
        if role == "user":
            conversation += f"<|user|>{content}</s>\n"
        elif role == "assistant":
            conversation += f"<|assistant|>{content}</s>\n"

    # Append the new user turn plus the open assistant tag the model completes.
    conversation += f"<|user|>{message}</s>\n<|assistant|>"

    # BUGFIX: the tokenizer's default truncation_side="right" would cut off the
    # *end* of the prompt — i.e. the newest user message and the trailing
    # "<|assistant|>" tag — once the conversation exceeds max_length. Truncate
    # from the left instead so the most recent turns always survive; restore the
    # tokenizer's original setting afterwards since it is shared module state.
    previous_side = tokenizer.truncation_side
    tokenizer.truncation_side = "left"
    try:
        inputs = tokenizer(
            conversation,
            return_tensors="pt",
            truncation=True,
            max_length=2048,
        ).to(device)
    finally:
        tokenizer.truncation_side = previous_side

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,  # TinyLlama has no dedicated pad token
        )

    # Keep only the newly generated tokens (generate() echoes the prompt first).
    generated_ids = output_ids[0][inputs["input_ids"].shape[-1] :]
    answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    # Fallback so the UI never shows an empty bubble.
    if not answer:
        answer = "I'm not sure how to answer that. Could you rephrase your question?"

    return answer
|
|
| |
| |
| |
# Chat UI. type="messages" makes Gradio pass `history` to the callback as a list
# of {"role": ..., "content": ...} dicts — the format generate_reply indexes with
# msg.get("role")/msg.get("content"). Without it, older Gradio versions default to
# pair-tuples and the handler would raise AttributeError on msg.get.
demo = gr.ChatInterface(
    fn=generate_reply,
    type="messages",
    title="UTR & Loan Assistant (TinyLlama LoRA)",
    description=(
        "Ask things like:\n"
        "- What is my latest UTR?\n"
        "- How is my EMI calculated?\n"
        "- Summarize my repayment schedule.\n"
    ),
    examples=[
        "What is my latest UTR?",
        "Explain my repayment schedule.",
        "How are late payment charges calculated?",
    ],
)
|
|
if __name__ == "__main__":
    # Start the Gradio server (blocking call; serves the chat UI locally).
    demo.launch()
|
|