# app.py — Hugging Face Space chat demo (nitya001), commit 07e183d ("Update app.py").
# NOTE(review): the lines above were web-page scrape residue (site chrome / commit
# metadata), not Python source; preserved here as a comment so the file parses.
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import gradio as gr
# --------------------
# Model setup
# --------------------
# Base checkpoint the adapter was trained against; must match the LoRA's base.
BASE_MODEL = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Hub repo holding the LoRA adapter weights (AutoTrain output) —
# presumably trained on BASE_MODEL above; verify against the adapter config.
LORA_REPO = "nitya001/autotrain-4n1y9-5ekvs"

# fp16 halves GPU memory; CPUs lack fast fp16 kernels, so fall back to fp32.
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=dtype,
    # "auto" lets accelerate place layers on the GPU; None keeps a plain CPU load.
    device_map="auto" if device == "cuda" else None,
)

print("Loading LoRA adapter:", LORA_REPO)
# Wrap the frozen base model with the fine-tuned LoRA weights.
model = PeftModel.from_pretrained(base_model, LORA_REPO)
model.to(device)
model.eval()  # inference mode: disables dropout etc.

# Prepended to every conversation; steers the model away from inventing
# account-specific values it cannot actually know.
SYSTEM_PROMPT = (
    "You are a helpful assistant fine-tuned for loan journeys and UTR queries. "
    "Answer clearly and concisely. If you don't know some specific account value, "
    "explain what information is needed instead of hallucinating numbers."
)
# --------------------
# Generation function
# --------------------
def generate_reply(message: str, history: list) -> str:
    """Generate one assistant reply for a Gradio ChatInterface turn.

    Args:
        message: The latest user message.
        history: Prior turns in messages format:
            [{"role": "user" | "assistant", "content": "..."}, ...].

    Returns:
        The assistant's reply as a plain string; ChatInterface handles the
        messages formatting for the UI.
    """
    # Build a TinyLlama chat-style prompt: system turn first, then the prior
    # turns, then the new user message, ending with an open assistant tag.
    parts = [f"<|system|>{SYSTEM_PROMPT}</s>\n"]
    for msg in history or []:
        role = msg.get("role", "user")
        content = msg.get("content", "")
        if role == "user":
            parts.append(f"<|user|>{content}</s>\n")
        elif role == "assistant":
            parts.append(f"<|assistant|>{content}</s>\n")
    parts.append(f"<|user|>{message}</s>\n<|assistant|>")
    conversation = "".join(parts)

    # Fix: truncate from the LEFT so an over-long conversation drops the
    # oldest turns. The previous default (right truncation) cut off the END
    # of the prompt — the latest user message and the open <|assistant|>
    # tag — exactly the part the model needs in order to answer.
    previous_side = tokenizer.truncation_side
    tokenizer.truncation_side = "left"
    try:
        inputs = tokenizer(
            conversation,
            return_tensors="pt",
            truncation=True,
            max_length=2048,
        ).to(device)
    finally:
        # Restore the shared tokenizer's state so other callers are unaffected.
        tokenizer.truncation_side = previous_side

    with torch.no_grad():
        output_ids = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,  # avoid "no pad token" warning
        )

    # Decode only the newly generated continuation, not the echoed prompt.
    generated_ids = output_ids[0][inputs["input_ids"].shape[-1]:]
    answer = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

    if not answer:
        # Sampling can occasionally emit only special tokens; fall back gracefully.
        answer = "I'm not sure how to answer that. Could you rephrase your question?"
    return answer
# --------------------
# Gradio UI
# --------------------
# Canned prompts shown under the chat box so users can try the bot in one click.
_EXAMPLE_PROMPTS = [
    "What is my latest UTR?",
    "Explain my repayment schedule.",
    "How are late payment charges calculated?",
]

# ChatInterface calls generate_reply(message, history) for each turn and
# renders the returned string as the assistant message.
demo = gr.ChatInterface(
    fn=generate_reply,
    title="UTR & Loan Assistant (TinyLlama LoRA)",
    description=(
        "Ask things like:\n"
        "- What is my latest UTR?\n"
        "- How is my EMI calculated?\n"
        "- Summarize my repayment schedule.\n"
    ),
    examples=_EXAMPLE_PROMPTS,
)

if __name__ == "__main__":
    # Hugging Face Spaces runs app.py directly; this guard matters for local runs.
    demo.launch()