import streamlit as st
import time
import numpy as np
import torch
from dotenv import load_dotenv
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
from search_final import rag_pipeline
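# rag_pipeline(query) is expected to return (answer, docs), where each doc is
# a dict exposing a 'content' field (see the supporting-documents loop below).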

# Load environment variables
load_dotenv()
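
# st.cache_resource keeps one model/tokenizer pair alive per process, so the
# expensive download and load below run only once across Streamlit reruns.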

@st.cache_resource
def load_fine_tuned_model():
    """Load the fine-tuned model from Hugging Face Hub"""
    try:
        # Fine-tuned PEFT adapter on the Hugging Face Hub (swap in your own repo if needed)
        model_name = "kundan621/tinyllama-makemytrip-financial-qa"
        
        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        
        # Load base model
        base_model = AutoModelForCausalLM.from_pretrained(
            "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
            torch_dtype=torch.float32,
            device_map="cpu",
            trust_remote_code=True,
        )
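        # float32 on CPU keeps the app portable (no GPU required), at the cost
        # of slow generation for a 1.1B-parameter model.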
        
        # Load the fine-tuned PEFT model
        model = PeftModel.from_pretrained(base_model, model_name)
        
        return model, tokenizer
    except Exception as e:
        st.error(f"Error loading fine-tuned model: {e}")
        return None, None

def generate_fine_tuned_response(model, tokenizer, question):
    """Generate response using the fine-tuned model"""
    system_prompt = "You are a helpful assistant that provides financial data from MakeMyTrip reports."
    
    # Create the message list for the chat template
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]
    
    # Apply the chat template to format the input
    input_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
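    # For TinyLlama-1.1B-Chat-v1.0 this renders roughly as:
    #   <|system|>
    #   {system_prompt}</s>
    #   <|user|>
    #   {question}</s>
    #   <|assistant|>
    # which is why the answer is pulled out after the last '<|assistant|>' below.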
    
    # Tokenize the formatted input
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    
    # Generate response
    with torch.no_grad():
        outputs = model.generate(
            **inputs, 
            max_new_tokens=100,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )
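    # do_sample=True with temperature=0.7 gives mildly varied answers; set
    # do_sample=False (greedy decoding) for reproducible output.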
    
    # Decode the entire generated output
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
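
    # Alternative: decode only the newly generated tokens, avoiding string
    # matching on the chat template, e.g.
    #   new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    #   answer = tokenizer.decode(new_tokens, skip_special_tokens=True)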
    
    # Extract only the generated answer part
    try:
        answer_start_token = '<|assistant|>'
        answer_start_index = decoded_output.rfind(answer_start_token)
        
        if answer_start_index != -1:
            generated_answer = decoded_output[answer_start_index + len(answer_start_token):].strip()
            if generated_answer.endswith('</s>'):
                generated_answer = generated_answer[:-len('</s>')].strip()
        else:
            generated_answer = "Could not extract answer from model output."
    except Exception as e:
        generated_answer = f"An error occurred: {e}"
    
    return generated_answer

# --- UI Layout ---
st.set_page_config(page_title="Finance QA Assistant", layout="centered")
st.title("Finance QA Assistant")

# Lazy-load the fine-tuned model only when that mode is selected
fine_tuned_model, fine_tuned_tokenizer = None, None

mode = st.radio("Choose Answering Mode:", ["RAG", "Fine-Tuned"], horizontal=True)

if mode == "Fine-Tuned":
    with st.spinner("Loading fine-tuned model..."):
        fine_tuned_model, fine_tuned_tokenizer = load_fine_tuned_model()

query = st.text_input("Enter your question:")

if st.button("Get Answer") and query:
    start_time = time.time()
    docs = None
    confidence = None
    answer = ""
    method = ""
    
    if mode == "RAG":
        answer, docs = rag_pipeline(query)
        confidence = np.random.uniform(0.7, 0.99)  # Placeholder: simulated score, not a measured confidence
        method = "RAG"
    elif mode == "Fine-Tuned":
        if fine_tuned_model and fine_tuned_tokenizer:
            answer = generate_fine_tuned_response(fine_tuned_model, fine_tuned_tokenizer, query)
            confidence = np.random.uniform(0.8, 0.95)  # Placeholder: simulated score, not a measured confidence
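            # A calibrated score would instead come from the model's token
            # probabilities, e.g. generate(..., output_scores=True,
            # return_dict_in_generate=True) with averaged per-token log-probs.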
            method = "Fine-Tuned TinyLlama"
        else:
            answer = "Fine-tuned model failed to load. Please check the model repository."
            confidence = 0.0
            method = "Error"
    
    response_time = time.time() - start_time

    st.markdown(f"**Answer:** {answer}")
    if confidence is not None:
        st.markdown(f"**Confidence Score:** {confidence:.2f}")
    st.markdown(f"**Method Used:** {method}")
    st.markdown(f"**Response Time:** {response_time:.2f} seconds")

    if mode == "RAG" and docs:
        st.markdown("---")
        st.markdown("**Supporting Documents:**")
        for doc in docs:
            st.markdown(f"- {doc['content'][:120]}...")
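
# To run locally (assuming this script is saved as app.py):
#   streamlit run app.py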