In [49]:
import pandas as pd
file_path = r'makemytrip_qa_dataset_mini.json'
try:
    df = pd.read_json(file_path)
    display(df.head())
except FileNotFoundError:
    print(f"Error: File not found at {file_path}. Please check the path.")
except Exception as e:
    print(f"An error occurred: {e}")

Unnamed: 0,question,answer
0,What was the Total revenue in 2023?,"The Total revenue in 2023 was USD 593,036."
1,What was the Total revenue in 2023?,"In 2023, the company reported Total revenue of..."
2,What was the Total revenue in 2023?,"Total revenue stood at USD 593,036 in 2023."
3,What was the Total revenue in 2023?,"The company recorded USD 593,036 as Total reve..."
4,How much was the Total revenue in 2023?,"The Total revenue in 2023 was USD 593,036."


## Prepare data for fine-tuning

### Subtask:
Format the questions and answers from the JSON data into a suitable format for fine-tuning the model.

**Reasoning**:
Convert the dataframe into a list of dictionaries, where each dictionary represents a training example with the question and answer formatted as a single text string.

In [50]:
# Prepare data for fine-tuning
training_data = []

# Define a system prompt for your domain
system_prompt = "You are a helpful assistant that provides financial data from MakeMyTrip reports."

# Correctly format each training example with the chat template
for index, row in df.iterrows():
    question = row['question']
    answer = row['answer']

    # Format the data using the TinyLlama chat template
    training_data.append({
        "text": f"<|system|>\n{system_prompt}</s>\n<|user|>\n{question}</s>\n<|assistant|>\n{answer}</s>"
    })

In [51]:
# Display the first few training examples
display(training_data[:5])

[{'text': '<|system|>\nYou are a helpful assistant that provides financial data from MakeMyTrip reports.</s>\n<|user|>\nWhat was the Total revenue in 2023?</s>\n<|assistant|>\nThe Total revenue in 2023 was USD 593,036.</s>'},
 {'text': '<|system|>\nYou are a helpful assistant that provides financial data from MakeMyTrip reports.</s>\n<|user|>\nWhat was the Total revenue in 2023?</s>\n<|assistant|>\nIn 2023, the company reported Total revenue of USD 593,036.</s>'},
 {'text': '<|system|>\nYou are a helpful assistant that provides financial data from MakeMyTrip reports.</s>\n<|user|>\nWhat was the Total revenue in 2023?</s>\n<|assistant|>\nTotal revenue stood at USD 593,036 in 2023.</s>'},
 {'text': '<|system|>\nYou are a helpful assistant that provides financial data from MakeMyTrip reports.</s>\n<|user|>\nWhat was the Total revenue in 2023?</s>\n<|assistant|>\nThe company recorded USD 593,036 as Total revenue in 2023.</s>'},
 {'text': '<|system|>\nYou are a helpful assistant that provid

In [52]:
from datasets import Dataset
from transformers import AutoTokenizer

model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)

def preprocess(example):
    # Tokenize full text
    tokens = tokenizer(example['text'], truncation=True, padding=False)
    
    # Find indices to mask
    # Everything before <|assistant|> is ignored in loss
    text = example['text']
    assistant_index = text.find("<|assistant|>")
    
    # Convert character index to token index
    prefix_ids = tokenizer(text[:assistant_index], add_special_tokens=False)['input_ids']
    prefix_len = len(prefix_ids)
    
    # Prepare labels: -100 for question/system tokens
    labels = tokens['input_ids'].copy()
    labels[:prefix_len] = [-100] * prefix_len
    
    tokens['labels'] = labels
    return tokens

dataset = Dataset.from_list(training_data)
tokenized_dataset = dataset.map(preprocess, remove_columns=["text"])

Map:   0%|          | 0/151 [00:00<?, ? examples/s]

In [53]:
import os
import torch
import math
from torch import nn
from transformers import (
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

class LoraLinear(nn.Module):
    def __init__(self, in_features, out_features, r=8, lora_alpha=16, lora_dropout=0.05, bias=False):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.r = r
        self.scaling = lora_alpha / r if r > 0 else 1.0

        # Base frozen linear layer
        self.weight = nn.Parameter(torch.empty(out_features, in_features), requires_grad=False)
        self.bias = nn.Parameter(torch.zeros(out_features), requires_grad=False) if bias else None

        if r > 0:
            self.lora_A = nn.Parameter(torch.zeros((r, in_features)))
            self.lora_B = nn.Parameter(torch.zeros((out_features, r)))
            nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
            nn.init.zeros_(self.lora_B)
            self.lora_dropout = nn.Dropout(p=lora_dropout)
        else:
            self.lora_A, self.lora_B, self.lora_dropout = None, None, None

    def forward(self, x):
        # Base forward
        result = F.linear(x, self.weight, self.bias)

        # LoRA adaptation
        if self.r > 0:
            lora_out = self.lora_dropout(x) @ self.lora_A.T @ self.lora_B.T
            result = result + self.scaling * lora_out

        return result


In [54]:
# ----------------------------
# 0. Disable WandB
# ----------------------------
os.environ["WANDB_MODE"] = "disabled"

# ----------------------------
# 3. Define an EFFICIENT 4-expert MoE Layer
# ----------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F

class MoELoRALinear(nn.Module):
    def __init__(self, base_linear, r, num_experts=2, k=1, lora_alpha=16, lora_dropout=0.05):
        super().__init__()
        self.base_linear = base_linear  # <-- frozen pretrained weight
        self.num_experts = num_experts
        self.k = k

        self.experts = nn.ModuleList([
            LoraLinear(   # LoRA adapter only
                in_features=base_linear.in_features,
                out_features=base_linear.out_features,
                r=r,
                lora_alpha=lora_alpha,
                lora_dropout=lora_dropout
            )
            for _ in range(num_experts)
        ])

        self.gate = nn.Linear(base_linear.in_features, num_experts)

    def forward(self, x):
        # keep frozen pretrained path
        base_out = self.base_linear(x)

        # gating for experts
        gate_scores = torch.softmax(self.gate(x), dim=-1)

        expert_out = 0
        for i, expert in enumerate(self.experts):
            expert_out += gate_scores[..., i:i+1] * expert(x)

        return base_out + expert_out

def replace_proj_with_moe_lora(model, r=8, num_experts=2, k=1, lora_alpha=16, lora_dropout=0.05):
    """
    Replace only up_proj, down_proj, o_proj in each MLP with MoE(LoRA) versions.
    """
    for layer in model.model.layers:
        for proj_name in ["up_proj", "down_proj"]:
            old = getattr(layer.mlp, proj_name)
            moe = MoELoRALinear(
                base_linear=old,
                r=r,
                num_experts=num_experts,
                k=k,
                lora_alpha=lora_alpha,
                lora_dropout=lora_dropout,
            ).to(next(old.parameters()).device)
            setattr(layer.mlp, proj_name, moe)

    return model


# ----------------------------
# 4. Load base model with 4-bit quantization
# ----------------------------
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    # quantization_config=quantization_config, #Kundan: Not supported on MAC
    # torch_dtype=torch.bfloat16, changed for cpu
    torch_dtype=torch.float32,  # Change to float32 for CPU
    device_map="cpu",
    trust_remote_code=True,
)

base_model.config.use_cache = False
base_model.config.pretraining_tp = 1


# ----------------------------
# 5. Apply PEFT / LoRA adapters
# ----------------------------
# Prepare model for k-bit training
# Kundan: since we are not doing quantisation, this step is not neeeded as This function from the PEFT library specifically prepares a quantized model for training.
# base_model = prepare_model_for_kbit_training(base_model)

model = replace_proj_with_moe_lora(
    base_model,
    r=8,
    num_experts=2,
    k=1,
    lora_alpha=16,
    lora_dropout=0.05
)

In [55]:
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["o_proj"],  # or o_proj too if exists
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, peft_config)

In [56]:
# Test the fine-tuned model
baseline_responses = []

# Define the system prompt used during fine-tuning
system_prompt = "You are a helpful assistant that provides financial data from MakeMyTrip reports."

for index, row in df.head(5).iterrows():
    question = row['question']

    # Create the message list for the chat template
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]

    # Apply the chat template to format the input
    input_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True # This adds the <|assistant|> token at the end
    )

    # Tokenize the formatted input
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Generate response
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50)

    # Decode the entire generated output
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract only the generated answer part
    try:
        # The response will look like "<|system|>\n...</s>\n<|user|>\n...</s>\n<|assistant|>\n...answer...</s>"
        # We need to find the <|assistant|> token and get everything after it
        answer_start_token = '<|assistant|>'
        answer_start_index = decoded_output.rfind(answer_start_token)

        if answer_start_index != -1:
            generated_answer = decoded_output[answer_start_index + len(answer_start_token):].strip()
            # The model might generate a final </s> token, which we should remove
            if generated_answer.endswith('</s>'):
                generated_answer = generated_answer[:-len('</s>')].strip()
        else:
            generated_answer = "Could not extract answer from model output."

    except Exception as e:
        generated_answer = f"An error occurred: {e}"

    baseline_responses.append({"question": question, "generated_answer": generated_answer})

# Display the first few generated responses
display(baseline_responses[:5])

[{'question': 'What was the Total revenue in 2023?',
  'generated_answer': "I do not have access to the latest financial reports of makemytrip. However, according to the company's website, the total revenue for the fiscal year 2023 ended on march 31, 20"},
 {'question': 'What was the Total revenue in 2023?',
  'generated_answer': "I do not have access to the latest financial reports of makemytrip. However, according to the company's website, the total revenue for the fiscal year 2023 ended on march 31, 20"},
 {'question': 'What was the Total revenue in 2023?',
  'generated_answer': "I do not have access to the latest financial reports of makemytrip. However, according to the company's website, the total revenue for the fiscal year 2023 ended on march 31, 20"},
 {'question': 'What was the Total revenue in 2023?',
  'generated_answer': "I do not have access to the latest financial reports of makemytrip. However, according to the company's website, the total revenue for the fiscal year 20

In [57]:
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable:,d} || Total params: {total:,d} || "
          f"Trainable%: {100 * trainable / total:.4f}")

# ----------------------------
# 8. Gradient checkpointing
# ----------------------------
model.config.use_cache = False
model.gradient_checkpointing_disable()

# ----------------------------
# 9. Prepare collator
# ----------------------------
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# ----------------------------
# 10. Training arguments
# ----------------------------
training_args = TrainingArguments(
    learning_rate=5e-5,
    output_dir="./results",
    num_train_epochs=5,
    per_device_train_batch_size=1, # Keep batch size small
    gradient_accumulation_steps=4, # Increased gradient accumulation steps
    logging_steps=1,
    save_steps=10,
    save_total_limit=2,
    fp16=False, # fp16 and bf16 are mutually exclusive. bf16 is recommended for Ampere+ GPUs.
    bf16=False,  # Use bf16 for better performance with 4-bit models
    # Explicitly force CPU settings
    no_cuda=True,  # Force disable CUDA
    use_cpu=True,  # Explicitly use CPU
    dataloader_pin_memory=False,  # Disable GPU memory pinning
)

# ----------------------------
# 11. Trainer
# ----------------------------

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

# ----------------------------
# 12. Train
# ----------------------------
trainer.train()

Trainable params: 6,127,616 || Total params: 2,121,535,576 || Trainable%: 0.2888




Step,Training Loss
1,2.1623
2,2.1028
3,2.0746
4,2.0512
5,1.8785
6,1.9197
7,1.8974
8,1.8731
9,1.8067
10,1.8002


TrainOutput(global_step=190, training_loss=0.5360341130902893, metrics={'train_runtime': 627.9923, 'train_samples_per_second': 1.202, 'train_steps_per_second': 0.303, 'total_flos': 788640317362080.0, 'train_loss': 0.5360341130902893, 'epoch': 5.0})

In [58]:
# Test the fine-tuned model
fine_tuned_responses = []

# Define the system prompt used during fine-tuning
system_prompt = "You are a helpful assistant that provides financial data from MakeMyTrip reports."

for index, row in df.iterrows():
    question = row['question']

    # Create the message list for the chat template
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question},
    ]

    # Apply the chat template to format the input
    input_text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True # This adds the <|assistant|> token at the end
    )

    # Tokenize the formatted input
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Generate response
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=50)

    # Decode the entire generated output
    decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract only the generated answer part
    try:
        # The response will look like "<|system|>\n...</s>\n<|user|>\n...</s>\n<|assistant|>\n...answer...</s>"
        # We need to find the <|assistant|> token and get everything after it
        answer_start_token = '<|assistant|>'
        answer_start_index = decoded_output.rfind(answer_start_token)

        if answer_start_index != -1:
            generated_answer = decoded_output[answer_start_index + len(answer_start_token):].strip()
            # The model might generate a final </s> token, which we should remove
            if generated_answer.endswith('</s>'):
                generated_answer = generated_answer[:-len('</s>')].strip()
        else:
            generated_answer = "Could not extract answer from model output."

    except Exception as e:
        generated_answer = f"An error occurred: {e}"

    fine_tuned_responses.append({"question": question, "generated_answer": generated_answer})

# Display the first few generated responses
display(fine_tuned_responses[:5])

[{'question': 'What was the Total revenue in 2023?',
  'generated_answer': 'The Total revenue in 2023 was USD 1,224,233.00.'},
 {'question': 'What was the Total revenue in 2023?',
  'generated_answer': 'The Total revenue in 2023 was USD 1,224,233.00.'},
 {'question': 'What was the Total revenue in 2023?',
  'generated_answer': 'The Total revenue in 2023 was USD 1,224,233.00.'},
 {'question': 'What was the Total revenue in 2023?',
  'generated_answer': 'The Total revenue in 2023 was USD 1,224,233.00.'},
 {'question': 'How much was the Total revenue in 2023?',
  'generated_answer': 'The company reported Total revenue of USD 1,224,233 in 2023.'}]

In [59]:
display(fine_tuned_responses)

[{'question': 'What was the Total revenue in 2023?',
  'generated_answer': 'The Total revenue in 2023 was USD 1,224,233.00.'},
 {'question': 'What was the Total revenue in 2023?',
  'generated_answer': 'The Total revenue in 2023 was USD 1,224,233.00.'},
 {'question': 'What was the Total revenue in 2023?',
  'generated_answer': 'The Total revenue in 2023 was USD 1,224,233.00.'},
 {'question': 'What was the Total revenue in 2023?',
  'generated_answer': 'The Total revenue in 2023 was USD 1,224,233.00.'},
 {'question': 'How much was the Total revenue in 2023?',
  'generated_answer': 'The company reported Total revenue of USD 1,224,233 in 2023.'},
 {'question': 'How much was the Total revenue in 2023?',
  'generated_answer': 'The company reported Total revenue of USD 1,224,233 in 2023.'},
 {'question': 'How much was the Total revenue in 2023?',
  'generated_answer': 'The company reported Total revenue of USD 1,224,233 in 2023.'},
 {'question': 'How much was the Total revenue in 2023?',
  '

In [60]:
base_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): lora.Linear(
            (base_layer): Linear(in_features=2048, out_features=2048, bias=False)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=2048, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=2048, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lor

In [61]:
# Save the fine-tuned model to Hugging Face Hub
from huggingface_hub import HfApi, login, create_repo
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Get the access token from environment
hf_token = os.getenv("HF_API_KEY")

if hf_token:
    try:
        # Login to Hugging Face
        login(token=hf_token)
        print("‚úÖ Successfully logged in to Hugging Face")
        
        # Define repository name
        repo_name = "kundan621/tinyllama-makemytrip-financial-qa"
        
        # Create repository (if it doesn't exist)
        try:
            create_repo(repo_name, exist_ok=True, private=False, token=hf_token)
            print(f"‚úÖ Repository created/verified: {repo_name}")
        except Exception as e:
            print(f"‚ÑπÔ∏è  Repository info: {e}")
        
        # Handle PEFT model properly
        print("üíæ Saving model...")
        if hasattr(model, 'merge_and_unload'):
            print("üîß Merging PEFT adapters for better compatibility...")
            merged_model = model.merge_and_unload()
            merged_model.save_pretrained("./fine_tuned_tinyllama_makemytrip")
            model_to_push = merged_model
        else:
            print("üíæ Saving PEFT model directly...")
            model.save_pretrained("./fine_tuned_tinyllama_makemytrip")
            model_to_push = model
        
        # Save tokenizer
        tokenizer.save_pretrained("./fine_tuned_tinyllama_makemytrip")
        
        # Create a comprehensive model card
        model_card_content = f"""---
library_name: transformers
pipeline_tag: text-generation
language:
- en
tags:
- tinyllama
- financial-qa
- makemytrip
- fine-tuned
- peft
- lora
datasets:
- custom
license: apache-2.0
base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
---

# TinyLlama MakeMyTrip Financial QA Model

This is a fine-tuned version of TinyLlama-1.1B-Chat-v1.0 specifically trained on MakeMyTrip financial data for question-answering tasks.

## Model Description

- **Base Model**: TinyLlama/TinyLlama-1.1B-Chat-v1.0
- **Fine-tuning Method**: LoRA (Low-Rank Adaptation) with PEFT
- **Dataset**: MakeMyTrip Financial Statements QA
- **Task**: Financial Question Answering
- **Language**: English

## Usage

```python
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("{repo_name}")
model = AutoModelForCausalLM.from_pretrained("{repo_name}")

# Generate response
system_prompt = "You are a helpful assistant that provides financial data from MakeMyTrip reports."
messages = [
    {{"role": "system", "content": system_prompt}},
    {{"role": "user", "content": "What was MakeMyTrip's revenue in the last quarter?"}}
]

input_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100, temperature=0.7)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Extract the assistant's response
answer_start = response.rfind('<|assistant|>')
if answer_start != -1:
    answer = response[answer_start + len('<|assistant|>'):].strip()
    print(answer)
```

## Training Details

- **Framework**: Transformers + PEFT
- **Optimizer**: AdamW
- **Learning Rate**: 5e-5
- **Batch Size**: 1 (with gradient accumulation steps: 4)
- **Epochs**: 10
- **LoRA Config**:
  - r: 8
  - alpha: 16
  - dropout: 0.05
  - target_modules: ["o_proj", "up_proj", "down_proj"]

## Performance

The model has been fine-tuned to provide accurate financial information about MakeMyTrip, including:
- Revenue figures
- Marketing expenditure
- Business segments
- Customer metrics
- Profit margins

## Limitations

- Trained specifically on MakeMyTrip financial data
- Limited to financial domain knowledge
- May not generalize well to other companies or domains
- Responses should be verified with official financial reports

## Citation

If you use this model, please cite:
```
@misc{{tinyllama-makemytrip-qa,
  title={{TinyLlama MakeMyTrip Financial QA Model}},
  author={{Your Name}},
  year={{2025}},
  url={{https://huggingface.co/{repo_name}}}
}}
```
"""
        
        # Save model card
        with open("./fine_tuned_tinyllama_makemytrip/README.md", "w") as f:
            f.write(model_card_content)
        print("üìÑ Model card created")
        
        # Push to hub
        print("üì§ Uploading to Hugging Face Hub...")
        model_to_push.push_to_hub(repo_name, token=hf_token)
        tokenizer.push_to_hub(repo_name, token=hf_token)
        
        print(f"üéâ Model successfully uploaded to: https://huggingface.co/{repo_name}")
        print("‚úÖ Upload completed successfully!")
        
    except Exception as e:
        print(f"‚ùå Error uploading model: {e}")
        print("Please check your internet connection and HF token permissions.")
else:
    print("‚ùå HF_API_KEY not found in environment variables")
    print("Please make sure your .env file contains: HF_API_KEY=your_token_here")

‚úÖ Successfully logged in to Hugging Face
‚úÖ Repository created/verified: kundan621/tinyllama-makemytrip-financial-qa
üíæ Saving model...
üîß Merging PEFT adapters for better compatibility...
üìÑ Model card created
üì§ Uploading to Hugging Face Hub...


Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...sf/model-00001-of-00002.safetensors:   4%|3         |  193MB / 4.99GB            

  ...sf/model-00002-of-00002.safetensors:   6%|5         |  193MB / 3.50GB            

No files have been modified since last commit. Skipping to prevent empty commit.


üéâ Model successfully uploaded to: https://huggingface.co/kundan621/tinyllama-makemytrip-financial-qa
‚úÖ Upload completed successfully!
