Upload 11 files
- MAP_EXP_24.py +285 -0
- MAP_EXP_24_oof.parquet +3 -0
- README.md +202 -0
- adapter_config.json +43 -0
- adapter_model.safetensors +3 -0
- chat_template.jinja +24 -0
- special_tokens_map.json +24 -0
- tokenizer.json +0 -0
- tokenizer.model +3 -0
- tokenizer_config.json +0 -0
- training_args.bin +3 -0
MAP_EXP_24.py
ADDED
@@ -0,0 +1,285 @@
# All imports at the top
import os
import shutil

import numpy as np
import pandas as pd
import torch
import mlflow

from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    BitsAndBytesConfig,
    AutoModelForSequenceClassification,
)
from peft import (
    LoraConfig,
    TaskType,
    get_peft_model,
    prepare_model_for_kbit_training,
)

# Hugging Face token (set your own; never commit a real token)
os.environ["HF_TOKEN"] = "<your_hf_token>"

# Configuration
exp_name = "MAP_EXP_24"
model_name = "mistralai/Mathstral-7B-v0.1"
MAX_LEN = 512

# MLflow setup
mlflow.set_tracking_uri("http://127.0.0.1:8081")

# Step 2: Load the dataset
le = LabelEncoder()
train = pd.read_csv('category_misconception_folds.csv')
train.Misconception = train.Misconception.fillna('NA')
train['target'] = train.Category + ":" + train.Misconception
train['label'] = le.fit_transform(train['target'])
n_classes = len(le.classes_)
print(f"Train shape: {train.shape} with {n_classes} target classes")
print(train.head())

# Identify the correct answer per question: among rows whose Category starts
# with 'True', keep the most frequent MC_Answer for each QuestionId
idx = train.apply(lambda row: row.Category.split('_')[0], axis=1) == 'True'
correct = train.loc[idx].copy()
correct['c'] = correct.groupby(['QuestionId', 'MC_Answer']).MC_Answer.transform('count')
correct = correct.sort_values('c', ascending=False)
correct = correct.drop_duplicates(['QuestionId'])
correct = correct[['QuestionId', 'MC_Answer']]
correct['is_correct'] = 1

train = train.merge(correct, on=['QuestionId', 'MC_Answer'], how='left')
train.is_correct = train.is_correct.fillna(0)

# Format input text:
# "Question: ...\nAnswer: ...\nThis answer is correct.\nStudent Explanation: ..."
def format_input(row):
    x = "This answer is correct."
    if not row['is_correct']:
        x = "This answer is incorrect."
    return (
        f"Question: {row['QuestionText']}\n"
        f"Answer: {row['MC_Answer']}\n"
        f"{x}\n"
        f"Student Explanation: {row['StudentExplanation']}"
    )

train['text'] = train.apply(format_input, axis=1)

# Split data by precomputed folds
train_df = train[train["fold"] == 0].reset_index(drop=True)
val_df = train[train["fold"] == 1].reset_index(drop=True)

COLS = ['text', 'label']
train_ds = Dataset.from_pandas(train_df[COLS])
val_ds = Dataset.from_pandas(val_df[COLS])

# Initialize tokenizer; Mistral has no pad token, so reuse EOS
tokenizer = AutoTokenizer.from_pretrained(model_name, token=os.environ["HF_TOKEN"])
tokenizer.pad_token = tokenizer.eos_token

# Tokenization function
def tokenize_func(example):
    return tokenizer(
        example["text"],
        add_special_tokens=True,
        truncation=True,
        max_length=MAX_LEN,
    )

# Tokenize datasets
train_ds = train_ds.map(tokenize_func, batched=True, desc="Tokenizing train data")
eval_ds = val_ds.map(tokenize_func, batched=True, desc="Tokenizing eval data")

# Step 3: Load model
# Model configuration: 4-bit NF4 quantization with fp16 compute
model_kwargs = dict(
    trust_remote_code=True,
    torch_dtype=torch.float16,
)
model_kwargs["quantization_config"] = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype="float16",
)

# Load model
print(f"Loading model: {model_name}")
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, use_cache=False, num_labels=n_classes, token=os.environ["HF_TOKEN"], **model_kwargs
)
model.config.pad_token_id = tokenizer.pad_token_id

# LoRA configuration
lora_config = LoraConfig(
    r=64,
    lora_alpha=64,
    target_modules="all-linear",
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_CLS,
    modules_to_save=["score"],  # also train the classification head in full
)

# Prepare model for training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Training arguments (max_steps=2500 takes precedence over num_train_epochs)
training_args = TrainingArguments(
    output_dir=exp_name,
    eval_strategy="steps",
    save_strategy="no",
    logging_strategy="steps",
    eval_steps=500,
    max_steps=2500,
    logging_steps=100,
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    report_to="mlflow",
    gradient_checkpointing=True,
    max_grad_norm=1.0,
    weight_decay=0.01,
    num_train_epochs=2,
)


class MLflowMetricsLogger:
    """
    A callable class to compute and log metrics to MLflow with step tracking.
    """
    def __init__(self, trainer: Trainer, ks=(3, 5, 10)):
        """
        Initializes the metrics logger.

        Args:
            trainer (Trainer): The Hugging Face Trainer instance.
            ks: k values for MAP@k calculation.
        """
        self.trainer = trainer
        self.ks = ks

    def __call__(self, eval_pred):
        """
        Called by the Trainer during evaluation.
        """
        # Get the current training step from the trainer's state
        step = self.trainer.state.global_step

        # 1. Unpack logits and labels
        logits, labels = eval_pred
        labels = np.array(labels)

        # 2. Convert logits to probabilities
        probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1).numpy()

        # 3. Get top-k predictions
        max_k = max(self.ks)
        top_k_preds = np.argsort(-probs, axis=1)[:, :max_k]

        # 4. Boolean match array: does the prediction at each rank equal the label?
        match_array = (top_k_preds == labels[:, None])

        # 5. Compute MAP@k for each specified k (single-label case: the
        #    reciprocal rank of the true class if it appears in the top k, else 0)
        metrics = {}
        for k in self.ks:
            match_at_k = match_array[:, :k]
            ranks = np.argmax(match_at_k, axis=1) + 1
            has_match_at_k = np.any(match_at_k, axis=1)
            scores = has_match_at_k * (1.0 / ranks)
            metrics[f"map@{k}"] = np.mean(scores)

        # 6. Calculate detailed rank-position breakdown
        ranks_with_indices = [np.where(row)[0] for row in match_array]
        correct_ranks = np.array([r[0] + 1 if len(r) > 0 else max_k + 1 for r in ranks_with_indices])

        total = labels.shape[0]
        rank_1_count = np.sum(correct_ranks == 1)
        rank_2_to_3_count = np.sum((correct_ranks >= 2) & (correct_ranks <= 3))
        rank_above_3_count = np.sum((correct_ranks > 3) & (correct_ranks <= max_k))
        no_match_count = np.sum(correct_ranks > max_k)

        # Log metrics to MLflow WITH the step argument
        mlflow.log_metric("rank_1", rank_1_count, step=step)
        mlflow.log_metric("rank_2_to_3", rank_2_to_3_count, step=step)
        mlflow.log_metric("rank_above_3", rank_above_3_count, step=step)
        mlflow.log_metric("no_match_in_top_k", no_match_count, step=step)

        metrics["rank_1"] = rank_1_count
        metrics["rank_2_to_3"] = rank_2_to_3_count
        metrics["rank_above_3"] = rank_above_3_count
        metrics["no_match_in_top_k"] = no_match_count
        metrics["total"] = total

        return metrics


# Initialize trainer
trainer = Trainer(
    model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    tokenizer=tokenizer,
    data_collator=DataCollatorWithPadding(tokenizer),
)

# The logger needs the trainer for step tracking, so attach it after construction
metrics_computer = MLflowMetricsLogger(trainer)
trainer.compute_metrics = metrics_computer

# Main execution
if __name__ == "__main__":

    # Start training
    trainer.train()

    # Save the model (adapter weights + tokenizer files)
    trainer.save_model(exp_name)

    print("Getting predictions on validation set...")
    predictions = trainer.predict(eval_ds)

    # Extract logits and predictions
    logits = predictions.predictions
    predicted_labels = np.argmax(logits, axis=1)

    # Create results dataframe
    val_results = val_df.copy()
    val_results['predicted'] = predicted_labels

    # Convert logits to list of lists for storage
    val_results['logits'] = [logit.tolist() for logit in logits]

    # Save validation results
    val_results.to_parquet(f"{exp_name}/{exp_name}_oof.parquet", index=False)
    print(f"Validation results saved to {exp_name}/{exp_name}_oof.parquet")

    # Copy this script into the experiment directory for reproducibility
    source_file = f"{exp_name}.py"
    destination_directory = exp_name
    shutil.copy(source_file, destination_directory)
    print(f"File '{source_file}' copied to '{destination_directory}'")

    print("Training completed and model saved!")
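A minimal inference sketch (not part of the upload): it assumes the repo files are downloaded locally to a `MAP_EXP_24` directory and that the label encoder is refit on the same `category_misconception_folds.csv` used in training, so class indices line up with the adapter's classification head.

```python
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from peft import PeftModel

# Rebuild the label space exactly as MAP_EXP_24.py does
train = pd.read_csv("category_misconception_folds.csv")
train.Misconception = train.Misconception.fillna("NA")
le = LabelEncoder().fit(train.Category + ":" + train.Misconception)

tokenizer = AutoTokenizer.from_pretrained("MAP_EXP_24")
tokenizer.pad_token = tokenizer.eos_token

# Load the base model with a fresh classification head, then apply the adapter
model = AutoModelForSequenceClassification.from_pretrained(
    "mistralai/Mathstral-7B-v0.1",
    num_labels=len(le.classes_),
    torch_dtype=torch.float16,
)
model.config.pad_token_id = tokenizer.pad_token_id
model = PeftModel.from_pretrained(model, "MAP_EXP_24").eval()

# Same input format the training script builds in format_input()
text = (
    "Question: ...\n"
    "Answer: ...\n"
    "This answer is correct.\n"
    "Student Explanation: ..."
)
enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
with torch.no_grad():
    logits = model(**enc).logits
top3 = logits[0].topk(3).indices.tolist()
print(le.inverse_transform(top3))  # top-3 "Category:Misconception" predictions
```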
MAP_EXP_24_oof.parquet
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dbaf83a651322985d601e81f84c479b9c8978f5ce606846fa7e32c539ccb67ad
size 3816746
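A sketch for scoring these out-of-fold predictions, assuming the columns MAP_EXP_24.py writes (`label` as the true class id and `logits` as a per-class list):

```python
import numpy as np
import pandas as pd

oof = pd.read_parquet("MAP_EXP_24_oof.parquet")
logits = np.stack(oof["logits"].to_numpy())  # (n_rows, n_classes)
labels = oof["label"].to_numpy()

# MAP@3 for the single-label case: reciprocal rank of the true class
# if it appears in the top 3, else 0
top3 = np.argsort(-logits, axis=1)[:, :3]
match = top3 == labels[:, None]
ranks = np.argmax(match, axis=1) + 1
map3 = float(np.mean(np.where(match.any(axis=1), 1.0 / ranks, 0.0)))
print(f"MAP@3: {map3:.4f}")
```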
README.md
ADDED
@@ -0,0 +1,202 @@
---
base_model: mistralai/Mathstral-7B-v0.1
library_name: peft
---

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->

## Model Details

### Model Description

<!-- Provide a longer summary of what this model is. -->

- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]

### Model Sources [optional]

<!-- Provide the basic links for the model. -->

- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]

## Uses

<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->

### Direct Use

<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->

[More Information Needed]

### Downstream Use [optional]

<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->

[More Information Needed]

### Out-of-Scope Use

<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->

[More Information Needed]

## Bias, Risks, and Limitations

<!-- This section is meant to convey both technical and sociotechnical limitations. -->

[More Information Needed]

### Recommendations

<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->

Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.

## How to Get Started with the Model

Use the code below to get started with the model.

[More Information Needed]

## Training Details

### Training Data

<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->

[More Information Needed]

### Training Procedure

<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->

#### Preprocessing [optional]

[More Information Needed]

#### Training Hyperparameters

- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->

#### Speeds, Sizes, Times [optional]

<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->

[More Information Needed]

## Evaluation

<!-- This section describes the evaluation protocols and provides the results. -->

### Testing Data, Factors & Metrics

#### Testing Data

<!-- This should link to a Dataset Card if possible. -->

[More Information Needed]

#### Factors

<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->

[More Information Needed]

#### Metrics

<!-- These are the evaluation metrics being used, ideally with a description of why. -->

[More Information Needed]

### Results

[More Information Needed]

#### Summary

## Model Examination [optional]

<!-- Relevant interpretability work for the model goes here -->

[More Information Needed]

## Environmental Impact

<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->

Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).

- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]

## Technical Specifications [optional]

### Model Architecture and Objective

[More Information Needed]

### Compute Infrastructure

[More Information Needed]

#### Hardware

[More Information Needed]

#### Software

[More Information Needed]

## Citation [optional]

<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->

**BibTeX:**

[More Information Needed]

**APA:**

[More Information Needed]

## Glossary [optional]

<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->

[More Information Needed]

## More Information [optional]

[More Information Needed]

## Model Card Authors [optional]

[More Information Needed]

## Model Card Contact

[More Information Needed]

### Framework versions

- PEFT 0.15.2
adapter_config.json
ADDED
@@ -0,0 +1,43 @@
{
  "alpha_pattern": {},
  "auto_mapping": null,
  "base_model_name_or_path": "mistralai/Mathstral-7B-v0.1",
  "bias": "none",
  "corda_config": null,
  "eva_config": null,
  "exclude_modules": null,
  "fan_in_fan_out": false,
  "inference_mode": true,
  "init_lora_weights": true,
  "layer_replication": null,
  "layers_pattern": null,
  "layers_to_transform": null,
  "loftq_config": {},
  "lora_alpha": 64,
  "lora_bias": false,
  "lora_dropout": 0.05,
  "megatron_config": null,
  "megatron_core": "megatron.core",
  "modules_to_save": [
    "score",
    "classifier",
    "score"
  ],
  "peft_type": "LORA",
  "r": 64,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
    "gate_proj",
    "v_proj",
    "k_proj",
    "down_proj",
    "q_proj",
    "o_proj",
    "up_proj"
  ],
  "task_type": "SEQ_CLS",
  "trainable_token_indices": null,
  "use_dora": false,
  "use_rslora": false
}
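The adapter config can be inspected programmatically; a small sketch, assuming the file sits in a local `MAP_EXP_24` directory:

```python
from peft import PeftConfig

cfg = PeftConfig.from_pretrained("MAP_EXP_24")  # returns a LoraConfig here
print(cfg.peft_type, cfg.task_type)             # LORA, SEQ_CLS
print(cfg.r, cfg.lora_alpha, cfg.lora_dropout)  # 64, 64, 0.05
print(sorted(cfg.target_modules))               # the seven attention/MLP projections
```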
adapter_model.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8b6aaf18afcffaeb4a3900c7d343b8564451f280ac400e8f758f6f7265b7969e
size 672214232
chat_template.jinja
ADDED
@@ -0,0 +1,24 @@
{%- if messages[0]['role'] == 'system' %}
    {%- set system_message = messages[0]['content'] %}
    {%- set loop_messages = messages[1:] %}
{%- else %}
    {%- set loop_messages = messages %}
{%- endif %}

{{- bos_token }}
{%- for message in loop_messages %}
    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
        {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}
    {%- endif %}
    {%- if message['role'] == 'user' %}
        {%- if loop.last and system_message is defined %}
            {{- '[INST] ' + system_message + '\n\n' + message['content'] + '[/INST]' }}
        {%- else %}
            {{- '[INST] ' + message['content'] + '[/INST]' }}
        {%- endif %}
    {%- elif message['role'] == 'assistant' %}
        {{- ' ' + message['content'] + eos_token}}
    {%- else %}
        {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}
    {%- endif %}
{%- endfor %}
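This is the stock Mistral instruct template; the training script builds plain classification text and does not use it, so it simply ships alongside the tokenizer. A minimal rendering sketch, assuming the tokenizer files are available locally under `MAP_EXP_24`:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("MAP_EXP_24")
messages = [
    {"role": "system", "content": "You are a math tutor."},
    {"role": "user", "content": "Is 3/6 equal to 1/2?"},
]
# The system message is folded into the last user turn, per the template above
print(tok.apply_chat_template(messages, tokenize=False))
# <s>[INST] You are a math tutor.\n\nIs 3/6 equal to 1/2?[/INST]
```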
special_tokens_map.json
ADDED
@@ -0,0 +1,24 @@
{
  "bos_token": {
    "content": "<s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "</s>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "pad_token": "</s>",
  "unk_token": {
    "content": "<unk>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
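Note that `pad_token` reuses the EOS string `</s>`, mirroring the training script's `tokenizer.pad_token = tokenizer.eos_token`. A quick check, again assuming local files under `MAP_EXP_24`:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("MAP_EXP_24")
assert tok.pad_token == tok.eos_token == "</s>"
assert tok.pad_token_id == tok.eos_token_id
```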
tokenizer.json
ADDED
The diff for this file is too large to render.
tokenizer.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:59f95e28944c062244741268596badc900df86c7f5ded05088d2da22a7379e06
size 587583
tokenizer_config.json
ADDED
The diff for this file is too large to render.
training_args.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8cba18f58eb5deabb738d167109f181e9741cd85a99a36981c296f18334414ac
size 5304
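`training_args.bin` is the pickled `TrainingArguments` object the Trainer saves alongside the model. A sketch for inspecting it (requires a compatible transformers install; on torch >= 2.6, `weights_only=False` is needed to unpickle non-tensor objects):

```python
import torch

args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.max_steps, args.per_device_train_batch_size)
# Expected from MAP_EXP_24.py: 0.0001 2500 16
```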