Spaces:

mohsin416
/

CodeInsight

Running

App Files Files Community

GitHub Actions committed on Nov 2, 2025

Commit

c2af030

1 Parent(s): 4c2aafc

Sync from GitHub Actions

Browse files

Files changed (28) hide show

.dockerignore +0 -0
.gitattributes +0 -35
.gitignore +197 -0
Dockerfile +12 -5
app.py +54 -0
codeInsight/evaluation/__init__.py +0 -0
codeInsight/evaluation/evaluator.py +6 -0
codeInsight/exception/__init__.py +23 -0
codeInsight/inference/__init__.py +0 -0
codeInsight/inference/code_assistant.py +72 -0
codeInsight/logger/__init__.py +23 -0
codeInsight/models/__init__.py +0 -0
codeInsight/models/model_loader.py +45 -0
codeInsight/models/peft_trainer.py +134 -0
codeInsight/pipeline/__init__.py +0 -0
codeInsight/pipeline/prediction_pipeline.py +27 -0
codeInsight/pipeline/training_pipeline.py +106 -0
codeInsight/safety/__init__.py +0 -0
codeInsight/safety/safety_checker.py +38 -0
codeInsight/training/__init__.py +0 -0
codeInsight/training/train.py +25 -0
codeInsight/utils/__init__.py +0 -0
codeInsight/utils/config.py +20 -0
config/model.yaml +68 -0
requirements.txt +10 -2
setup.py +9 -0
src/streamlit_app.py +0 -40
template.py +56 -0

.dockerignore ADDED Viewed

File without changes

.gitattributes DELETED Viewed

@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,197 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+data/
+monitoring/
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+super_GPT.py
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#uv.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
+.pdm.toml
+.pdm-python
+.pdm-build/
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+# Abstra
+# Abstra is an AI-powered process automation framework.
+# Ignore directories containing user credentials, local state, and settings.
+# Learn more at https://abstra.io/docs
+.abstra/
+# Visual Studio Code
+#  Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
+#  that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#  and can be added to the global gitignore or merged into this file. However, if you prefer,
+#  you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+# Ruff stuff:
+.ruff_cache/
+# PyPI configuration file
+.pypirc
+# Cursor
+#  Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
+#  exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
+#  refer to https://docs.cursor.com/context/ignore-files
+.cursorignore
+.cursorindexingignore

Dockerfile CHANGED Viewed

@@ -1,20 +1,27 @@
-FROM python:3.13.5-slim
 WORKDIR /app
 RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
     git \
     && rm -rf /var/lib/apt/lists/*
-COPY requirements.txt ./
-COPY src/ ./src/
-RUN pip3 install -r requirements.txt
 EXPOSE 8501
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]

+FROM python:3.12-slim
 WORKDIR /app
+COPY requirements.txt .
+RUN pip install --upgrade pip
+RUN pip install -r requirements.txt
 RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
     git \
     && rm -rf /var/lib/apt/lists/*
+RUN mkdir -p /app/.cache \
+    && chmod -R 777 /app/.cache
+ENV HF_HOME=/app/.cache
+ENV TRANSFORMERS_CACHE=/app/.cache
+COPY . .
 EXPOSE 8501
 HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
+ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]

app.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import streamlit as st
+from codeInsight.pipeline.prediction_pipeline import PredictionPipeline
+from codeInsight.logger import logging
+# Configure the browser tab title, page icon, and wide layout for the app.
+st.set_page_config(
+    page_title="CodeInsight Assistant",
+    page_icon="🤖",
+    layout="wide"
+)
+@st.cache_resource
+def load_pipeline():
+    try:
+        pipeline = PredictionPipeline()
+        return pipeline
+    except Exception as e:
+        logging.error("Failed to load pipeline in Streamlit app")
+        st.error(f"Failed to load model pipeline: {e}")
+        return None
+pipeline = load_pipeline()
+st.title("🤖 CodeInsight Assistant")
+st.caption("Your fine-tuned CodeLlama-7b model, ready to help with Python.")
+if pipeline:
+    if "message" not in st.session_state:
+        st.session_state.messages = [
+            {"role": "assistant", "content": "Hello! How can I help you with Python programming today?"}
+        ]
+        for message in st.session_state.message:
+            with st.chat_message(message["role"]):
+                st.markdown(message["contant"])
+        prompt = st.chat_input("Ask me to write python code")
+        if prompt:
+            st.session_state.messages.append({"role": "user", "content": prompt})
+            with st.chat_message("user"):
+                st.markdown(prompt)
+            with st.chat_message("assistant"):
+                with st.spinner("Thinking..."):
+                    response = pipeline.predict(prompt)
+                    formatted_response = f"```python\n{response}\n```"
+                    st.markdown(formatted_response)
+            st.session_state.messages.append({"role": "assistant", "content": formatted_response})
+        else:
+            st.error("The prediction pipeline could not be loaded. Please check the logs.")

codeInsight/evaluation/__init__.py ADDED Viewed

File without changes

codeInsight/evaluation/evaluator.py ADDED Viewed

	@@ -0,0 +1,6 @@

+import math
+def compute_metrics(eval_preds):
+    eval_loss = eval_preds.loss
+    perplexity = math.exp(eval_loss) if eval_loss < 20 else float("inf")
+    return {"perplexity": perplexity}

codeInsight/exception/__init__.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import os
+import sys
+def error_message_deatils(error, error_deatil : sys):
+    """Format an error together with the file and line recorded in the active traceback.
+
+    Args:
+        error: The exception (or message) to describe; rendered with ``str()``.
+        error_deatil: An object exposing ``exc_info()`` — callers pass the ``sys`` module.
+
+    Returns:
+        A single-line description containing file name, line number and message.
+    """
+    _, _, exc_tb = error_deatil.exc_info()
+    if exc_tb is None:
+        # No exception is currently being handled, so there is no traceback to
+        # inspect; report placeholders instead of failing with AttributeError.
+        file_name, line_number = "<unknown>", "<unknown>"
+    else:
+        file_name = exc_tb.tb_frame.f_code.co_filename
+        line_number = exc_tb.tb_lineno
+    return "Error occurred python script name [{0}] line number [{1}] error message [{2}]".format(
+        file_name, line_number, str(error)
+    )
+class ExceptionHandle(Exception):
+    def __init__(self, error_message, error_deatil):
+        super().__init__(error_message)
+        self.error_message = error_message_deatils(
+            error_message, error_deatil
+        )
+    def __str__(self):
+        return self.error_message

codeInsight/inference/__init__.py ADDED Viewed

File without changes

codeInsight/inference/code_assistant.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import torch
+import os
+import sys
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from codeInsight.utils.config import load_config
+from codeInsight.exception import ExceptionHandle
+from codeInsight.logger import logging
+class CodeAssistant:
+    def __init__(self, config_path="config/model.yaml"):
+        try:
+            self.config = load_config(config_path)
+            self.dataset_config = self.config['dataset']
+            model_repo = self.config['model']['final_model_repo']
+            logging.info(f"Initializing CodeAssistant with model from: {model_repo}")
+            self.model = AutoModelForCausalLM.from_pretrained(
+                model_repo,
+                device_map="auto",
+                torch_dtype=torch.bfloat16,
+                trust_remote_code=False
+            )
+            self.tokenizer = AutoTokenizer.from_pretrained(
+                model_repo
+            )
+            self.model.eval()
+            self.model.config.use_cache = True
+            logging.info("CodeAssistant initialized successfully.")
+        except Exception as e:
+            logging.error("Failed to initialize CodeAssistant")
+            raise ExceptionHandle(e, sys)
+    def _formet_prompt(self, prompt : str) -> str:
+        return f"{self.dataset_config['SYSTEM_PROMPT']}{self.dataset_config['USER_TOKEN']}{prompt}{self.dataset_config['END_TOKEN']}\n\n{self.dataset_config['ASSISTANT_TOKEN']}"
+    def generate(self, prompt : str, max_length : int = 512, temperature: float = 0.1, top_p : float =0.80) -> str:
+        try:
+            input_text = self._formet_prompt(prompt)
+            inputs = self.tokenizer(
+                input_text,
+                return_tensors="pt",
+            ).to(self.model.device)
+            with torch.no_grad():
+                outputs = self.model.generate(
+                    **inputs,
+                    max_new_tokens=max_length,
+                    temperature=temperature,
+                    top_p=top_p,
+                    do_sample=True,
+                    eos_token_id=self.tokenizer.convert_tokens_to_ids(self.dataset_config['END_TOKEN']),
+                    pad_token_id=self.tokenizer.eos_token_id
+                )
+            generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+            if self.dataset_config['ASSISTANT_TOKEN'] in generated_text:
+                generated_code = generated_text.split(self.dataset_config['ASSISTANT_TOKEN'])[1].strip()
+                if self.dataset_config['END_TOKEN'] in generated_code:
+                    generated_code = generated_code.split(self.dataset_config['END_TOKEN'])[0].strip()
+            else:
+                generated_code = generated_text
+            logging.info("Response generated successfully.")
+            return generated_code
+        except Exception as e:
+            logging.error("Failed during code generation")
+            raise ExceptionHandle(e, sys)

codeInsight/logger/__init__.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import logging
+import os
+from datetime import datetime
+# Logging bootstrap: everything below runs at import time.
+# Log directory, given as a relative path.
+# NOTE(review): `dir` shadows the builtin of the same name.
+dir = "tmp/logs"
+os.makedirs(dir, exist_ok=True)
+# The log file is named after the timestamp at which this module is executed.
+LOG_FILE = f"{datetime.now().strftime('%m_%d_%Y_%H_%M_%S')}.log"
+LOF_PATH = os.path.join(dir, LOG_FILE)
+# Records are sent both to the timestamped file and to the console stream.
+file_handler = logging.FileHandler(LOF_PATH)
+console_handler = logging.StreamHandler()
+# Both handlers share one format: timestamp, logger name, level, message.
+log_format = "[ %(asctime)s ] %(name)s - %(levelname)s - %(message)s"
+formetter = logging.Formatter(log_format)
+file_handler.setFormatter(formetter)
+console_handler.setFormatter(formetter)
+# Configure the root logger at DEBUG level with the two handlers above.
+logging.basicConfig(
+    level=logging.DEBUG,
+    handlers=[file_handler, console_handler],
+)

codeInsight/models/__init__.py ADDED Viewed

File without changes

codeInsight/models/model_loader.py ADDED Viewed

	@@ -0,0 +1,45 @@

+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
+from peft import prepare_model_for_kbit_training
+from codeInsight.logger import logging
+from codeInsight.exception import ExceptionHandle
+import sys
+def load_model_and_tokenizer(config : dict) -> tuple[AutoModelForCausalLM, AutoTokenizer]:
+    try:
+        model_id = config['base_model_id']
+        quant_config = config['quantization']
+        logging.info(f"Loading base model: {model_id}")
+        bnb_config = BitsAndBytesConfig(
+            load_in_4bit=quant_config['load_in_4bit'],
+            bnb_4bit_quant_type=quant_config['bnb_4bit_quant_type'],
+            bnb_4bit_compute_dtype=quant_config['bnb_4bit_compute_dtype'],
+            bnb_4bit_use_double_quant=quant_config['bnb_4bit_use_double_quant']
+        )
+        model = AutoModelForCausalLM.from_pretrained(
+            model_id,
+            quantization_config=bnb_config,
+            device_map="auto",
+            trust_remote_code=True,
+            attn_implementation=config['attn_implementation']
+        )
+        model.config.use_cache = False
+        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
+        model.gradient_checkpointing_enable()
+        logging.info("Base model loaded successfully with 4-bit quantization.")
+        tokenizer = AutoTokenizer.from_pretrained(model_id)
+        tokenizer.pad_token = tokenizer.eos_token
+        tokenizer.padding_side = "right"
+        logging.info("Tokenizer loaded successfully.")
+        return model, tokenizer
+    except Exception as e:
+        logging.error("Failed to load model or tokenizer")
+        raise ExceptionHandle(e, sys)

codeInsight/models/peft_trainer.py ADDED Viewed

	@@ -0,0 +1,134 @@

+import sys
+from peft import get_peft_model, LoraConfig
+from trl import SFTTrainer, SFTConfig, DataCollatorForCompletionOnlyLM
+from transformers import EarlyStoppingCallback
+from codeInsight.logger import logging
+from codeInsight.exception import ExceptionHandle
+class ModelTrainer:
+    def __init__(self, model, tokenizer, datasets: dict, config: dict):
+        self.model = model
+        self.tokenizer = tokenizer
+        self.datasets = datasets
+        self.lora_config = config['lora']
+        self.training_config = config['training']
+        self.paths_config = config['paths']
+        self.trainer = self._setup_trainer()
+        logging.info("ModelTrainer initialized.")
+    def _get_target_module(self, model) -> list:
+        try:
+            logging.info('Start Finding LoRA target module')
+            candidates = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+            present = set()
+            for name, module in model.named_modules():
+                for cand in candidates:
+                    if name.endswith(cand):
+                        present.add(cand)
+            return list(present) if present else ["q_proj", "v_proj"]
+        except Exception as e:
+            logging.error(f"Something is wrong here")
+            raise ExceptionHandle(e, sys)
+    def _peft_model_setup(self):
+        try:
+            logging.info('Setting up PEFT LoRA model')
+            lora_config = LoraConfig(
+                r=self.lora_config['r'],
+                lora_alpha=self.lora_config['lora_alpha'],
+                target_modules=self._get_target_module(self.model),
+                lora_dropout=self.lora_config['lora_dropout'],
+                bias=self.lora_config['bias'],
+                task_type=self.lora_config['task_type'],
+                use_rslora=self.lora_config['use_rslora']
+            )
+            peft_model = get_peft_model(self.model, lora_config)
+            logging.info("PEFT model created successfully.")
+            peft_model.print_trainable_parameters()
+            return peft_model
+        except Exception as e:
+            logging.error("Failed to setup PEFT model")
+            raise ExceptionHandle(e, sys)
+    def _get_training_args(self) -> SFTConfig:
+        try:
+            return SFTConfig(
+                output_dir=self.paths_config['output_dir'],
+                per_device_train_batch_size=self.training_config['per_device_train_batch_size'],
+                per_device_eval_batch_siz=self.training_config['per_device_eval_batch_size'],
+                gradient_accumulation_steps=self.training_config['gradient_accumulation_steps'],
+                num_train_epochs=self.training_config['num_train_epochs'],
+                learning_rate=self.training_config['learning_rate'],
+                warmup_ratio=self.training_config['warmup_ratio'],
+                warmup_steps=self.training_config['warmup_steps'],
+                bf16=self.training_config['bf16'],
+                tf32=self.training_config['tf32'],
+                fp16=self.training_config['fp16'],
+                lr_scheduler_type=self.training_config['lr_scheduler_type'],
+                optim=self.training_config['optim'],
+                gradient_checkpointing=self.training_config['gradient_checkpointing'],
+                gradient_checkpointing_kwargs=self.training_config['gradient_checkpointing_kwargs'],
+                max_grad_norm=self.training_config['max_grad_norm'],
+                weight_decay=self.training_config['weight_decay'],
+                logging_steps=self.training_config['logging_steps'],
+                eval_steps=self.training_config['eval_steps'],
+                save_steps=self.training_config['save_steps'],
+                evaluation_strategy=self.training_config['eval_strategy'],
+                save_strategy=self.training_config['save_strategy'],
+                save_total_limit=self.training_config['save_total_limit'],
+                load_best_model_at_end=self.training_config['load_best_model_at_end'],
+                metric_for_best_model=self.training_config['metric_for_best_model'],
+                greater_is_better=self.training_config['greater_is_better'],
+                prediction_loss_only=self.training_config['prediction_loss_only'],
+                report_to=self.training_config['report_to'],
+                dataloader_num_workers=self.training_config['dataloader_num_workers'],
+                max_seq_length=self.training_config['max_seq_length'],
+                dataset_text_field=self.training_config['dataset_text_field'],
+                label_names=self.training_config['label_names'],
+                neftune_noise_alpha=self.training_config['neftune_noise_alpha']
+            )
+        except Exception as e:
+            logging.error("Failed to create TrainingArguments")
+            raise ExceptionHandle(e, sys)
+    def _data_collator(self):
+        try:
+            return DataCollatorForCompletionOnlyLM(
+                response_template="<|assistant|>",
+                tokenizer=self.tokenizer
+            )
+        except Exception as e:
+            logging.error("Failed to create Data Collator")
+            raise ExceptionHandle(e, sys)
+    def _setup_trainer(self) -> SFTTrainer:
+        logging.info("Initializing SFTTrainer")
+        peft_model = self._peft_model_setup()
+        training_args = self._get_training_args()
+        trainer = SFTTrainer(
+            model=peft_model,
+            train_dataset=self.datasets['train'],
+            eval_dataset=self.datasets['val'],
+            args=training_args,
+            data_collator=self._data_collator(),
+            callbacks=[EarlyStoppingCallback(early_stopping_patience=5, early_stopping_threshold=0.001)],
+        )
+        logging.info("SFTTrainer initialized successfully.")
+        return trainer
+    def save_apater(self):
+        try:
+            adapter_path = self.paths_config['adapter_save_dir']
+            self.trainer.model.save_pretrained(adapter_path)
+            logging.info(f"LoRA adapter saved successfully to {adapter_path}")
+        except Exception as e:
+            logging.error("Failed to save LoRA adapter")
+            raise ExceptionHandle(e, sys)

codeInsight/pipeline/__init__.py ADDED Viewed

File without changes

codeInsight/pipeline/prediction_pipeline.py ADDED Viewed

	@@ -0,0 +1,27 @@

+import sys
+from codeInsight.inference.code_assistant import CodeAssistant
+from codeInsight.safety.safety_checker import SafetyChecker
+from codeInsight.exception import ExceptionHandle
+from codeInsight.logger import logging
+class PredictionPipeline:
+    def __init__(self, config_path : str = "config/model.yaml"):
+        try:
+            self.assistant = CodeAssistant(config_path)
+            self.safety_checker = SafetyChecker()
+            logging.info("Prediction Pipeline initialized successfully.")
+        except Exception as e:
+            logging.error("Failed to initialize PredictionPipeline")
+            raise ExceptionHandle(e, sys)
+    def predict(self, instruction : str) -> str:
+        try:
+            raw_output = self.assistant.generate(instruction)
+            safe_output = self.safety_checker.check_outputs(raw_output)
+            return safe_output
+        except Exception as e:
+            logging.error(f"Prediction failed: {e}")
+            return "An error occurred while processing your request. Please try again."

codeInsight/pipeline/training_pipeline.py ADDED Viewed

	@@ -0,0 +1,106 @@

+import os
+import sys
+import torch
+import wandb
+from peft import PeftModel
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from codeInsight.utils.config import load_config
+from codeInsight.data.dataset_builder import DatasetBuilder
+from codeInsight.models.model_loader import load_model_and_tokenizer
+from codeInsight.models.peft_trainer import ModelTrainer
+from codeInsight.evaluation.evaluator import compute_metrics
+from codeInsight.logger import logging
+from codeInsight.exception import ExceptionHandle
+class TrainingPipeline:
+    def __init__(self, config_path: str = "config/model.yaml"):
+        self.config = load_config(config_path)
+        self.wandb_key = os.getenv('WANDB_API_KEY')
+        self.gf_token = os.getenv('HF_TOKEN')
+    def _wandb_login(self):
+        try:
+            if self.wandb_key:
+                wandb.login(key=self.wandb_key)
+                wandb.init(project=self.config['wandb']['project_name'])
+                logging.info("WandB login successful.")
+            else:
+                raise ValueError('WANDB_API_KEY is not set')
+        except Exception as e:
+            logging.error("Failed to login to WandB")
+            raise ExceptionHandle(e, sys)
+    def run_training(self):
+        try:
+            if self.config['training']['report_to'] == "wandb":
+                self._wandb_login()
+            model, tokenizer = load_model_and_tokenizer(self.config['model'])
+            dataset_builder = DatasetBuilder(self.config, tokenizer)
+            tokenized_datasets = dataset_builder.get_dataset()
+            trainer = ModelTrainer(
+                model=model,
+                tokenizer=tokenizer,
+                datasets=tokenized_datasets,
+                config=self.config,
+            )
+            trainer.train()
+            logging.info("Model Training Successfull")
+            trainer.save_apater()
+        except Exception as e:
+            logging.error(f"Training pipeline failed: {e}")
+            raise ExceptionHandle(e, sys)
+    def run_merge_and_push(self):
+        try:
+            model_config = self.config['model']
+            paths_config = self.config['paths']
+            logging.info("Starting model merge and push process")
+            torch.cuda.empty_cache()
+            logging.info('Cleaned GPU cache')
+            base_model = AutoModelForCausalLM.from_pretrained(
+                model_config['base_model_id'],
+                return_dict=True,
+                torch_dtype=torch.bfloat16,
+                device_map="auto",
+            )
+            tokenizer = AutoTokenizer.from_pretrained(model_config['base_model_id'])
+            tokenizer.pad_token = tokenizer.eos_token
+            logging.info(f"Loading adapter from {paths_config['adapter_save_dir']}")
+            model_to_merge = PeftModel.from_pretrained(
+                base_model,
+                paths_config['adapter_save_dir']
+            )
+            merged_model = model_to_merge.merge_and_unload()
+            logging.info("Merge complete.")
+            repo_id = paths_config['final_model_repo']
+            logging.info(f"Pushing merged model and tokenizer to Hugging Face Hub: {repo_id}")
+            merged_model.push_to_hub(
+                repo_id,
+                token=self.hf_token,
+                check_pr=False
+            )
+            tokenizer.push_to_hub(
+                repo_id,
+                token=self.hf_token,
+                check_pr=False
+            )
+            logging.info("Successfully pushed model and tokenizer to the Hub.")
+        except ExceptionHandle as e:
+            logging.error("Failed to merge and push model")
+            raise ExceptionHandle(e, sys)

codeInsight/safety/__init__.py ADDED Viewed

File without changes

codeInsight/safety/safety_checker.py ADDED Viewed

	@@ -0,0 +1,38 @@

+from codeInsight.logger import logging
+import re
+class SafetyChecker:
+    def __init__(self):
+        logging.info("SafetyChecker initialized.")
+    def check_outputs(self, text : str) -> str:
+        if not text:
+            return "No response Generated"
+        refusal_phrases = ["I cannot", "I am unable", "As an AI model", "I'm sorry"]
+        if any(phrase.lower() in text.lower() for phrase in refusal_phrases):
+            logging.warning(f"Model refusal detected: {text}")
+            return "I'm sorry, but I cannot fulfill that request."
+        bad_word_pattern = r"\b(fuck|shit|bitch|asshole|bastard)\b"
+        if re.search(bad_word_pattern, text, re.IGNORECASE):
+            logging.warning('Bad word detected')
+            return "[Content removed due to inappropriate language]"
+        pii_pattern = [
+            r"\b\d{3}-\d{2}-\d{4}\b",
+            r"\b\d{16}\b",
+            r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b"
+        ]
+        for pattern in pii_pattern:
+            if re.search(pattern, text):
+                logging.warning("PII detected in model output.")
+                return "[Sensitive information removed for privacy]"
+        hallucination_markers = ["According to a study", "In recent news", "As per research"]
+        if any(marker.lower() in text.lower() for marker in hallucination_markers):
+            logging.info("Potential hallucination detected.")
+        logging.info("Output passed all safety checks.")
+        return text

codeInsight/training/__init__.py ADDED Viewed

File without changes

codeInsight/training/train.py ADDED Viewed

	@@ -0,0 +1,25 @@

+import sys
+from codeInsight.pipeline.training_pipeline import TrainingPipeline
+from codeInsight.exception import ExceptionHandle
+from codeInsight.logger import logging
+def start_training():
+    try:
+        logging.info("Initializing Training Pipeline...")
+        pipeline = TrainingPipeline()
+        logging.info("Starting Model Training")
+        pipeline.run_training()
+        logging.info("Start Model Merge and Push")
+        pipeline.run_merge_and_push()
+        logging.info("Pipeline Complet")
+    except Exception as e:
+        logging.error("Pipeline failed")
+        raise ExceptionHandle(e, sys)
+# Run the full training flow when this module is executed as a script.
+if __name__ == "__main__":
+    start_training()

codeInsight/utils/__init__.py ADDED Viewed

File without changes

codeInsight/utils/config.py ADDED Viewed

	@@ -0,0 +1,20 @@

+from codeInsight.exception import ExceptionHandle
+from codeInsight.logger import logging
+from pathlib import Path
+import sys
+import yaml
+def load_config(config_path : Path = Path("config/model.yaml")) -> dict:
+    try:
+        with open(config_path, "r") as yaml_file:
+            config = yaml.safe_load(yaml_file)
+            logging.info(f"Config loaded from {config_path}")
+        return config
+    except FileNotFoundError:
+        logging.error(f"Configuration file not found at: {config_path}")
+        raise ExceptionHandle(e, sys)
+    except Exception as e:
+        logging.error(f"Error loading config from {config_path}")
+        raise ExceptionHandle(e, sys)

config/model.yaml ADDED Viewed

	@@ -0,0 +1,68 @@

+dataset:
+  name: "mohsin416/Python-Alpaca-5k"
+  shuffle_seed: 42
+  SYSTEM_PROMPT: "<|system|>\nYou are a senior Python developer. Provide clear, correct, well-commented code.<|end|>\n\n"
+  USER_TOKEN: "<|user|>\n"
+  ASSISTANT_TOKEN: "<|assistant|>\n"
+  END_TOKEN: "<|end|>"
+model:
+  base_model_id: "microsoft/Phi-3-mini-128k-instruct"
+  attn_implementation: "flash_attention_2"
+  quantization:
+    load_in_4bit: True
+    bnb_4bit_quant_type: "nf4"
+    bnb_4bit_compute_dtype: "bfloat16"
+    bnb_4bit_use_double_quant: True
+# LoRA adapter settings, read by ModelTrainer when it builds the LoraConfig.
+lora:
+  r: 32
+  # Key must be `lora_alpha`: the trainer looks it up under that exact name.
+  lora_alpha: 32
+  lora_dropout: 0.1
+  # PEFT accepts "none", "all" or "lora_only" here (lowercase).
+  bias: "none"
+  task_type: "CAUSAL_LM"
+  use_rslora: True
+paths:
+  output_dir: "artifacts/outputs"
+  adapter_save_dir: "artifacts/phi3-python-instruct-adapter"
+  final_model_repo: "mohsin416/phi3-python-instruct"
+training:
+  per_device_train_batch_size: 4
+  per_device_eval_batch_size: 4
+  gradient_accumulation_steps: 8
+  num_train_epochs: 2
+  learning_rate: 2.0e-5
+  warmup_ratio: 0.1
+  warmup_steps: 0
+  bf16: True
+  tf32: False
+  fp16: False
+  lr_scheduler_type: "cosine"
+  optim: "paged_adamw_8bit"
+  gradient_checkpointing: True
+  gradient_checkpointing_kwargs: {"use_reentrant": False}
+  max_grad_norm: 1.0
+  weight_decay: 0.01
+  logging_steps: 50
+  eval_steps: 50
+  save_steps: 50
+  eval_strategy: "steps"
+  save_strategy: "steps"
+  save_total_limit: 3
+  load_best_model_at_end: True
+  metric_for_best_model: "eval_loss"
+  greater_is_better: False
+  prediction_loss_only: True
+  report_to: "wandb"
+  dataloader_num_workers: 4
+  dataloader_pin_memory: True
+  max_seq_length: 4096
+  dataset_text_field: "text"
+  label_names: ["labels"]
+  neftune_noise_alpha: 5
+wandb:
+  project_name: "Phi-3-mini-128k-instruct-metrics"

requirements.txt CHANGED Viewed

@@ -1,3 +1,11 @@
-altair
-pandas
 streamlit

+transformers
+datasets
+peft
+torch
+accelerate
+evaluate
+sentencepiece
+trl
+wandb
+pyyaml
 streamlit

setup.py ADDED Viewed

	@@ -0,0 +1,9 @@

+from setuptools import setup, find_packages
+setup(
+    name="codeinsight",
+    author="Md Mohsin",
+    author_email="siam.mohsin2005@gmail.com",
+    version="0.0.1",
+    packages=find_packages(),
+)

src/streamlit_app.py DELETED Viewed

@@ -1,40 +0,0 @@
-import altair as alt
-import numpy as np
-import pandas as pd
-import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

template.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import os
+from pathlib import Path
+project_name = "codeInsight"
+list_of_files = [
+    f"{project_name}/models/__init__.py",
+    f"{project_name}/models/model_loader.py",
+    f"{project_name}/models/peft_trainer.py",
+    f"{project_name}/training/__init__.py",
+    f"{project_name}/training/train.py",
+    f"{project_name}/evaluation/__init__.py",
+    f"{project_name}/evaluation/evaluator.py",
+    f"{project_name}/inference/__init__.py",
+    f"{project_name}/inference/code_assistant.py",
+    f"{project_name}/data/__init__.py",
+    f"{project_name}/data/dataset_builder.py",
+    f"{project_name}/utils/__init__.py",
+    f"{project_name}/utils/config.py",
+    f"{project_name}/safety/__init__.py",
+    f"{project_name}/safety/safety_checker.py",
+    f"{project_name}/exception/__init__.py",
+    f"{project_name}/logger/__init__.py",
+    f"{project_name}/pipeline/__init__.py",
+    f"{project_name}/pipeline/training_pipeline.py",
+    f"{project_name}/pipeline/prediction_pipeline.py",
+    "app.py",
+    "Demo.py",
+    "requirements.txt",
+    "Dockerfile",
+    "setup.py",
+    ".gitignore",
+    "README.md",
+    "config/model.yaml",
+]
+for filepath in list_of_files:
+    filepath = Path(filepath)
+    filedir, filename = os.path.split(filepath)
+    if filedir != "":
+        os.makedirs(filedir, exist_ok=True)
+    if not filepath.exists() or filepath.stat().st_size == 0:
+        filepath.touch()
+    else:
+        print(f'{filename} is already present in {filedir} and has some content. Skipping creation.')