GitHub Actions committed on
Commit
c2af030
·
1 Parent(s): 4c2aafc

Sync from GitHub Actions

Browse files
.dockerignore ADDED
File without changes
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+ data/
9
+ monitoring/
10
+
11
+ # Distribution / packaging
12
+ .Python
13
+ build/
14
+ develop-eggs/
15
+ dist/
16
+ downloads/
17
+ eggs/
18
+ .eggs/
19
+ lib/
20
+ lib64/
21
+ parts/
22
+ sdist/
23
+ var/
24
+ wheels/
25
+ share/python-wheels/
26
+ *.egg-info/
27
+ .installed.cfg
28
+ *.egg
29
+ MANIFEST
30
+
31
+ # PyInstaller
32
+ # Usually these files are written by a python script from a template
33
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
34
+ *.manifest
35
+ *.spec
36
+
37
+ # Installer logs
38
+ pip-log.txt
39
+ pip-delete-this-directory.txt
40
+
41
+ # Unit test / coverage reports
42
+ htmlcov/
43
+ .tox/
44
+ .nox/
45
+ .coverage
46
+ .coverage.*
47
+ .cache
48
+ nosetests.xml
49
+ coverage.xml
50
+ *.cover
51
+ *.py,cover
52
+ .hypothesis/
53
+ .pytest_cache/
54
+ cover/
55
+ super_GPT.py
56
+
57
+ # Translations
58
+ *.mo
59
+ *.pot
60
+
61
+ # Django stuff:
62
+ *.log
63
+ local_settings.py
64
+ db.sqlite3
65
+ db.sqlite3-journal
66
+
67
+ # Flask stuff:
68
+ instance/
69
+ .webassets-cache
70
+
71
+ # Scrapy stuff:
72
+ .scrapy
73
+
74
+ # Sphinx documentation
75
+ docs/_build/
76
+
77
+ # PyBuilder
78
+ .pybuilder/
79
+ target/
80
+
81
+ # Jupyter Notebook
82
+ .ipynb_checkpoints
83
+
84
+ # IPython
85
+ profile_default/
86
+ ipython_config.py
87
+
88
+ # pyenv
89
+ # For a library or package, you might want to ignore these files since the code is
90
+ # intended to run in multiple environments; otherwise, check them in:
91
+ # .python-version
92
+
93
+ # pipenv
94
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
96
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
97
+ # install all needed dependencies.
98
+ #Pipfile.lock
99
+
100
+ # UV
101
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
102
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
103
+ # commonly ignored for libraries.
104
+ #uv.lock
105
+
106
+ # poetry
107
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
108
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
109
+ # commonly ignored for libraries.
110
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
111
+ #poetry.lock
112
+
113
+ # pdm
114
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
115
+ #pdm.lock
116
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
117
+ # in version control.
118
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
119
+ .pdm.toml
120
+ .pdm-python
121
+ .pdm-build/
122
+
123
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
124
+ __pypackages__/
125
+
126
+ # Celery stuff
127
+ celerybeat-schedule
128
+ celerybeat.pid
129
+
130
+ # SageMath parsed files
131
+ *.sage.py
132
+
133
+ # Environments
134
+ .env
135
+ .venv
136
+ env/
137
+ venv/
138
+ ENV/
139
+ env.bak/
140
+ venv.bak/
141
+
142
+ # Spyder project settings
143
+ .spyderproject
144
+ .spyproject
145
+
146
+ # Rope project settings
147
+ .ropeproject
148
+
149
+ # mkdocs documentation
150
+ /site
151
+
152
+ # mypy
153
+ .mypy_cache/
154
+ .dmypy.json
155
+ dmypy.json
156
+
157
+ # Pyre type checker
158
+ .pyre/
159
+
160
+ # pytype static type analyzer
161
+ .pytype/
162
+
163
+ # Cython debug symbols
164
+ cython_debug/
165
+
166
+ # PyCharm
167
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
168
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
169
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
170
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
171
+ #.idea/
172
+
173
+ # Abstra
174
+ # Abstra is an AI-powered process automation framework.
175
+ # Ignore directories containing user credentials, local state, and settings.
176
+ # Learn more at https://abstra.io/docs
177
+ .abstra/
178
+
179
+ # Visual Studio Code
180
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
181
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
182
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
183
+ # you could uncomment the following to ignore the enitre vscode folder
184
+ # .vscode/
185
+
186
+ # Ruff stuff:
187
+ .ruff_cache/
188
+
189
+ # PyPI configuration file
190
+ .pypirc
191
+
192
+ # Cursor
193
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
194
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
195
+ # refer to https://docs.cursor.com/context/ignore-files
196
+ .cursorignore
197
+ .cursorindexingignore
Dockerfile CHANGED
@@ -1,20 +1,27 @@
1
- FROM python:3.13.5-slim
2
 
3
  WORKDIR /app
4
 
 
 
 
 
5
  RUN apt-get update && apt-get install -y \
6
  build-essential \
7
  curl \
8
  git \
9
  && rm -rf /var/lib/apt/lists/*
10
 
11
- COPY requirements.txt ./
12
- COPY src/ ./src/
 
 
 
13
 
14
- RUN pip3 install -r requirements.txt
15
 
16
  EXPOSE 8501
17
 
18
  HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
19
 
20
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
1
FROM python:3.12-slim

WORKDIR /app

# BUG FIX (ordering): system build tools must be installed BEFORE pip runs.
# requirements.txt pulls in packages that may compile native extensions, so
# build-essential has to exist when `pip install -r requirements.txt` executes.
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*

# Copy only requirements first so the dependency layer is cached across code edits.
COPY requirements.txt .
RUN pip install --upgrade pip
RUN pip install -r requirements.txt

# Writable Hugging Face cache (containers often run as a non-root user).
RUN mkdir -p /app/.cache \
    && chmod -R 777 /app/.cache

ENV HF_HOME=/app/.cache
ENV TRANSFORMERS_CACHE=/app/.cache

COPY . .

EXPOSE 8501

HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0"]
app.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st
from codeInsight.pipeline.prediction_pipeline import PredictionPipeline
from codeInsight.logger import logging

st.set_page_config(
    page_title="CodeInsight Assistant",
    page_icon="🤖",
    layout="wide"
)


@st.cache_resource
def load_pipeline():
    """Build the PredictionPipeline once per server process (Streamlit caches it).

    Returns the pipeline, or None when construction fails, so the UI can show
    a friendly error instead of crashing.
    """
    try:
        return PredictionPipeline()
    except Exception as e:
        logging.error("Failed to load pipeline in Streamlit app")
        st.error(f"Failed to load model pipeline: {e}")
        return None


pipeline = load_pipeline()

st.title("🤖 CodeInsight Assistant")
st.caption("Your fine-tuned CodeLlama-7b model, ready to help with Python.")

if pipeline:
    # BUG FIX: the original checked for the key "message" but stored under
    # "messages", then iterated st.session_state.message and indexed
    # message["contant"] -- all three failed at runtime. A single key,
    # "messages", is now used consistently.
    if "messages" not in st.session_state:
        st.session_state.messages = [
            {"role": "assistant", "content": "Hello! How can I help you with Python programming today?"}
        ]

    # Replay the conversation so far.
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    prompt = st.chat_input("Ask me to write python code")

    if prompt:
        st.session_state.messages.append({"role": "user", "content": prompt})
        with st.chat_message("user"):
            st.markdown(prompt)

        with st.chat_message("assistant"):
            with st.spinner("Thinking..."):
                response = pipeline.predict(prompt)

            # Render model output as a fenced Python code block.
            formatted_response = f"```python\n{response}\n```"
            st.markdown(formatted_response)

        st.session_state.messages.append({"role": "assistant", "content": formatted_response})

else:
    st.error("The prediction pipeline could not be loaded. Please check the logs.")
codeInsight/evaluation/__init__.py ADDED
File without changes
codeInsight/evaluation/evaluator.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
import math


def compute_metrics(eval_preds):
    """Turn an evaluation loss into a perplexity metric.

    Perplexity is exp(loss); a loss of 20 or more would produce a uselessly
    huge value, so it is reported as infinity instead.
    """
    loss = eval_preds.loss
    if loss < 20:
        perplexity = math.exp(loss)
    else:
        perplexity = float("inf")
    return {"perplexity": perplexity}
codeInsight/exception/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys


def error_message_deatils(error, error_deatil: sys):
    """Format an error message with the originating script name and line number.

    Parameters:
        error: the exception (or message) being reported.
        error_deatil: the ``sys`` module, used to fetch the active traceback.

    NOTE(review): the misspelled names are kept deliberately -- other modules
    in this package call ``error_message_deatils`` / pass ``error_deatil`` as-is.
    """
    _, _, exc_tab = error_deatil.exc_info()
    # BUG FIX: outside an active ``except`` block, exc_info() returns
    # (None, None, None); the original then crashed with AttributeError on
    # ``exc_tab.tb_frame``, masking the real error. Fall back to a
    # location-free message instead.
    if exc_tab is None:
        return "Error occurred error message [{0}]".format(str(error))

    file_name = exc_tab.tb_frame.f_code.co_filename
    error_message = "Error occurred python script name [{0}] line number [{1}] error message [{2}]".format(
        file_name, exc_tab.tb_lineno, str(error)
    )
    return error_message


class ExceptionHandle(Exception):
    """Project-wide exception wrapper that records where the error happened."""

    def __init__(self, error_message, error_deatil):
        super().__init__(error_message)

        # Resolve the detailed (file + line) message once at construction time.
        self.error_message = error_message_deatils(
            error_message, error_deatil
        )

    def __str__(self):
        return self.error_message
codeInsight/inference/__init__.py ADDED
File without changes
codeInsight/inference/code_assistant.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
import os
import sys
from transformers import AutoModelForCausalLM, AutoTokenizer
from codeInsight.utils.config import load_config
from codeInsight.exception import ExceptionHandle
from codeInsight.logger import logging


class CodeAssistant:
    """Loads the merged fine-tuned model and generates code for user prompts."""

    def __init__(self, config_path="config/model.yaml"):
        """Load model and tokenizer from the repo named in the config file.

        Raises:
            ExceptionHandle: if the config or model cannot be loaded.
        """
        try:
            self.config = load_config(config_path)
            self.dataset_config = self.config['dataset']
            model_repo = self.config['model']['final_model_repo']
            logging.info(f"Initializing CodeAssistant with model from: {model_repo}")

            self.model = AutoModelForCausalLM.from_pretrained(
                model_repo,
                device_map="auto",
                torch_dtype=torch.bfloat16,
                trust_remote_code=False
            )

            self.tokenizer = AutoTokenizer.from_pretrained(
                model_repo
            )
            # Inference-only configuration: eval mode plus KV cache enabled.
            self.model.eval()
            self.model.config.use_cache = True

            logging.info("CodeAssistant initialized successfully.")

        except Exception as e:
            logging.error("Failed to initialize CodeAssistant")
            raise ExceptionHandle(e, sys)

    def _format_prompt(self, prompt: str) -> str:
        # Renamed from the misspelled ``_formet_prompt`` (private helper, no
        # external callers). Wraps the user text in the chat-template tokens
        # declared in the dataset section of the config.
        return f"{self.dataset_config['SYSTEM_PROMPT']}{self.dataset_config['USER_TOKEN']}{prompt}{self.dataset_config['END_TOKEN']}\n\n{self.dataset_config['ASSISTANT_TOKEN']}"

    def generate(self, prompt: str, max_length: int = 512, temperature: float = 0.1, top_p: float = 0.80) -> str:
        """Generate a reply for *prompt* and strip the chat-template markers.

        Returns:
            The assistant's reply text (template tokens removed when present).

        Raises:
            ExceptionHandle: if tokenization or generation fails.
        """
        try:
            input_text = self._format_prompt(prompt)
            inputs = self.tokenizer(
                input_text,
                return_tensors="pt",
            ).to(self.model.device)

            with torch.no_grad():
                outputs = self.model.generate(
                    **inputs,
                    max_new_tokens=max_length,
                    temperature=temperature,
                    top_p=top_p,
                    do_sample=True,
                    eos_token_id=self.tokenizer.convert_tokens_to_ids(self.dataset_config['END_TOKEN']),
                    pad_token_id=self.tokenizer.eos_token_id
                )

            generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

            # Keep only the text after the assistant marker, and cut at the
            # end-of-turn marker if the model emitted one.
            if self.dataset_config['ASSISTANT_TOKEN'] in generated_text:
                generated_code = generated_text.split(self.dataset_config['ASSISTANT_TOKEN'])[1].strip()
                if self.dataset_config['END_TOKEN'] in generated_code:
                    generated_code = generated_code.split(self.dataset_config['END_TOKEN'])[0].strip()
            else:
                generated_code = generated_text

            logging.info("Response generated successfully.")
            return generated_code

        except Exception as e:
            logging.error("Failed during code generation")
            raise ExceptionHandle(e, sys)
codeInsight/logger/__init__.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from datetime import datetime
4
+
5
+ dir = "tmp/logs"
6
+ os.makedirs(dir, exist_ok=True)
7
+
8
+ LOG_FILE = f"{datetime.now().strftime('%m_%d_%Y_%H_%M_%S')}.log"
9
+ LOF_PATH = os.path.join(dir, LOG_FILE)
10
+
11
+ file_handler = logging.FileHandler(LOF_PATH)
12
+ console_handler = logging.StreamHandler()
13
+
14
+ log_format = "[ %(asctime)s ] %(name)s - %(levelname)s - %(message)s"
15
+ formetter = logging.Formatter(log_format)
16
+
17
+ file_handler.setFormatter(formetter)
18
+ console_handler.setFormatter(formetter)
19
+
20
+ logging.basicConfig(
21
+ level=logging.DEBUG,
22
+ handlers=[file_handler, console_handler],
23
+ )
codeInsight/models/__init__.py ADDED
File without changes
codeInsight/models/model_loader.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training
from codeInsight.logger import logging
from codeInsight.exception import ExceptionHandle
import sys

def load_model_and_tokenizer(config : dict) -> tuple[AutoModelForCausalLM, AutoTokenizer]:
    """Load the base model in 4-bit NF4 quantization plus its tokenizer.

    Parameters:
        config: the ``model`` section of the project config; must provide
            ``base_model_id``, ``attn_implementation`` and a nested
            ``quantization`` mapping with the four BitsAndBytes options.

    Returns:
        (model, tokenizer) ready for k-bit (QLoRA-style) training.

    Raises:
        ExceptionHandle: wrapping any failure during loading.

    NOTE: the statement order below matters -- the cache is disabled and the
    model is prepared for k-bit training *before* gradient checkpointing is
    enabled; do not reorder.
    """
    try:
        model_id = config['base_model_id']
        quant_config = config['quantization']
        logging.info(f"Loading base model: {model_id}")

        # 4-bit quantization settings taken verbatim from the config file.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=quant_config['load_in_4bit'],
            bnb_4bit_quant_type=quant_config['bnb_4bit_quant_type'],
            bnb_4bit_compute_dtype=quant_config['bnb_4bit_compute_dtype'],
            bnb_4bit_use_double_quant=quant_config['bnb_4bit_use_double_quant']
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
            attn_implementation=config['attn_implementation']
        )

        # Cache is incompatible with gradient checkpointing during training.
        model.config.use_cache = False
        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)
        model.gradient_checkpointing_enable()

        logging.info("Base model loaded successfully with 4-bit quantization.")

        tokenizer = AutoTokenizer.from_pretrained(model_id)
        # Model ships without a pad token; reuse EOS and pad on the right,
        # the conventional choice for causal-LM fine-tuning.
        tokenizer.pad_token = tokenizer.eos_token
        tokenizer.padding_side = "right"

        logging.info("Tokenizer loaded successfully.")

        return model, tokenizer

    except Exception as e:
        logging.error("Failed to load model or tokenizer")
        raise ExceptionHandle(e, sys)
codeInsight/models/peft_trainer.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
from peft import get_peft_model, LoraConfig
from trl import SFTTrainer, SFTConfig, DataCollatorForCompletionOnlyLM
from transformers import EarlyStoppingCallback
from codeInsight.logger import logging
from codeInsight.exception import ExceptionHandle


class ModelTrainer:
    """Wraps LoRA/PEFT setup and an SFTTrainer for supervised fine-tuning."""

    def __init__(self, model, tokenizer, datasets: dict, config: dict):
        """Store config sections and build the trainer eagerly.

        Parameters:
            model: the (k-bit prepared) base model.
            tokenizer: its tokenizer.
            datasets: mapping with 'train' and 'val' splits.
            config: full project config; 'lora', 'training' and 'paths'
                sections are consumed here.
        """
        self.model = model
        self.tokenizer = tokenizer
        self.datasets = datasets
        self.lora_config = config['lora']
        self.training_config = config['training']
        self.paths_config = config['paths']

        self.trainer = self._setup_trainer()
        logging.info("ModelTrainer initialized.")

    def _get_target_module(self, model) -> list:
        """Return the LoRA-targetable projection layers present in *model*."""
        try:
            logging.info('Start Finding LoRA target module')
            candidates = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
            present = set()
            for name, module in model.named_modules():
                for cand in candidates:
                    if name.endswith(cand):
                        present.add(cand)
            # Fall back to the two projections every attention stack has.
            return list(present) if present else ["q_proj", "v_proj"]

        except Exception as e:
            logging.error("Failed to find LoRA target modules")
            raise ExceptionHandle(e, sys)

    def _peft_model_setup(self):
        """Wrap the base model with LoRA adapters per the 'lora' config."""
        try:
            logging.info('Setting up PEFT LoRA model')
            lora_config = LoraConfig(
                r=self.lora_config['r'],
                lora_alpha=self.lora_config['lora_alpha'],
                target_modules=self._get_target_module(self.model),
                lora_dropout=self.lora_config['lora_dropout'],
                bias=self.lora_config['bias'],
                task_type=self.lora_config['task_type'],
                use_rslora=self.lora_config['use_rslora']
            )

            peft_model = get_peft_model(self.model, lora_config)
            logging.info("PEFT model created successfully.")
            peft_model.print_trainable_parameters()

            return peft_model

        except Exception as e:
            logging.error("Failed to setup PEFT model")
            raise ExceptionHandle(e, sys)

    def _get_training_args(self) -> SFTConfig:
        """Build the SFTConfig from the 'training' and 'paths' sections."""
        try:
            return SFTConfig(
                output_dir=self.paths_config['output_dir'],
                per_device_train_batch_size=self.training_config['per_device_train_batch_size'],
                # BUG FIX: keyword was misspelled ``per_device_eval_batch_siz``,
                # which raises TypeError when SFTConfig is constructed.
                per_device_eval_batch_size=self.training_config['per_device_eval_batch_size'],
                gradient_accumulation_steps=self.training_config['gradient_accumulation_steps'],
                num_train_epochs=self.training_config['num_train_epochs'],
                learning_rate=self.training_config['learning_rate'],
                warmup_ratio=self.training_config['warmup_ratio'],
                warmup_steps=self.training_config['warmup_steps'],
                bf16=self.training_config['bf16'],
                tf32=self.training_config['tf32'],
                fp16=self.training_config['fp16'],
                lr_scheduler_type=self.training_config['lr_scheduler_type'],
                optim=self.training_config['optim'],
                gradient_checkpointing=self.training_config['gradient_checkpointing'],
                gradient_checkpointing_kwargs=self.training_config['gradient_checkpointing_kwargs'],
                max_grad_norm=self.training_config['max_grad_norm'],
                weight_decay=self.training_config['weight_decay'],
                logging_steps=self.training_config['logging_steps'],
                eval_steps=self.training_config['eval_steps'],
                save_steps=self.training_config['save_steps'],
                evaluation_strategy=self.training_config['eval_strategy'],
                save_strategy=self.training_config['save_strategy'],
                save_total_limit=self.training_config['save_total_limit'],
                load_best_model_at_end=self.training_config['load_best_model_at_end'],
                metric_for_best_model=self.training_config['metric_for_best_model'],
                greater_is_better=self.training_config['greater_is_better'],
                prediction_loss_only=self.training_config['prediction_loss_only'],
                report_to=self.training_config['report_to'],
                dataloader_num_workers=self.training_config['dataloader_num_workers'],
                max_seq_length=self.training_config['max_seq_length'],
                dataset_text_field=self.training_config['dataset_text_field'],
                label_names=self.training_config['label_names'],
                neftune_noise_alpha=self.training_config['neftune_noise_alpha']
            )

        except Exception as e:
            logging.error("Failed to create TrainingArguments")
            raise ExceptionHandle(e, sys)

    def _data_collator(self):
        """Collator that masks the prompt so loss is computed on completions only."""
        try:
            return DataCollatorForCompletionOnlyLM(
                response_template="<|assistant|>",
                tokenizer=self.tokenizer
            )
        except Exception as e:
            logging.error("Failed to create Data Collator")
            raise ExceptionHandle(e, sys)

    def _setup_trainer(self) -> SFTTrainer:
        """Assemble the SFTTrainer with early stopping on the eval metric."""
        logging.info("Initializing SFTTrainer")
        peft_model = self._peft_model_setup()
        training_args = self._get_training_args()

        trainer = SFTTrainer(
            model=peft_model,
            train_dataset=self.datasets['train'],
            eval_dataset=self.datasets['val'],
            args=training_args,
            data_collator=self._data_collator(),
            callbacks=[EarlyStoppingCallback(early_stopping_patience=5, early_stopping_threshold=0.001)],
        )
        logging.info("SFTTrainer initialized successfully.")
        return trainer

    def train(self):
        """Run fine-tuning.

        BUG FIX: TrainingPipeline.run_training calls ``ModelTrainer.train()``
        but no such method existed -- delegate to the underlying SFTTrainer.
        """
        try:
            return self.trainer.train()
        except Exception as e:
            logging.error("Training run failed")
            raise ExceptionHandle(e, sys)

    def save_apater(self):
        """Persist the LoRA adapter weights to the configured directory.

        NOTE(review): the misspelled name is kept because the training
        pipeline calls ``save_apater()`` as-is.
        """
        try:
            adapter_path = self.paths_config['adapter_save_dir']
            self.trainer.model.save_pretrained(adapter_path)
            logging.info(f"LoRA adapter saved successfully to {adapter_path}")

        except Exception as e:
            logging.error("Failed to save LoRA adapter")
            raise ExceptionHandle(e, sys)
codeInsight/pipeline/__init__.py ADDED
File without changes
codeInsight/pipeline/prediction_pipeline.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
from codeInsight.inference.code_assistant import CodeAssistant
from codeInsight.safety.safety_checker import SafetyChecker
from codeInsight.exception import ExceptionHandle
from codeInsight.logger import logging


class PredictionPipeline:
    """End-to-end inference: generate code, then pass it through safety checks."""

    def __init__(self, config_path : str = "config/model.yaml"):
        """Construct the assistant and the safety checker from the given config."""
        try:
            self.assistant = CodeAssistant(config_path)
            self.safety_checker = SafetyChecker()
            logging.info("Prediction Pipeline initialized successfully.")

        except Exception as e:
            logging.error("Failed to initialize PredictionPipeline")
            raise ExceptionHandle(e, sys)

    def predict(self, instruction : str) -> str:
        """Generate a safety-filtered response for *instruction*.

        Any failure is logged and converted into a user-facing error string
        rather than being raised to the caller.
        """
        try:
            generated = self.assistant.generate(instruction)
            return self.safety_checker.check_outputs(generated)

        except Exception as e:
            logging.error(f"Prediction failed: {e}")
            return "An error occurred while processing your request. Please try again."
codeInsight/pipeline/training_pipeline.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sys
import torch
import wandb
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from codeInsight.utils.config import load_config
from codeInsight.data.dataset_builder import DatasetBuilder
from codeInsight.models.model_loader import load_model_and_tokenizer
from codeInsight.models.peft_trainer import ModelTrainer
from codeInsight.evaluation.evaluator import compute_metrics
from codeInsight.logger import logging
from codeInsight.exception import ExceptionHandle


class TrainingPipeline:
    """Orchestrates fine-tuning, then merging the LoRA adapter and pushing to the Hub."""

    def __init__(self, config_path: str = "config/model.yaml"):
        self.config = load_config(config_path)
        self.wandb_key = os.getenv('WANDB_API_KEY')
        # BUG FIX: this was stored as ``self.gf_token`` but read later as
        # ``self.hf_token``, so every Hub push crashed with AttributeError.
        self.hf_token = os.getenv('HF_TOKEN')

    def _wandb_login(self):
        """Authenticate with Weights & Biases; requires WANDB_API_KEY in the env."""
        try:
            if self.wandb_key:
                wandb.login(key=self.wandb_key)
                wandb.init(project=self.config['wandb']['project_name'])
                logging.info("WandB login successful.")
            else:
                raise ValueError('WANDB_API_KEY is not set')

        except Exception as e:
            logging.error("Failed to login to WandB")
            raise ExceptionHandle(e, sys)

    def run_training(self):
        """Load model and data, fine-tune with LoRA, and save the adapter."""
        try:
            if self.config['training']['report_to'] == "wandb":
                self._wandb_login()

            model, tokenizer = load_model_and_tokenizer(self.config['model'])

            dataset_builder = DatasetBuilder(self.config, tokenizer)
            tokenized_datasets = dataset_builder.get_dataset()

            trainer = ModelTrainer(
                model=model,
                tokenizer=tokenizer,
                datasets=tokenized_datasets,
                config=self.config,
            )

            trainer.train()
            logging.info("Model Training Successfull")
            trainer.save_apater()

        except Exception as e:
            logging.error(f"Training pipeline failed: {e}")
            raise ExceptionHandle(e, sys)

    def run_merge_and_push(self):
        """Merge the saved adapter into the base model and push both to the Hub."""
        try:
            model_config = self.config['model']
            paths_config = self.config['paths']
            logging.info("Starting model merge and push process")

            # Free VRAM left over from training before loading the full-precision base.
            torch.cuda.empty_cache()
            logging.info('Cleaned GPU cache')

            base_model = AutoModelForCausalLM.from_pretrained(
                model_config['base_model_id'],
                return_dict=True,
                torch_dtype=torch.bfloat16,
                device_map="auto",
            )

            tokenizer = AutoTokenizer.from_pretrained(model_config['base_model_id'])
            tokenizer.pad_token = tokenizer.eos_token

            logging.info(f"Loading adapter from {paths_config['adapter_save_dir']}")
            model_to_merge = PeftModel.from_pretrained(
                base_model,
                paths_config['adapter_save_dir']
            )

            merged_model = model_to_merge.merge_and_unload()
            logging.info("Merge complete.")

            repo_id = paths_config['final_model_repo']

            logging.info(f"Pushing merged model and tokenizer to Hugging Face Hub: {repo_id}")
            merged_model.push_to_hub(
                repo_id,
                token=self.hf_token,
                check_pr=False
            )

            tokenizer.push_to_hub(
                repo_id,
                token=self.hf_token,
                check_pr=False
            )

            logging.info("Successfully pushed model and tokenizer to the Hub.")

        # BUG FIX: the original caught only ``ExceptionHandle``, so any
        # ordinary exception escaped unwrapped; catch Exception like the
        # other pipeline stages.
        except Exception as e:
            logging.error("Failed to merge and push model")
            raise ExceptionHandle(e, sys)
codeInsight/safety/__init__.py ADDED
File without changes
codeInsight/safety/safety_checker.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from codeInsight.logger import logging
import re


class SafetyChecker:
    """Post-generation filter: refusals, profanity, PII, hallucination markers."""

    # Hoisted to class level so the lists are built and the regexes compiled
    # once, instead of on every check_outputs call.
    REFUSAL_PHRASES = ["I cannot", "I am unable", "As an AI model", "I'm sorry"]
    BAD_WORD_RE = re.compile(r"\b(fuck|shit|bitch|asshole|bastard)\b", re.IGNORECASE)
    PII_PATTERNS = [
        re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),                                # SSN-shaped
        re.compile(r"\b\d{16}\b"),                                           # bare 16-digit number
        re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b"),  # email address
    ]
    HALLUCINATION_MARKERS = ["According to a study", "In recent news", "As per research"]

    def __init__(self):
        logging.info("SafetyChecker initialized.")

    def check_outputs(self, text : str) -> str:
        """Return *text* unchanged when safe, otherwise a replacement message."""
        if not text:
            return "No response Generated"

        lowered = text.lower()
        if any(phrase.lower() in lowered for phrase in self.REFUSAL_PHRASES):
            logging.warning(f"Model refusal detected: {text}")
            return "I'm sorry, but I cannot fulfill that request."

        if self.BAD_WORD_RE.search(text):
            logging.warning('Bad word detected')
            return "[Content removed due to inappropriate language]"

        for pattern in self.PII_PATTERNS:
            if pattern.search(text):
                logging.warning("PII detected in model output.")
                return "[Sensitive information removed for privacy]"

        # Flag-only: suspected hallucinations are logged but the text is kept.
        if any(marker.lower() in lowered for marker in self.HALLUCINATION_MARKERS):
            logging.info("Potential hallucination detected.")

        logging.info("Output passed all safety checks.")
        return text
codeInsight/training/__init__.py ADDED
File without changes
codeInsight/training/train.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
from codeInsight.pipeline.training_pipeline import TrainingPipeline
from codeInsight.exception import ExceptionHandle
from codeInsight.logger import logging


def start_training():
    """Run the full workflow: train, then merge the adapter and push to the Hub.

    Raises:
        ExceptionHandle: if any stage of the pipeline fails.
    """
    try:
        logging.info("Initializing Training Pipeline...")
        pipeline = TrainingPipeline()

        logging.info("Starting Model Training")
        pipeline.run_training()

        logging.info("Start Model Merge and Push")
        pipeline.run_merge_and_push()

        # Typo fix: message previously read "Pipeline Complet".
        logging.info("Pipeline Complete")

    except Exception as e:
        logging.error("Pipeline failed")
        raise ExceptionHandle(e, sys)


if __name__ == "__main__":
    start_training()
codeInsight/utils/__init__.py ADDED
File without changes
codeInsight/utils/config.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from codeInsight.exception import ExceptionHandle
from codeInsight.logger import logging
from pathlib import Path
import sys
import yaml


def load_config(config_path : Path = Path("config/model.yaml")) -> dict:
    """Read a YAML config file and return its contents as a dict.

    Raises:
        ExceptionHandle: when the file is missing or cannot be parsed.
    """
    try:
        with open(config_path, "r") as yaml_file:
            config = yaml.safe_load(yaml_file)
        logging.info(f"Config loaded from {config_path}")
        return config

    # BUG FIX: the original wrote ``except FileNotFoundError:`` without
    # ``as e`` and then raised ExceptionHandle(e, sys); a missing file
    # therefore crashed with NameError instead of the intended wrapper.
    except FileNotFoundError as e:
        logging.error(f"Configuration file not found at: {config_path}")
        raise ExceptionHandle(e, sys)

    except Exception as e:
        logging.error(f"Error loading config from {config_path}")
        raise ExceptionHandle(e, sys)
config/model.yaml ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
dataset:
  name: "mohsin416/Python-Alpaca-5k"
  shuffle_seed: 42
  SYSTEM_PROMPT: "<|system|>\nYou are a senior Python developer. Provide clear, correct, well-commented code.<|end|>\n\n"
  USER_TOKEN: "<|user|>\n"
  ASSISTANT_TOKEN: "<|assistant|>\n"
  END_TOKEN: "<|end|>"

model:
  base_model_id: "microsoft/Phi-3-mini-128k-instruct"
  attn_implementation: "flash_attention_2"
  # BUG FIX: CodeAssistant reads config['model']['final_model_repo']; this
  # key previously existed only under ``paths``, so inference crashed.
  final_model_repo: "mohsin416/phi3-python-instruct"

  quantization:
    load_in_4bit: True
    bnb_4bit_quant_type: "nf4"
    bnb_4bit_compute_dtype: "bfloat16"
    bnb_4bit_use_double_quant: True

lora:
  r: 32
  # BUG FIX: key was misspelled ``load_alpha`` but the trainer reads
  # ``lora_alpha``, causing a KeyError during PEFT setup.
  lora_alpha: 32
  lora_dropout: 0.1
  # BUG FIX: peft's LoraConfig expects the lowercase literal "none";
  # the capitalized "None" is not a valid bias option.
  bias: "none"
  task_type: "CAUSAL_LM"
  use_rslora: True

paths:
  output_dir: "artifacts/outputs"
  adapter_save_dir: "artifacts/phi3-python-instruct-adapter"
  final_model_repo: "mohsin416/phi3-python-instruct"

training:
  per_device_train_batch_size: 4
  per_device_eval_batch_size: 4
  gradient_accumulation_steps: 8
  num_train_epochs: 2
  learning_rate: 2.0e-5
  warmup_ratio: 0.1
  warmup_steps: 0
  bf16: True
  tf32: False
  fp16: False
  lr_scheduler_type: "cosine"
  optim: "paged_adamw_8bit"
  gradient_checkpointing: True
  gradient_checkpointing_kwargs: {"use_reentrant": False}
  max_grad_norm: 1.0
  weight_decay: 0.01
  logging_steps: 50
  eval_steps: 50
  save_steps: 50
  eval_strategy: "steps"
  save_strategy: "steps"
  save_total_limit: 3
  load_best_model_at_end: True
  metric_for_best_model: "eval_loss"
  greater_is_better: False
  prediction_loss_only: True
  report_to: "wandb"
  dataloader_num_workers: 4
  dataloader_pin_memory: True
  max_seq_length: 4096
  dataset_text_field: "text"
  label_names: ["labels"]
  neftune_noise_alpha: 5

wandb:
  project_name: "Phi-3-mini-128k-instruct-metrics"
requirements.txt CHANGED
@@ -1,3 +1,11 @@
1
- altair
2
- pandas
 
 
 
 
 
 
 
 
3
  streamlit
 
1
+ transformers
2
+ datasets
3
+ peft
4
+ torch
5
+ accelerate
6
+ evaluate
7
+ sentencepiece
8
+ trl
9
+ wandb
10
+ pyyaml
11
  streamlit
setup.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
"""Minimal packaging script for the *codeinsight* project.

Discovers every package under the repository root and registers the
distribution metadata with setuptools.
"""
from setuptools import find_packages, setup

setup(
    name="codeinsight",
    version="0.0.1",
    author="Md Mohsin",
    author_email="siam.mohsin2005@gmail.com",
    packages=find_packages(),
)
src/streamlit_app.py DELETED
@@ -1,40 +0,0 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
- import streamlit as st
5
-
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
template.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Scaffold the codeInsight project layout.

Running this script creates every directory and file listed in
``list_of_files``.  A file is only touched into existence when it is
missing or empty; files that already contain content are left untouched
and a skip message is printed.
"""
import os
from pathlib import Path

project_name = "codeInsight"

list_of_files = [
    f"{project_name}/models/__init__.py",
    f"{project_name}/models/model_loader.py",
    f"{project_name}/models/peft_trainer.py",

    f"{project_name}/training/__init__.py",
    f"{project_name}/training/train.py",

    f"{project_name}/evaluation/__init__.py",
    f"{project_name}/evaluation/evaluator.py",

    f"{project_name}/inference/__init__.py",
    f"{project_name}/inference/code_assistant.py",

    f"{project_name}/data/__init__.py",
    f"{project_name}/data/dataset_builder.py",

    f"{project_name}/utils/__init__.py",
    f"{project_name}/utils/config.py",

    f"{project_name}/safety/__init__.py",
    f"{project_name}/safety/safety_checker.py",

    f"{project_name}/exception/__init__.py",
    f"{project_name}/logger/__init__.py",

    f"{project_name}/pipeline/__init__.py",
    f"{project_name}/pipeline/training_pipeline.py",
    f"{project_name}/pipeline/prediction_pipeline.py",

    "app.py",
    "Demo.py",
    "requirements.txt",
    "Dockerfile",
    "setup.py",
    ".gitignore",
    "README.md",
    "config/model.yaml",
]

for filepath in list_of_files:
    filepath = Path(filepath)
    filedir, filename = os.path.split(filepath)

    # Ensure the parent directory exists (no-op when it already does).
    if filedir != "":
        os.makedirs(filedir, exist_ok=True)

    # Create the file only when it is absent or empty so existing
    # work is never clobbered.
    if not filepath.exists() or filepath.stat().st_size == 0:
        filepath.touch()
    else:
        # FIX: previously printed a literal "(unknown)" placeholder;
        # use the actual file name in the skip message.
        print(f'{filename} is already present in {filedir} and has some content. Skipping creation.')