nvhuynh16 committed on
Commit
6fc6360
·
verified ·
1 Parent(s): 4a3ba16

Upload 2 files

Browse files
Files changed (2) hide show
  1. app_local.py +230 -0
  2. requirements_local.txt +7 -0
app_local.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Gradio demo for Gemma Code Generator.

This loads the model directly in the Space (not using Inference API).
"""

import gradio as gr
import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Model configuration
BASE_MODEL = "google/gemma-2-2b-it"  # gated base checkpoint (needs HF token)
ADAPTER_MODEL = "nvhuynh16/gemma-2b-code-alpaca"  # LoRA adapters loaded on top

# Get HuggingFace token from environment (set in Space secrets).
# Needed because the base model repository is gated.
HF_TOKEN = os.environ.get("HF_TOKEN", None)

# Global variables for lazy loading: populated by load_model() on the first
# request so the Space starts quickly instead of loading the model at import.
tokenizer = None
model = None
23
def load_model():
    """Load the tokenizer and model on first use and cache them globally.

    Returns:
        tuple: ``(tokenizer, model)`` — the shared tokenizer and the
        4-bit-quantized base model with the LoRA adapters attached.
    """
    global tokenizer, model

    if model is None:
        print("Loading model for the first time...")

        # Function-scope import: only needed on the first load.
        from transformers import BitsAndBytesConfig

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            BASE_MODEL,
            token=HF_TOKEN,  # Use token for gated model
        )

        # Load base model with 4-bit quantization (fits in free Space).
        # NOTE: passing `load_in_4bit=True` directly to from_pretrained is
        # deprecated in the transformers versions this app pins (>=4.40);
        # an explicit BitsAndBytesConfig is the supported form.
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,  # match the fp16 weights
        )
        model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            device_map="auto",
            torch_dtype=torch.float16,
            quantization_config=quant_config,
            token=HF_TOKEN,  # Use token for gated model
        )

        # Load LoRA adapters on top of the quantized base model.
        model = PeftModel.from_pretrained(
            model,
            ADAPTER_MODEL,
            token=HF_TOKEN,  # Use token for adapter model too
        )

        print("Model loaded successfully!")

    return tokenizer, model
55
+
56
+
57
def generate_code(instruction: str, max_tokens: int = 256, temperature: float = 0.7):
    """Generate code for *instruction* with the fine-tuned model.

    Args:
        instruction: Natural-language description of the code to write.
        max_tokens: Upper bound on newly generated tokens.
        temperature: Sampling temperature; higher values are more varied.

    Returns:
        The generated code, or a user-facing error message string.
    """
    if not instruction.strip():
        return "Please enter an instruction."

    try:
        # First call loads the model; later calls reuse the cached globals.
        tk, lm = load_model()

        # Alpaca-style prompt with an intentionally empty Input section,
        # matching the format used during fine-tuning.
        prompt = (
            "### Instruction:\n"
            f"{instruction}\n"
            "\n"
            "### Input:\n"
            "\n"
            "\n"
            "### Response:\n"
        )

        encoded = tk(prompt, return_tensors="pt").to(lm.device)

        # Sampling-based decoding; gradients are never needed at inference.
        with torch.no_grad():
            generated_ids = lm.generate(
                **encoded,
                max_new_tokens=max_tokens,
                temperature=temperature,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tk.eos_token_id,
            )

        # Keep only the text after the final response marker.
        full_text = tk.decode(generated_ids[0], skip_special_tokens=True)
        return full_text.split("### Response:")[-1].strip()

    except Exception as exc:  # surface failures in the UI instead of crashing
        detail = str(exc)
        if "CUDA out of memory" in detail or "OutOfMemoryError" in detail:
            return "⚠️ Out of memory. Try reducing max tokens or wait a moment."
        return f"Error: {detail}\n\nPlease try again."
103
+
104
+
105
# Custom CSS for better appearance
custom_css = """
.container {
    max-width: 900px;
    margin: auto;
}
.output-code {
    font-family: 'Courier New', monospace;
    font-size: 14px;
}
"""

# Build the Gradio UI: instruction + settings on the left, code on the right.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:

    # Header / intro text.
    gr.Markdown(
        """
        # 🤖 Gemma Code Generator

        Fine-tuned Gemma-2B model for Python code generation using QLoRA.

        **Performance**: Expected 75-85% syntax correctness (vs 61% baseline) | BLEU Score: 25-35 (vs 16.10 baseline)

        **Note**: First request takes ~30 seconds to load the model. Subsequent requests are fast!
        """
    )

    with gr.Row():
        # Left column: instruction box plus generation knobs.
        with gr.Column(scale=1):
            prompt_box = gr.Textbox(
                label="Code Instruction",
                placeholder="Describe the function you want to create...",
                lines=3,
            )

            with gr.Accordion("Advanced Settings", open=False):
                tokens_slider = gr.Slider(
                    minimum=64,
                    maximum=512,
                    value=256,
                    step=64,
                    label="Max Tokens",
                    info="Maximum length of generated code",
                )

                temp_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                    info="Higher = more creative, Lower = more deterministic",
                )

            run_button = gr.Button("Generate Code", variant="primary", size="lg")

        # Right column: syntax-highlighted generated code.
        with gr.Column(scale=1):
            code_output = gr.Code(
                label="Generated Code",
                language="python",
                elem_classes="output-code",
            )

    # Clickable example prompts that fill the instruction box.
    gr.Examples(
        examples=[
            ["Write a function to check if a number is prime"],
            ["Create a function to reverse a string"],
            ["Write a function to find the factorial of a number"],
            ["Implement binary search on a sorted list"],
            ["Create a function to merge two sorted lists"],
            ["Write a function to calculate Fibonacci numbers"],
            ["Implement a function to find the longest common subsequence"],
            ["Create a function to validate an email address using regex"],
            ["Write a function to convert a decimal number to binary"],
            ["Implement a simple LRU cache using OrderedDict"],
        ],
        inputs=[prompt_box],
        label="Example Prompts (Click to use)",
    )

    # Wire the button to the generation function.
    run_button.click(
        fn=generate_code,
        inputs=[prompt_box, tokens_slider, temp_slider],
        outputs=[code_output],
    )

    # Model information footer.
    gr.Markdown(
        """
        ---

        ### 📊 Model Performance

        | Metric | Baseline (Pretrained) | Fine-Tuned (Expected) | Improvement |
        |--------|----------------------|----------------------|-------------|
        | **Syntax Correctness** | 61.0% | 75-85% | +14-24% |
        | **BLEU Score** | 16.10 | 25-35 | +9-19 |
        | **Trainable Parameters** | 2.5B | 3.2M (0.12%) | 100x fewer |

        ### 🛠️ Technical Details

        - **Base Model**: google/gemma-2-2b-it (2.5B parameters)
        - **Fine-tuning**: QLoRA (4-bit quantization + LoRA rank 16)
        - **Dataset**: CodeAlpaca-20k (3,600 training examples)
        - **Training**: 4-6 hours on free Google Colab T4 GPU
        - **Cost**: $0 (free Colab + free HF Spaces hosting)

        ### 🔗 Links

        [Model on HuggingFace](https://huggingface.co/nvhuynh16/gemma-2b-code-alpaca) •
        [GitHub Repository](https://github.com/YOUR-USERNAME/YOUR-REPO) •
        [Portfolio](https://YOUR-PORTFOLIO-SITE.com) •
        [Base Model](https://huggingface.co/google/gemma-2-2b-it)

        ---

        **Built for portfolio demonstration** • Targeting AI/ML Applied Scientist roles

        *This demo loads the model directly in HuggingFace Spaces with 4-bit quantization*
        """
    )

if __name__ == "__main__":
    demo.launch()
requirements_local.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio==5.6.0
2
+ transformers>=4.40.0
3
+ torch>=2.0.0
4
+ peft>=0.10.0
5
+ accelerate>=0.20.0
6
+ bitsandbytes>=0.41.0
7
+ scipy