nvhuynh16 committed on
Commit
6fc6360
·
verified ·
1 Parent(s): 4a3ba16

Upload 2 files

Browse files
Files changed (2) hide show
  1. app_local.py +230 -0
  2. requirements_local.txt +7 -0
app_local.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Gradio demo for Gemma Code Generator.

This loads the model directly in the Space (not using Inference API).
"""

import gradio as gr
import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Model configuration
BASE_MODEL = "google/gemma-2-2b-it"  # gated base checkpoint (needs HF token)
ADAPTER_MODEL = "nvhuynh16/gemma-2b-code-alpaca"  # LoRA adapters loaded on top

# Get HuggingFace token from environment (set in Space secrets).
# Needed because the base model repository is gated.
HF_TOKEN = os.environ.get("HF_TOKEN", None)

# Global variables for lazy loading: populated by load_model() on the first
# request so the Space starts quickly instead of loading the model at import.
tokenizer = None
model = None
23
def load_model():
    """Load the tokenizer and model on first use and cache them globally.

    Returns:
        tuple: ``(tokenizer, model)`` — the shared tokenizer and the
        4-bit-quantized base model with the LoRA adapters attached.
    """
    global tokenizer, model

    if model is None:
        print("Loading model for the first time...")

        # Function-scope import: only needed on the first load.
        from transformers import BitsAndBytesConfig

        # Load tokenizer
        tokenizer = AutoTokenizer.from_pretrained(
            BASE_MODEL,
            token=HF_TOKEN,  # Use token for gated model
        )

        # Load base model with 4-bit quantization (fits in free Space).
        # NOTE: passing `load_in_4bit=True` directly to from_pretrained is
        # deprecated in the transformers versions this app pins (>=4.40);
        # an explicit BitsAndBytesConfig is the supported form.
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,  # match the fp16 weights
        )
        model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            device_map="auto",
            torch_dtype=torch.float16,
            quantization_config=quant_config,
            token=HF_TOKEN,  # Use token for gated model
        )

        # Load LoRA adapters on top of the quantized base model.
        model = PeftModel.from_pretrained(
            model,
            ADAPTER_MODEL,
            token=HF_TOKEN,  # Use token for adapter model too
        )

        print("Model loaded successfully!")

    return tokenizer, model
55
+
56
+
57
def generate_code(instruction: str, max_tokens: int = 256, temperature: float = 0.7):
    """Generate code for *instruction* with the fine-tuned model.

    Args:
        instruction: Natural-language description of the code to write.
        max_tokens: Upper bound on newly generated tokens.
        temperature: Sampling temperature; higher values are more varied.

    Returns:
        The generated code, or a user-facing error message string.
    """
    if not instruction.strip():
        return "Please enter an instruction."

    try:
        # First call loads the model; later calls reuse the cached globals.
        tk, lm = load_model()

        # Alpaca-style prompt with an intentionally empty Input section,
        # matching the format used during fine-tuning.
        prompt = (
            "### Instruction:\n"
            f"{instruction}\n"
            "\n"
            "### Input:\n"
            "\n"
            "\n"
            "### Response:\n"
        )

        encoded = tk(prompt, return_tensors="pt").to(lm.device)

        # Sampling-based decoding; gradients are never needed at inference.
        with torch.no_grad():
            generated_ids = lm.generate(
                **encoded,
                max_new_tokens=max_tokens,
                temperature=temperature,
                top_p=0.9,
                do_sample=True,
                pad_token_id=tk.eos_token_id,
            )

        # Keep only the text after the final response marker.
        full_text = tk.decode(generated_ids[0], skip_special_tokens=True)
        return full_text.split("### Response:")[-1].strip()

    except Exception as exc:  # surface failures in the UI instead of crashing
        detail = str(exc)
        if "CUDA out of memory" in detail or "OutOfMemoryError" in detail:
            return "⚠️ Out of memory. Try reducing max tokens or wait a moment."
        return f"Error: {detail}\n\nPlease try again."
103
+
104
+
105
# Custom CSS for better appearance
custom_css = """
.container {
    max-width: 900px;
    margin: auto;
}
.output-code {
    font-family: 'Courier New', monospace;
    font-size: 14px;
}
"""

# Build the Gradio UI: instruction + settings on the left, code on the right.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as demo:

    # Header / intro text.
    gr.Markdown(
        """
        # 🤖 Gemma Code Generator

        Fine-tuned Gemma-2B model for Python code generation using QLoRA.

        **Performance**: Expected 75-85% syntax correctness (vs 61% baseline) | BLEU Score: 25-35 (vs 16.10 baseline)

        **Note**: First request takes ~30 seconds to load the model. Subsequent requests are fast!
        """
    )

    with gr.Row():
        # Left column: instruction box plus generation knobs.
        with gr.Column(scale=1):
            prompt_box = gr.Textbox(
                label="Code Instruction",
                placeholder="Describe the function you want to create...",
                lines=3,
            )

            with gr.Accordion("Advanced Settings", open=False):
                tokens_slider = gr.Slider(
                    minimum=64,
                    maximum=512,
                    value=256,
                    step=64,
                    label="Max Tokens",
                    info="Maximum length of generated code",
                )

                temp_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                    info="Higher = more creative, Lower = more deterministic",
                )

            run_button = gr.Button("Generate Code", variant="primary", size="lg")

        # Right column: syntax-highlighted generated code.
        with gr.Column(scale=1):
            code_output = gr.Code(
                label="Generated Code",
                language="python",
                elem_classes="output-code",
            )

    # Clickable example prompts that fill the instruction box.
    gr.Examples(
        examples=[
            ["Write a function to check if a number is prime"],
            ["Create a function to reverse a string"],
            ["Write a function to find the factorial of a number"],
            ["Implement binary search on a sorted list"],
            ["Create a function to merge two sorted lists"],
            ["Write a function to calculate Fibonacci numbers"],
            ["Implement a function to find the longest common subsequence"],
            ["Create a function to validate an email address using regex"],
            ["Write a function to convert a decimal number to binary"],
            ["Implement a simple LRU cache using OrderedDict"],
        ],
        inputs=[prompt_box],
        label="Example Prompts (Click to use)",
    )

    # Wire the button to the generation function.
    run_button.click(
        fn=generate_code,
        inputs=[prompt_box, tokens_slider, temp_slider],
        outputs=[code_output],
    )

    # Model information footer.
    gr.Markdown(
        """
        ---

        ### 📊 Model Performance

        | Metric | Baseline (Pretrained) | Fine-Tuned (Expected) | Improvement |
        |--------|----------------------|----------------------|-------------|
        | **Syntax Correctness** | 61.0% | 75-85% | +14-24% |
        | **BLEU Score** | 16.10 | 25-35 | +9-19 |
        | **Trainable Parameters** | 2.5B | 3.2M (0.12%) | 100x fewer |

        ### 🛠️ Technical Details

        - **Base Model**: google/gemma-2-2b-it (2.5B parameters)
        - **Fine-tuning**: QLoRA (4-bit quantization + LoRA rank 16)
        - **Dataset**: CodeAlpaca-20k (3,600 training examples)
        - **Training**: 4-6 hours on free Google Colab T4 GPU
        - **Cost**: $0 (free Colab + free HF Spaces hosting)

        ### 🔗 Links

        [Model on HuggingFace](https://huggingface.co/nvhuynh16/gemma-2b-code-alpaca) •
        [GitHub Repository](https://github.com/YOUR-USERNAME/YOUR-REPO) •
        [Portfolio](https://YOUR-PORTFOLIO-SITE.com) •
        [Base Model](https://huggingface.co/google/gemma-2-2b-it)

        ---

        **Built for portfolio demonstration** • Targeting AI/ML Applied Scientist roles

        *This demo loads the model directly in HuggingFace Spaces with 4-bit quantization*
        """
    )

if __name__ == "__main__":
    demo.launch()
requirements_local.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio==5.6.0
2
+ transformers>=4.40.0
3
+ torch>=2.0.0
4
+ peft>=0.10.0
5
+ accelerate>=0.20.0
6
+ bitsandbytes>=0.41.0
7
+ scipy