File size: 9,788 Bytes
262de9f
 
4521d33
fffa819
4521d33
262de9f
4521d33
 
262de9f
fffa819
 
 
 
4521d33
fffa819
 
 
 
 
 
 
 
 
 
4521d33
fffa819
 
 
 
 
4521d33
 
 
fffa819
4521d33
 
 
 
fffa819
4521d33
 
fffa819
 
 
 
 
 
 
4521d33
 
 
fffa819
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4521d33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262de9f
4521d33
 
262de9f
4521d33
 
262de9f
4521d33
 
 
262de9f
4521d33
 
262de9f
4521d33
262de9f
4521d33
 
 
262de9f
4521d33
 
 
262de9f
 
4521d33
 
262de9f
4521d33
 
 
262de9f
4521d33
262de9f
4521d33
262de9f
4521d33
 
262de9f
4521d33
 
 
 
 
 
262de9f
4521d33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262de9f
4521d33
 
 
262de9f
4521d33
 
 
 
 
 
262de9f
 
4521d33
 
 
 
 
 
262de9f
4521d33
 
 
 
 
 
262de9f
4521d33
262de9f
4521d33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262de9f
4521d33
262de9f
4521d33
 
 
 
 
 
 
 
262de9f
 
4521d33
 
 
 
 
 
 
262de9f
 
4521d33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
262de9f
 
4521d33
 
 
 
 
262de9f
 
4521d33
262de9f
4521d33
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
import gradio as gr
import torch
import pickle
import io
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load model and tokenizer from pickle files.
#
# SECURITY NOTE: unpickling (pickle.load / torch.load with weights_only=False)
# executes arbitrary code embedded in the file. Only load .pkl artifacts that
# were produced by a trusted training run and shipped with this app.
print("Loading model and tokenizer...")

# Determine device
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")


class CPU_Unpickler(pickle.Unpickler):
    """Unpickler that remaps pickled torch storages onto ``device``.

    Tensors pickled on a GPU machine carry their original device; plain
    ``pickle.load`` then fails on a machine without CUDA. Intercepting
    ``torch.storage._load_from_bytes`` lets each storage be re-loaded with
    an explicit ``map_location``.
    """

    def find_class(self, module, name):
        """Return the storage-loading hook for torch storages, else defer to pickle."""
        if module == 'torch.storage' and name == '_load_from_bytes':
            # weights_only=False mirrors what torch's own _load_from_bytes
            # does; relying on the default breaks on PyTorch >= 2.6, where
            # the default flipped to weights_only=True.
            return lambda b: torch.load(
                io.BytesIO(b), map_location=device, weights_only=False
            )
        return super().find_class(module, name)


try:
    # Load LoRA model with device mapping
    print("Loading LoRA model...")
    with open('gpt2_pseudo2code_lora_model.pkl', 'rb') as f:
        if device == "cpu":
            # Use custom unpickler for CPU so CUDA-saved storages are remapped
            model = CPU_Unpickler(f).load()
        else:
            model = pickle.load(f)
    print("✓ Model loaded successfully")

    # Load tokenizer
    print("Loading tokenizer...")
    with open('gpt2_pseudo2code_tokenizer.pkl', 'rb') as f:
        tokenizer = pickle.load(f)
    print("✓ Tokenizer loaded successfully")

    # Ensure model is on correct device and in inference mode
    model = model.to(device)
    model.eval()
    print(f"✓ Model ready on {device}")

    # Print model info
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total_params = sum(p.numel() for p in model.parameters())
    print(f"✓ Total parameters: {total_params:,}")
    print(f"✓ Trainable parameters: {trainable_params:,}")

except Exception as e:
    print(f"Error loading model: {e}")
    print("\nTrying alternative loading method...")
    try:
        # Alternative method: the file may have been written with torch.save
        # rather than pickle.dump, in which case torch.load can read it.
        # The artifact is a whole model object (not a state_dict), so
        # weights_only must be disabled explicitly; since PyTorch 2.6 the
        # default (True) would reject it.
        with open('gpt2_pseudo2code_lora_model.pkl', 'rb') as f:
            model = torch.load(
                f, map_location=torch.device(device), weights_only=False
            )

        with open('gpt2_pseudo2code_tokenizer.pkl', 'rb') as f:
            tokenizer = pickle.load(f)

        model = model.to(device)
        model.eval()
        print("✓ Model loaded successfully using alternative method")
    except Exception as e2:
        print(f"Alternative loading also failed: {e2}")
        # Nothing else to try: abort startup, the app cannot run without a model.
        raise

def generate_code(pseudocode, indent, line, max_length=128, temperature=0.7, top_p=0.9):
    """
    Generate code from pseudo-code with line and indent information.

    Uses the module-level ``model`` and ``tokenizer`` loaded at import time.

    Args:
        pseudocode: Input pseudo-code string
        indent: Indentation level, embedded verbatim in the prompt
        line: Line number, embedded verbatim in the prompt
        max_length: Maximum total sequence length passed to ``model.generate``
            (for Hugging Face ``generate`` this counts the prompt tokens too)
        temperature: Sampling temperature
        top_p: Nucleus sampling parameter

    Returns:
        The text generated after the prompt, stripped of surrounding
        whitespace. This function never raises: on any failure it returns a
        string starting with ``"Error generating code:"``.
    """
    try:
        # Format input with line and indent information
        prompt = f"Pseudocode: {pseudocode} | Indent: {indent} | Line: {line}\nCode:"

        # Tokenize input. A single prompt never needs padding, and requesting
        # it raises when the tokenizer has no pad token configured.
        inputs = tokenizer(prompt, return_tensors='pt')

        # Move to same device as model
        model_device = next(model.parameters()).device
        inputs = {k: v.to(model_device) for k, v in inputs.items()}
        prompt_length = inputs["input_ids"].shape[1]

        # Generate
        model.eval()
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                temperature=temperature,
                top_p=top_p,
                do_sample=True,
                pad_token_id=tokenizer.eos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                num_return_sequences=1
            )

        # A decoder-only model returns prompt + continuation. Slice the prompt
        # off by token count instead of searching the decoded text for
        # "Code:": a text search picks the wrong segment when the pseudocode
        # itself contains "Code:" and truncates output that contains it.
        completion_ids = outputs[0][prompt_length:]
        return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

    except Exception as e:
        return f"Error generating code: {str(e)}"

def gradio_generate_code(pseudocode, indent, line, temperature=0.7, top_p=0.9, max_length=128):
    """
    Wrapper function for Gradio interface.

    Validates the raw widget values, converts them to the types
    ``generate_code`` expects, and turns every failure into a user-facing
    message instead of raising.

    Args:
        pseudocode: Text from the pseudocode box (may be empty or None)
        indent: Indentation level as delivered by the UI (number or None)
        line: Line number as delivered by the UI (number or None)
        temperature: Sampling temperature
        top_p: Nucleus sampling parameter
        max_length: Maximum sequence length forwarded to ``generate_code``

    Returns:
        The generated code, or a message prefixed with a warning/error emoji.
    """
    # Treat a missing value the same as blank text rather than crashing.
    if not pseudocode or not pseudocode.strip():
        return "⚠️ Please enter some pseudocode!"

    # Validate only indent/line here so the message below is accurate.
    # A cleared number field arrives as None (TypeError) and an infinite
    # value raises OverflowError; both are invalid numbers to the user.
    try:
        indent = int(indent)
        line = int(line)
    except (TypeError, ValueError, OverflowError):
        return "⚠️ Indent and Line must be valid numbers!"

    try:
        return generate_code(
            pseudocode,
            indent,
            line,
            max_length=int(max_length),
            temperature=float(temperature),
            top_p=float(top_p),
        )
    except Exception as e:
        return f"❌ Error: {str(e)}"

# Example rows for the UI. Each row is
# [pseudocode, indent, line, temperature, top_p, max_length]; only the first
# three values vary, so the shared sampling settings are appended per row.
examples = [
    [text, indent_level, line_no, 0.7, 0.9, 128]
    for text, indent_level, line_no in (
        ("create integer n", 1, 1),
        ("read n", 1, 2),
        ("for i from 0 to n", 1, 3),
        ("print i", 2, 4),
        ("if n is equal to 0", 1, 5),
        ("create string s", 1, 1),
        ("read s", 1, 2),
    )
]

# Create Gradio interface
# The page is: a header, a two-column row (inputs on the left, output on the
# right), an examples table, and a help section. `demo` is the object launched
# at the bottom of the file.
with gr.Blocks(theme=gr.themes.Soft(), title="Pseudo-Code to Code Generator") as demo:
    # Static header describing the model and its training data.
    gr.Markdown(
        """
        # 🐍 Pseudo-Code to Code Generator (GPT-2 + LoRA)
        
        Convert natural language pseudo-code to executable code using a fine-tuned GPT-2 model with LoRA.
        
        **Model Details:**
        - Base Model: GPT-2
        - Training: SPOC Dataset (C++ code examples)
        - Optimization: LoRA (Low-Rank Adaptation) + 16-bit precision
        - Trained on: 20,000 pseudo-code to code pairs
        
        **Note:** The model was trained on C++ code examples from the SPOC dataset, so it generates C++-style code.
        """
    )
    
    with gr.Row():
        # Left column: pseudocode text, prompt metadata and sampling controls.
        with gr.Column(scale=1):
            gr.Markdown("### 📝 Input")
            
            pseudocode_input = gr.Textbox(
                label="Pseudocode",
                placeholder="Enter your pseudocode here...\nExample: create integer n",
                lines=5,
                max_lines=10
            )
            
            # Indent and line number are embedded into the prompt by
            # generate_code; precision=0 asks Gradio for integer values.
            with gr.Row():
                indent_input = gr.Number(
                    label="Indent Level",
                    value=1,
                    precision=0,
                    info="Indentation level (0=no indent, 1=first level, etc.)"
                )
                
                line_input = gr.Number(
                    label="Line Number",
                    value=1,
                    precision=0,
                    info="Line number in the program"
                )
            
            gr.Markdown("### ⚙️ Generation Parameters")
            
            # Slider defaults (0.7 / 0.9 / 128) match the keyword defaults of
            # gradio_generate_code.
            with gr.Row():
                temperature_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=0.7,
                    step=0.1,
                    label="Temperature",
                    info="Higher = more creative/random"
                )
                
                top_p_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                    label="Top-p (Nucleus Sampling)",
                    info="Probability threshold for sampling"
                )
            
            # NOTE(review): this value is forwarded as `max_length` to
            # model.generate, which for Hugging Face models appears to count
            # prompt tokens as well as generated ones — confirm that the info
            # text below describes it accurately.
            max_length_slider = gr.Slider(
                minimum=64,
                maximum=256,
                value=128,
                step=16,
                label="Max Length",
                info="Maximum tokens to generate"
            )
            
            generate_btn = gr.Button("🚀 Generate Code", variant="primary", size="lg")
        
        # Right column: read-only view of the model output.
        with gr.Column(scale=1):
            gr.Markdown("### 💻 Generated Code")
            
            output = gr.Textbox(
                label="Generated Code",
                lines=15,
                max_lines=20,
                show_copy_button=True
            )
    
    # Each row of `examples` supplies values for `inputs` in the same order.
    # cache_examples=False: example outputs are not precomputed at startup.
    gr.Markdown("### 📚 Examples")
    gr.Examples(
        examples=examples,
        inputs=[pseudocode_input, indent_input, line_input, temperature_slider, top_p_slider, max_length_slider],
        outputs=output,
        fn=gradio_generate_code,
        cache_examples=False,
    )
    
    # Static usage help, tips and external links.
    gr.Markdown(
        """
        ---
        ### ℹ️ How to Use:
        1. **Enter pseudocode**: Write your natural language description
        2. **Set indent level**: Specify the indentation (0 for no indent, 1 for first level, etc.)
        3. **Set line number**: Indicate the line position in your program
        4. **Adjust parameters** (optional): Fine-tune temperature and top-p for different results
        5. **Click Generate**: Get your code!
        
        ### 💡 Tips:
        - Higher temperature (0.8-1.2) = more creative but potentially less accurate
        - Lower temperature (0.5-0.7) = more conservative and predictable
        - Top-p controls diversity; 0.9 is usually a good balance
        - The model generates C++-style code as it was trained on the SPOC dataset
        
        ### 🔗 Resources:
        - [SPOC Dataset](https://github.com/sumith1896/spoc)
        - [Research Paper](https://arxiv.org/pdf/1906.04908)
        - Model trained with LoRA for efficiency
        """
    )
    
    # Connect button to function
    # The `inputs` order matches the positional parameters of
    # gradio_generate_code: pseudocode, indent, line, temperature, top_p,
    # max_length.
    generate_btn.click(
        fn=gradio_generate_code,
        inputs=[pseudocode_input, indent_input, line_input, temperature_slider, top_p_slider, max_length_slider],
        outputs=output
    )

# Launch the app
# Only start the server when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()