Carbaz committed on
Commit
3aa82ed
·
verified ·
1 Parent(s): 26bead7

Sync from GitHub

Browse files
Files changed (1) hide show
  1. python_c_ext_generator.ipynb +474 -0
python_c_ext_generator.ipynb ADDED
@@ -0,0 +1,474 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "4a6ab9a2-28a2-445d-8512-a0dc8d1b54e9",
6
+ "metadata": {},
7
+ "source": [
8
+ "# Python C extension generator\n",
9
+ "\n",
10
+ "Use an LLM to generate high-performance Python C extension code from Python code.\n",
11
+ "\n",
12
+ "Python C extension modules allow C-coded, compiled modules to be integrated into Python applications.\n",
13
+ "\n",
14
+ "* [Python C Extensions](https://docs.python.org/3.13/extending/index.html)\n",
15
+ "* [Python's C API](https://docs.python.org/3.13/c-api/index.html)"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": null,
21
+ "id": "e610bf56-a46e-4aff-8de1-ab49d62b1ad3",
22
+ "metadata": {},
23
+ "outputs": [],
24
+ "source": [
25
+ "# Imports.\n",
26
+ "\n",
27
+ "import os\n",
28
+ "import sys\n",
29
+ "from time import perf_counter\n",
30
+ "from timeit import timeit\n",
31
+ "\n",
32
+ "from dotenv import load_dotenv\n",
33
+ "from openai import OpenAI\n",
34
+ "from pydantic import BaseModel"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": null,
40
+ "id": "4f672e1c-87e9-4865-b760-370fa605e614",
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": [
44
+ "# Load environment variables from '.env' file.\n",
45
+ "\n",
46
+ "load_dotenv(override=True)"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": null,
52
+ "id": "8aa149ed-9298-4d69-8fe2-8f5de0f667da",
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "# Initialize client and set the default LLM model to use.\n",
57
+ "\n",
58
+ "OPENAI_MODEL = \"gpt-5.1-codex-mini\"\n",
59
+ "\n",
60
+ "openai = OpenAI()"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": null,
66
+ "id": "c6f37bf0",
67
+ "metadata": {},
68
+ "outputs": [],
69
+ "source": [
70
+ "# Define Pydantic model class for GPT response parsing.\n",
71
+ "\n",
72
+ "class Extension_codes(BaseModel):\n",
73
+ " \"\"\"Pydantic model of a response containing the generated C code, the 'setup.py' code and an usage example.\"\"\"\n",
74
+ " c_code: str\n",
75
+ " setup: str\n",
76
+ " usage: str"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "code",
81
+ "execution_count": null,
82
+ "id": "cb6ce77a",
83
+ "metadata": {},
84
+ "outputs": [],
85
+ "source": [
86
+ "# Define a function to print the optimization codes.\n",
87
+ "\n",
88
+ "def print_optimization(optimization):\n",
89
+ " \"\"\"Print the optimization codes.\"\"\"\n",
90
+ " print(f\"C CODE:\\n{optimization.c_code}\")\n",
91
+ " print(\"---------------------------\")\n",
92
+ " print(f\"setup.py:\\n{optimization.setup}\")\n",
93
+ " print(\"---------------------------\")\n",
94
+ " print(f\"USAGE:\\n{optimization.usage}\")"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": null,
100
+ "id": "71e1ba8c-5b05-4726-a9f3-8d8c6257350b",
101
+ "metadata": {},
102
+ "outputs": [],
103
+ "source": [
104
+ "# Define a function to write outputs to a file with a given filename.\n",
105
+ "\n",
106
+ "def write_file(data, filename):\n",
107
+ " \"\"\"Write data to a file with the specified filename.\"\"\"\n",
108
+ " with open(filename, \"w\") as file:\n",
109
+ " file.write(data)"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": null,
115
+ "id": "f13c9c97",
116
+ "metadata": {},
117
+ "outputs": [],
118
+ "source": [
119
+ "# Define a function to write the optimization codes to files.\n",
120
+ "\n",
121
+ "def write_optimization(optimization, module_name):\n",
122
+ " \"\"\"Write the optimization codes to files.\"\"\"\n",
123
+ " write_file(optimization.c_code, f\"{module_name}.c\")\n",
124
+ " write_file(optimization.setup, \"setup.py\")\n",
125
+ " write_file(optimization.usage, \"usage_example.py\")"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": null,
131
+ "id": "6896636f-923e-4a2c-9d6c-fac07828a201",
132
+ "metadata": {},
133
+ "outputs": [],
134
+ "source": [
135
+ "# Define system message for the LLM with instructions for generating the C extension code.\n",
136
+ "\n",
137
+ "system_message = \"\"\"\n",
138
+ "You are an assistant that reimplements Python code in high performance C extensions for Python.\n",
139
+ "Your responses must always be a JSON with the following structure:\n",
140
+ "\n",
141
+ "{\n",
142
+ " \"c_code\": \"Optimized C extension for Python code\",\n",
143
+ " \"setup\": \"The 'setup.py' code to compile the C extension for Python\",\n",
144
+ " \"usage\": \"An example of usage of the C extension for Python code with time measurement and comparing with the original Python code\"\n",
145
+ "}\n",
146
+ "\n",
147
+ "Use comments sparingly and do not provide any explanation other than occasional comments.\n",
148
+ "The C extension for Python needs to produce an identical output in the fastest possible time.\n",
149
+ "Make sure the C extension for Python code is correct and can be compiled with 'python setup.py build' and used in Python.\n",
150
+ "The usage example must include a time measurement and a comparison with the original Python code.\n",
151
+ "Do not include any additional text or explanation outside the JSON structure.\n",
152
+ "Make sure the JSON is correctly formatted.\n",
153
+ "\"\"\""
154
+ ]
155
+ },
156
+ {
157
+ "cell_type": "code",
158
+ "execution_count": null,
159
+ "id": "8e7b3546-57aa-4c29-bc5d-f211970d04eb",
160
+ "metadata": {},
161
+ "outputs": [],
162
+ "source": [
163
+ "# Define user prompt template and function to fill it.\n",
164
+ "\n",
165
+ "def user_prompt_for(python_code, module_name):\n",
166
+ " user_prompt = f\"\"\"\n",
167
+ " Reimplement this Python code as a C extension for Python with the fastest possible implementation that produces identical output in the least time.\n",
168
+ " Respond only with C extension for Python code, do not explain your work other than a few code comments.\n",
169
+ " The module name, used to import, must be \"{module_name}\", the generated C file will be named \"{module_name}.c\".\n",
170
+ " Pay attention to number types to ensure no int overflows.\n",
171
+ " Remember to #include all necessary C packages such as iomanip or <python.h>\n",
172
+ "\n",
173
+ " The target architecture is {sys.platform}, take that in mind while generating the C code, specially\n",
174
+ " when choosing types to use, and use the appropriate compiler flags.\n",
175
+ " Make sure to use the Python C API correctly and manage memory properly to avoid leaks or crashes.\n",
176
+ "\n",
177
+ " Here is the Python code to reimplement:\n",
178
+ "\n",
179
+ " {python_code}\"\"\"\n",
180
+ " return user_prompt"
181
+ ]
182
+ },
183
+ {
184
+ "cell_type": "code",
185
+ "execution_count": null,
186
+ "id": "c6190659-f54c-4951-bef4-4960f8e51cc4",
187
+ "metadata": {},
188
+ "outputs": [],
189
+ "source": [
190
+ "# Define function to create the messages for the LLM.\n",
191
+ "\n",
192
+ "def messages_for(python_code, module_name):\n",
193
+ " \"\"\"Create the messages for the LLM given the Python code and the desired module name.\"\"\"\n",
194
+ " return [\n",
195
+ " {\"role\": \"system\", \"content\": system_message},\n",
196
+ " {\"role\": \"user\", \"content\": user_prompt_for(python_code, module_name)}]"
197
+ ]
198
+ },
199
+ {
200
+ "cell_type": "code",
201
+ "execution_count": null,
202
+ "id": "3c57bc55",
203
+ "metadata": {},
204
+ "outputs": [],
205
+ "source": [
206
+ "# Test the messages function and print the messages.\n",
207
+ "\n",
208
+ "for message in messages_for(\"print('Hello World')\", \"say_hello\"):\n",
209
+ " print(f\"{message['role'].upper()}: {message['content']}\")\n",
210
+ " print(\"--------------------------------\")"
211
+ ]
212
+ },
213
+ {
214
+ "cell_type": "code",
215
+ "execution_count": null,
216
+ "id": "e7d2fea8-74c6-4421-8f1e-0e76d5b201b9",
217
+ "metadata": {},
218
+ "outputs": [],
219
+ "source": [
220
+ "# Define optimization function using OpenAI's GPT model.\n",
221
+ "\n",
222
+ "def optimize_gpt(python_code, module_name, model=OPENAI_MODEL):\n",
223
+ " \"\"\"Optimize the given Python code by generating a C extension for Python with the specified module name using the specified LLM model.\"\"\"\n",
224
+ " response = openai.responses.parse(\n",
225
+ " model=model,\n",
226
+ " input=messages_for(python_code, module_name),\n",
227
+ " text_format=Extension_codes).output_parsed\n",
228
+ " return response"
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "markdown",
233
+ "id": "c05b263a",
234
+ "metadata": {},
235
+ "source": [
236
+ "# Try it with a math function that calculates ***π*** using the Leibniz formula.\n",
237
+ "\n",
238
+ "This formula approximates *π* iteratively using an alternating series:\n",
239
+ "the more iterations, the higher the precision, at the cost of more computation.\n",
240
+ "* [Leibniz formula for π](https://en.wikipedia.org/wiki/Leibniz_formula_for_%CF%80)\n",
241
+ "\n",
242
+ "This is a good candidate to get a noticeable improvement by coding and compiling it into a Python C extension. \n",
243
+ "\n",
244
+ "> NOTE:\n",
245
+ ">\n",
246
+ "> We are creating an importable module, not an executable program, so the code to be optimized must contain only declarations such as `def` or `class`."
247
+ ]
248
+ },
249
+ {
250
+ "cell_type": "code",
251
+ "execution_count": null,
252
+ "id": "a1cbb778-fa57-43de-b04b-ed523f396c38",
253
+ "metadata": {},
254
+ "outputs": [],
255
+ "source": [
256
+ "# Define the Python function to be converted to a C extension and its module name.\n",
257
+ "\n",
258
+ "module_name = \"calculate_pi\"\n",
259
+ "\n",
260
+ "calculate_pi_code = f\"\"\"\n",
261
+ "def leibniz_pi(iterations):\n",
262
+ " result = 1.0\n",
263
+ " for i in range(1, iterations+1):\n",
264
+ " j = i * 4 - 1\n",
265
+ " result -= (1/j)\n",
266
+ " j = i * 4 + 1\n",
267
+ " result += (1/j)\n",
268
+ " return result * 4\n",
269
+ "\"\"\"\n",
270
+ "\n",
271
+ "# Define a function to test the performance of the calculus function.\n",
272
+ "\n",
273
+ "def test_pi_calculation(calculus_function ,iterations=100_000_000):\n",
274
+ " \"\"\"Test the performance of the given calculus function.\"\"\"\n",
275
+ " start_time = perf_counter()\n",
276
+ " result = calculus_function(iterations)\n",
277
+ " end_time = perf_counter()\n",
278
+ " print(f\"Result: {result:.12f}\")\n",
279
+ " print(f\"Execution Time: {(end_time - start_time):.6f} seconds\")\n",
280
+ "\n",
281
+ "# Execute function declaration.\n",
282
+ "exec(calculate_pi_code)"
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "execution_count": null,
288
+ "id": "7fe1cd4b-d2c5-4303-afed-2115a3fef200",
289
+ "metadata": {},
290
+ "outputs": [],
291
+ "source": [
292
+ "# Run original python code and time it.\n",
293
+ "\n",
294
+ "test_pi_calculation(leibniz_pi, 100_000_000)"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "execution_count": null,
300
+ "id": "4c0be0f2",
301
+ "metadata": {},
302
+ "outputs": [],
303
+ "source": [
304
+ "# Time the original Python code, averaging over several runs.\n",
305
+ "# (Increase 'iterations' for better timing)\n",
306
+ "\n",
307
+ "print(\"Timing...\")\n",
308
+ "iterations = 5\n",
309
+ "average = timeit(lambda: leibniz_pi(100_000_000), number=iterations) / iterations\n",
310
+ "print(f\"Python average execution time: {average:.6f} seconds\")"
311
+ ]
312
+ },
313
+ {
314
+ "cell_type": "code",
315
+ "execution_count": null,
316
+ "id": "105db6f9-343c-491d-8e44-3a5328b81719",
317
+ "metadata": {},
318
+ "outputs": [],
319
+ "source": [
320
+ "# Request code optimization using GPT.\n",
321
+ "\n",
322
+ "optimization = optimize_gpt(calculate_pi_code, module_name)"
323
+ ]
324
+ },
325
+ {
326
+ "cell_type": "code",
327
+ "execution_count": null,
328
+ "id": "378981c7",
329
+ "metadata": {},
330
+ "outputs": [],
331
+ "source": [
332
+ "# Print generated extension code.\n",
333
+ "\n",
334
+ "print_optimization(optimization)"
335
+ ]
336
+ },
337
+ {
338
+ "cell_type": "code",
339
+ "execution_count": null,
340
+ "id": "ae9a4a64",
341
+ "metadata": {},
342
+ "outputs": [],
343
+ "source": [
344
+ "# Write the generated code to files.\n",
345
+ "# (Will overwrite existing files)\n",
346
+ "\n",
347
+ "write_optimization(optimization, module_name)"
348
+ ]
349
+ },
350
+ {
351
+ "cell_type": "markdown",
352
+ "id": "bf8f8018-f64d-425c-a0e1-d7862aa9592d",
353
+ "metadata": {},
354
+ "source": [
355
+ "# Compiling C Extension and executing\n",
356
+ "\n",
357
+ "The Python setup command may fail inside JupyterLab; if that's the case, try it directly on the command line.\n",
358
+ "\n",
359
+ "There are two cells marked WINDOWS ONLY; they handle the fact that Windows comes with two command lines,\n",
360
+ "the old CMD (MS-DOS style) and the new POWERSHELL (Unix style).\n",
361
+ "\n",
362
+ "It is controlled by the COMSPEC environment variable.\\\n",
363
+ "*(Setting this variable is harmless on UNIX systems, which simply ignore it, but the restore cell fails there when COMSPEC was not previously set, so skip the WINDOWS ONLY cells on UNIX)*\n",
364
+ "\n",
365
+ "Most of the command lines used here are Unix style, but the build command requires CMD, so\n",
366
+ "we switch to CMD before compiling and restore the preset shell afterwards."
367
+ ]
368
+ },
369
+ {
370
+ "cell_type": "code",
371
+ "execution_count": null,
372
+ "id": "22a9130e",
373
+ "metadata": {},
374
+ "outputs": [],
375
+ "source": [
376
+ "# Clean previous builds.\n",
377
+ "# (Only needed before running the compile cell a second time)\n",
378
+ "# (May report an error if no previous build exists)\n",
379
+ "\n",
380
+ "!rm -r build/"
381
+ ]
382
+ },
383
+ {
384
+ "cell_type": "code",
385
+ "execution_count": null,
386
+ "id": "816e7c9d",
387
+ "metadata": {},
388
+ "outputs": [],
389
+ "source": [
390
+ "# [WINDOWS ONLY]\n",
391
+ "# Set COMSPEC to cmd.exe to avoid issues with some C compilers on Windows.\n",
392
+ "# (Remember to restore original COMSPEC after compilation and testing)\n",
393
+ "preset_comspec = os.environ.get(\"COMSPEC\")\n",
394
+ "os.environ[\"COMSPEC\"] = \"C:\\\\Windows\\\\System32\\\\cmd.exe\""
395
+ ]
396
+ },
397
+ {
398
+ "cell_type": "code",
399
+ "execution_count": null,
400
+ "id": "4194e40c-04ab-4940-9d64-b4ad37c5bb40",
401
+ "metadata": {},
402
+ "outputs": [],
403
+ "source": [
404
+ "# Compile the C extension.\n",
405
+ "# (Will fail if no C compiler is installed)\n",
406
+ "# (In case of errors, try directly on the command line)\n",
407
+ "\n",
408
+ "!python setup.py build_ext --inplace"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "code",
413
+ "execution_count": null,
414
+ "id": "8db12c4d",
415
+ "metadata": {},
416
+ "outputs": [],
417
+ "source": [
418
+ "# [WINDOWS ONLY]\n",
419
+ "# Restore original COMSPEC.\n",
420
+ "\n",
421
+ "os.environ[\"COMSPEC\"] = preset_comspec"
422
+ ]
423
+ },
424
+ {
425
+ "cell_type": "code",
426
+ "execution_count": null,
427
+ "id": "a8f5169f",
428
+ "metadata": {},
429
+ "outputs": [],
430
+ "source": [
431
+ "# Run the usage example to test the compiled C extension.\n",
432
+ "exec(optimization.usage)"
433
+ ]
434
+ },
435
+ {
436
+ "cell_type": "code",
437
+ "execution_count": null,
438
+ "id": "a1972472",
439
+ "metadata": {},
440
+ "outputs": [],
441
+ "source": [
442
+ "# Import newly created C extension and compare performance with original Python code.\n",
443
+ "\n",
444
+ "from calculate_pi import leibniz_pi as c_leibniz_pi\n",
445
+ "\n",
446
+ "print(\"Testing original Python code:\")\n",
447
+ "test_pi_calculation(leibniz_pi, 100_000_000)\n",
448
+ "print(\"Testing C extension code:\")\n",
449
+ "test_pi_calculation(c_leibniz_pi, 100_000_000)\n"
450
+ ]
451
+ }
452
+ ],
453
+ "metadata": {
454
+ "kernelspec": {
455
+ "display_name": "ai-c-extension-generator-J3XBQkYw",
456
+ "language": "python",
457
+ "name": "python3"
458
+ },
459
+ "language_info": {
460
+ "codemirror_mode": {
461
+ "name": "ipython",
462
+ "version": 3
463
+ },
464
+ "file_extension": ".py",
465
+ "mimetype": "text/x-python",
466
+ "name": "python",
467
+ "nbconvert_exporter": "python",
468
+ "pygments_lexer": "ipython3",
469
+ "version": "3.13.13"
470
+ }
471
+ },
472
+ "nbformat": 4,
473
+ "nbformat_minor": 5
474
+ }