"""Generate a synthetic "prompt -> code" CSV dataset.

Each entry of ALGORITHMS holds a list of base prompts plus one code snippet
per language.  generate_dataset() samples random (prompt, phrasing, language)
combinations and writes rows of ``prompt, language, code`` to a CSV file.

NOTE(review): the copy of this file that was reviewed had its whitespace
collapsed and had lost every ``<letter ... >`` span (markup stripping).
Snippet indentation was restored to 4 spaces and the damaged snippets were
reconstructed; each reconstructed spot carries its own NOTE(review) comment
and should be confirmed against the original data.
"""

import csv
import random

# ============================
# CONFIGURATION
# ============================
OUTPUT_FILE = "rosetta_code_dataset.csv"
SAMPLES_PER_ALGO = 500  # High variation count for better matching

# Languages for which every ALGORITHMS entry provides a snippet.
LANGUAGES = ("python", "cpp", "java")

# Phrasings used to turn a base prompt into a natural-language query.
QUERY_TEMPLATES = (
    "{prompt} in {lang}",
    "write {lang} code for {prompt}",
    "how to {prompt} using {lang}",
    "program for {prompt} in {lang}",
    "give me {prompt} code {lang}",
)

# ============================
# 1. THE ULTIMATE ALGORITHM LIBRARY
# ============================
ALGORITHMS = {
    # ---------------------------
    # BASIC MATH & LOGIC
    # ---------------------------
    "factorial": {
        "prompts": [
            "factorial of a number",
            "calculate n!",
            "multiplication of 1 to n",
            "find factorial",
            "fact code",
        ],
        "python": (
            "def factorial(n):\n"
            "    return 1 if n == 0 else n * factorial(n-1)\n"
            "num = int(input())\n"
            "print(factorial(num))"
        ),
        # NOTE(review): header name after "#include" was missing; <iostream>
        # restored because the snippet uses cin/cout.
        "cpp": (
            "#include <iostream>\n"
            "using namespace std;\n"
            "int factorial(int n) {\n"
            "    return (n == 0) ? 1 : n * factorial(n - 1);\n"
            "}\n"
            "int main() {\n"
            "    int n; cin>>n;\n"
            "    cout << factorial(n);\n"
            "}"
        ),
        "java": (
            "import java.util.Scanner;\n"
            "class Main {\n"
            "    static int factorial(int n) {\n"
            "        return (n == 0) ? 1 : n * factorial(n - 1);\n"
            "    }\n"
            "    public static void main(String[] args) {\n"
            "        Scanner sc = new Scanner(System.in);\n"
            "        System.out.println(factorial(sc.nextInt()));\n"
            "    }\n"
            "}"
        ),
    },
    "fibonacci": {
        "prompts": [
            "fibonacci series",
            "print fib numbers",
            "sequence 0 1 1 2 3",
            "fib series",
            "fibonacci recursion",
        ],
        "python": (
            "n = int(input())\n"
            "a, b = 0, 1\n"
            "for _ in range(n):\n"
            "    print(a, end=' ')\n"
            "    a, b = b, a+b"
        ),
        "cpp": (
            "int n, a=0, b=1, next;\n"
            "cin >> n;\n"
            "for (int i = 0; i < n; i++) {\n"
            "    cout << a << \" \";\n"
            "    next = a + b;\n"
            "    a = b;\n"
            "    b = next;\n"
            "}"
        ),
        "java": (
            "int n = 10, a = 0, b = 1;\n"
            "for (int i = 0; i < n; i++) {\n"
            "    System.out.print(a + \" \");\n"
            "    int next = a + b;\n"
            "    a = b;\n"
            "    b = next;\n"
            "}"
        ),
    },
    "swap_two_numbers": {
        "prompts": [
            "swap two numbers",
            "swap variables without temp",
            "exchange values",
            "swap logic",
        ],
        "python": (
            "a = int(input())\n"
            "b = int(input())\n"
            "a, b = b, a\n"
            "print(a, b)"
        ),
        "cpp": (
            "int a, b;\n"
            "cin >> a >> b;\n"
            "a = a + b;\n"
            "b = a - b;\n"
            "a = a - b;\n"
            "cout << a << \" \" << b;"
        ),
        "java": (
            "int a = 5, b = 10;\n"
            "a = a + b;\n"
            "b = a - b;\n"
            "a = a - b;\n"
            "System.out.println(a + \" \" + b);"
        ),
    },
    "leap_year": {
        "prompts": [
            "check leap year",
            "is year leap",
            "leap year logic",
            "days in february year",
        ],
        "python": (
            "year = int(input())\n"
            "if (year % 4 == 0 and year % 100 != 0) or (year % 400 == 0):\n"
            "    print('Leap Year')\n"
            "else:\n"
            "    print('Not Leap Year')"
        ),
        "cpp": (
            "int year;\n"
            "cin >> year;\n"
            "if ((year % 4 == 0 && year % 100 != 0) || (year % 400 == 0))\n"
            "    cout << \"Leap Year\";\n"
            "else\n"
            "    cout << \"Not Leap Year\";"
        ),
        "java": (
            "int year = 2024;\n"
            "if ((year % 4 == 0 && year % 100 != 0) || (year % 400 == 0))\n"
            "    System.out.println(\"Leap Year\");\n"
            "else\n"
            "    System.out.println(\"Not Leap Year\");"
        ),
    },
    "odd_even": {
        "prompts": [
            "check odd even",
            "is number divisible by 2",
            "find even number",
            "odd number logic",
        ],
        "python": (
            "num = int(input())\n"
            "if num % 2 == 0: print('Even')\n"
            "else: print('Odd')"
        ),
        "cpp": (
            "int n; cin >> n;\n"
            "if(n % 2 == 0) cout << \"Even\";\n"
            "else cout << \"Odd\";"
        ),
        "java": (
            "int n = 5;\n"
            "if(n % 2 == 0) System.out.println(\"Even\");\n"
            "else System.out.println(\"Odd\");"
        ),
    },
    "lcm_hcf": {
        "prompts": [
            "lcm and hcf",
            "gcd of two numbers",
            "least common multiple",
            "highest common factor",
        ],
        "python": (
            "import math\n"
            "a, b = 12, 15\n"
            "gcd = math.gcd(a, b)\n"
            "lcm = (a*b)//gcd\n"
            "print('HCF:', gcd, 'LCM:', lcm)"
        ),
        # NOTE(review): everything after `cout<<"HCF: "<` was missing from
        # the reviewed copy; the rest of this snippet is reconstructed.
        "cpp": (
            "int gcd(int a, int b) { return b==0?a:gcd(b, a%b); }\n"
            "int main() {\n"
            "    int a=12, b=15;\n"
            "    cout<<\"HCF: \"<<gcd(a,b)<<\" LCM: \"<<(a*b)/gcd(a,b);\n"
            "}"
        ),
        # NOTE(review): this snippet was missing entirely; reconstructed to
        # mirror the C++ version.
        "java": (
            "static int gcd(int a, int b) { return b==0?a:gcd(b, a%b); }\n"
            "public static void main(String[] args) {\n"
            "    int a=12, b=15;\n"
            "    System.out.println(\"HCF: \" + gcd(a,b) + \" LCM: \" + (a*b)/gcd(a,b));\n"
            "}"
        ),
    },
    # NOTE(review): the key, the prompts and the first line-and-a-half of the
    # python snippet of this entry were missing (only the text from " 1:"
    # onward survived); they are reconstructed guesses.
    "prime_number": {
        "prompts": [
            "check prime number",
            "is number prime",
            "prime or not",
            "prime number logic",
        ],
        "python": (
            "num = int(input())\n"
            "if num > 1:\n"
            "    for i in range(2, int(num**0.5)+1):\n"
            "        if (num % i) == 0: print('Not Prime'); break\n"
            "    else: print('Prime')\n"
            "else: print('Not Prime')"
        ),
        "cpp": (
            "bool isPrime(int n) {\n"
            "    if (n <= 1) return false;\n"
            "    for (int i = 2; i * i <= n; i++)\n"
            "        if (n % i == 0) return false;\n"
            "    return true;\n"
            "}"
        ),
        "java": (
            "boolean isPrime(int n) {\n"
            "    if (n <= 1) return false;\n"
            "    for (int i = 2; i * i <= n; i++)\n"
            "        if (n % i == 0) return false;\n"
            "    return true;\n"
            "}"
        ),
    },
    "armstrong": {
        "prompts": [
            "armstrong number",
            "sum of cubes of digits",
            "check armstrong",
            "narcissistic number",
        ],
        "python": (
            "n = int(input())\n"
            "sum = 0\n"
            "temp = n\n"
            "while temp > 0:\n"
            "    digit = temp % 10\n"
            "    sum += digit ** 3\n"
            "    temp //= 10\n"
            "if n == sum: print('Armstrong')\n"
            "else: print('Not Armstrong')"
        ),
        "cpp": (
            "int n, r, sum=0, temp;\n"
            "cin >> n;\n"
            "temp = n;\n"
            "while(n>0){r=n%10;sum=sum+(r*r*r);n=n/10;}\n"
            "if(temp==sum) cout<<\"Armstrong\";\n"
            "else cout<<\"Not\";"
        ),
        "java": (
            "int n=153, r, sum=0, temp;\n"
            "temp = n;\n"
            "while(n>0){r=n%10;sum=sum+(r*r*r);n=n/10;}\n"
            "if(temp==sum) System.out.println(\"Armstrong\");\n"
            "else System.out.println(\"Not\");"
        ),
    },
    "palindrome_number": {
        "prompts": [
            "palindrome number",
            "reverse number equal",
            "check number palindrome",
        ],
        "python": (
            "n = input()\n"
            "if n == n[::-1]: print('Palindrome')\n"
            "else: print('Not Palindrome')"
        ),
        "cpp": (
            "int n, r, sum=0, temp;\n"
            "cin >> n;\n"
            "temp = n;\n"
            "while(n>0){r=n%10;sum=(sum*10)+r;n=n/10;}\n"
            "if(temp==sum) cout<<\"Palindrome\";\n"
            "else cout<<\"Not\";"
        ),
        "java": (
            "int n=121, r, sum=0, temp;\n"
            "temp = n;\n"
            "while(n>0){r=n%10;sum=(sum*10)+r;n=n/10;}\n"
            "if(temp==sum) System.out.println(\"Palindrome\");\n"
            "else System.out.println(\"Not\");"
        ),
    },
    "sum_of_digits": {
        "prompts": [
            "sum of digits",
            "add all digits of number",
            "digit sum logic",
        ],
        "python": (
            "n = int(input())\n"
            "s = 0\n"
            "while n > 0:\n"
            "    s += n % 10\n"
            "    n //= 10\n"
            "print(s)"
        ),
        "cpp": (
            "int n, sum=0;\n"
            "cin >> n;\n"
            "while(n>0) { sum += n%10; n/=10; }\n"
            "cout << sum;"
        ),
        "java": (
            "int n=123, sum=0;\n"
            "while(n>0) { sum += n%10; n/=10; }\n"
            "System.out.println(sum);"
        ),
    },
    "decimal_to_binary": {
        "prompts": [
            "decimal to binary",
            "convert dec to bin",
            "binary of number",
        ],
        "python": (
            "n = int(input())\n"
            "print(bin(n).replace('0b', ''))"
        ),
        "cpp": (
            "void decToBinary(int n) {\n"
            "    int binaryNum[32];\n"
            "    int i = 0;\n"
            "    while (n > 0) {\n"
            "        binaryNum[i] = n % 2;\n"
            "        n = n / 2;\n"
            "        i++;\n"
            "    }\n"
            "    for (int j = i - 1; j >= 0; j--) cout << binaryNum[j];\n"
            "}"
        ),
        "java": (
            "void decToBinary(int n) {\n"
            "    System.out.println(Integer.toBinaryString(n));\n"
            "}"
        ),
    },
    # ---------------------------
    # ARRAYS & MATRICES
    # ---------------------------
    "bubble_sort": {
        "prompts": [
            "bubble sort",
            "sort array ascending",
            "sorting algorithm",
            "arrange elements",
        ],
        "python": (
            "arr = [64, 34, 25, 12, 22, 11, 90]\n"
            "n = len(arr)\n"
            "for i in range(n):\n"
            "    for j in range(0, n-i-1):\n"
            "        if arr[j] > arr[j+1]: arr[j], arr[j+1] = arr[j+1], arr[j]\n"
            "print(arr)"
        ),
        "cpp": (
            "void bubbleSort(int arr[], int n) {\n"
            "    for (int i = 0; i < n-1; i++)\n"
            "        for (int j = 0; j < n-i-1; j++)\n"
            "            if (arr[j] > arr[j+1]) swap(arr[j], arr[j+1]);\n"
            "}"
        ),
        "java": (
            "void bubbleSort(int arr[]) {\n"
            "    int n = arr.length;\n"
            "    for (int i = 0; i < n-1; i++)\n"
            "        for (int j = 0; j < n-i-1; j++)\n"
            "            if (arr[j] > arr[j+1]) {\n"
            "                int temp = arr[j]; arr[j] = arr[j+1]; arr[j+1] = temp;\n"
            "            }\n"
            "}"
        ),
    },
    "linear_search": {
        "prompts": [
            "linear search",
            "find element in array",
            "search number list",
        ],
        "python": (
            "arr = [10, 20, 30, 40]\n"
            "x = 30\n"
            "if x in arr: print('Found')\n"
            "else: print('Not Found')"
        ),
        "cpp": (
            "int search(int arr[], int n, int x) {\n"
            "    for (int i = 0; i < n; i++)\n"
            "        if (arr[i] == x) return i;\n"
            "    return -1;\n"
            "}"
        ),
        "java": (
            "int search(int arr[], int x) {\n"
            "    for (int i = 0; i < arr.length; i++)\n"
            "        if (arr[i] == x) return i;\n"
            "    return -1;\n"
            "}"
        ),
    },
    "largest_in_array": {
        "prompts": [
            "largest element in array",
            "max in array",
            "find biggest number in list",
        ],
        "python": (
            "arr = [10, 324, 45, 90, 9808]\n"
            "print(max(arr))"
        ),
        "cpp": (
            "int largest(int arr[], int n) {\n"
            "    int max = arr[0];\n"
            "    for (int i = 1; i < n; i++)\n"
            "        if (arr[i] > max) max = arr[i];\n"
            "    return max;\n"
            "}"
        ),
        "java": (
            "int largest(int arr[]) {\n"
            "    int max = arr[0];\n"
            "    for (int i = 1; i < arr.length; i++)\n"
            "        if (arr[i] > max) max = arr[i];\n"
            "    return max;\n"
            "}"
        ),
    },
    "matrix_add": {
        "prompts": [
            "matrix addition",
            "add two matrices",
            "sum of matrix",
        ],
        "python": (
            "X = [[1,2,3], [4 ,5,6], [7 ,8,9]]\n"
            "Y = [[9,8,7], [6,5,4], [3,2,1]]\n"
            "result = [[X[i][j] + Y[i][j] for j in range(len(X[0]))] for i in range(len(X))]\n"
            "for r in result: print(r)"
        ),
        # NOTE(review): everything after the inner loop's `cout<` was
        # missing; the rest of this snippet is reconstructed.
        "cpp": (
            "void addMatrix(int A[3][3], int B[3][3]) {\n"
            "    for(int i=0;i<3;i++) {\n"
            "        for(int j=0;j<3;j++) cout<<A[i][j]+B[i][j]<<\" \";\n"
            "        cout<<endl;\n"
            "    }\n"
            "}"
        ),
        # NOTE(review): this snippet was missing entirely; reconstructed to
        # mirror the C++ version.
        "java": (
            "void addMatrix(int A[][], int B[][]) {\n"
            "    for(int i=0;i<3;i++) {\n"
            "        for(int j=0;j<3;j++) System.out.print(A[i][j]+B[i][j]+\" \");\n"
            "        System.out.println();\n"
            "    }\n"
            "}"
        ),
    },
    # ---------------------------
    # STRINGS
    # ---------------------------
    # NOTE(review): the section header above, this entry's key, prompts,
    # python snippet and the start of the cpp snippet (up to "cin >") were
    # missing from the reviewed copy; they are reconstructed guesses.  Other
    # entries may have been lost in the same gap.
    "palindrome_string": {
        "prompts": [
            "palindrome string",
            "check string palindrome",
            "is string same when reversed",
        ],
        "python": (
            "s = input()\n"
            "if s == s[::-1]: print('Palindrome')\n"
            "else: print('Not Palindrome')"
        ),
        "cpp": (
            "string s; cin >> s;\n"
            "string rev = string(s.rbegin(), s.rend());\n"
            "if (s == rev) cout << \"Palindrome\";\n"
            "else cout << \"Not\";"
        ),
        "java": (
            "String str = \"madam\", rev = \"\";\n"
            "for (int i = str.length() - 1; i >= 0; i--) rev = rev + str.charAt(i);\n"
            "if (str.equals(rev)) System.out.println(\"Palindrome\");"
        ),
    },
    "vowel_count": {
        "prompts": [
            "count vowels",
            "number of vowels in string",
            "vowel consonant count",
        ],
        "python": (
            "s = input().lower()\n"
            "count = 0\n"
            "for char in s:\n"
            "    if char in 'aeiou': count += 1\n"
            "print(count)"
        ),
        "cpp": (
            "string s; cin >> s;\n"
            "int count = 0;\n"
            "for(char c : s) {\n"
            "    if(c=='a'||c=='e'||c=='i'||c=='o'||c=='u') count++;\n"
            "}\n"
            "cout << count;"
        ),
        # NOTE(review): everything after `for(int i=0; i` was missing; the
        # rest of this snippet is reconstructed to mirror the C++ version.
        "java": (
            "String s = \"hello\";\n"
            "int count = 0;\n"
            "for(int i=0; i<s.length(); i++) {\n"
            "    char c = s.charAt(i);\n"
            "    if(c=='a'||c=='e'||c=='i'||c=='o'||c=='u') count++;\n"
            "}\n"
            "System.out.println(count);"
        ),
    },
    # NOTE(review): this entry's key, prompts, python snippet and the
    # "#include <iostream>" line of the cpp snippet were missing; they are
    # reconstructed guesses.
    "hello_world": {
        "prompts": [
            "hello world",
            "print hello world",
            "first program",
            "basic hello program",
        ],
        "python": "print('Hello World')",
        "cpp": (
            "#include <iostream>\n"
            "using namespace std;\n"
            "int main() {\n"
            "    cout << \"Hello World\";\n"
            "    return 0;\n"
            "}"
        ),
        "java": (
            "public class Main {\n"
            "    public static void main(String[] args) {\n"
            "        System.out.println(\"Hello World\");\n"
            "    }\n"
            "}"
        ),
    },
}


# ============================
# 2. GENERATOR LOGIC
# ============================
def _build_rows(samples_per_algo, rng):
    """Return ``samples_per_algo`` random [query, language, code] rows per algorithm."""
    rows = []
    for templates in ALGORITHMS.values():
        base_prompts = templates["prompts"]
        for _ in range(samples_per_algo):
            # Draw order (prompt, language, phrasing) is kept fixed so a
            # given seed always yields the same rows.
            prompt_base = rng.choice(base_prompts)
            lang = rng.choice(LANGUAGES)
            query = rng.choice(QUERY_TEMPLATES).format(
                prompt=prompt_base, lang=lang
            )
            rows.append([query, lang, templates[lang]])
    return rows


def generate_dataset(output_file=None, samples_per_algo=None, seed=None):
    """Build the randomized prompt/code dataset and write it as CSV.

    Args:
        output_file: Path of the CSV file to (over)write.  Defaults to the
            module-level OUTPUT_FILE.
        samples_per_algo: Number of rows generated for each ALGORITHMS
            entry.  Defaults to the module-level SAMPLES_PER_ALGO.
        seed: Optional seed.  When given, a private ``random.Random(seed)``
            is used so the output is reproducible; when None, the global
            ``random`` module state is used, as before.

    Returns:
        The list of ``[prompt, language, code]`` rows that were written
        (the header row is not included).
    """
    # Defaults are resolved at call time so later changes to the module
    # constants are honoured.
    if output_file is None:
        output_file = OUTPUT_FILE
    if samples_per_algo is None:
        samples_per_algo = SAMPLES_PER_ALGO
    rng = random if seed is None else random.Random(seed)

    print("Generating THE ULTIMATE Rosetta Stone Dataset...")
    data = _build_rows(samples_per_algo, rng)

    # newline='' lets the csv module control line endings; the code column
    # contains embedded newlines, which csv.writer quotes.
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(["prompt", "language", "code"])
        writer.writerows(data)

    print(f"✅ Created {len(data)} training samples covering {len(ALGORITHMS)} major topics.")
    print(f"Saved to {output_file}")
    return data


if __name__ == "__main__":
    generate_dataset()