| <!DOCTYPE html> |
| <html lang="en"> |
<head>
  <!-- Document metadata -->
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Code Dataset Generator for Falcon 40B</title>

  <!-- Third-party assets: Tailwind CSS and the Font Awesome 6.4.0 icon stylesheet -->
  <script src="https://cdn.tailwindcss.com"></script>
  <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">

  <style>
    /* Shared easing for the drop zone and the slide-fade transition classes */
    .file-drop-area,
    .slide-fade-enter-active,
    .slide-fade-leave-active {
      transition: all 0.3s ease;
    }

    /* Upload drop zone: dashed outline at rest */
    .file-drop-area {
      border: 2px dashed #6366f1;
    }

    /* "active" is added by the drop zone's :class binding while dragOver is true */
    .file-drop-area.active {
      border-color: #4f46e5;
      background-color: #eef2ff;
    }

    /* Animates width changes of the progress indicator */
    .progress-bar {
      transition: width 0.3s ease;
    }

    /* Dark, monospaced panel used for the sample's input code */
    .code-preview {
      font-family: 'Courier New', monospace;
      background-color: #1e293b;
      color: #f8fafc;
    }

    /* Corner offset for a badge; no element in this file uses this class */
    .language-badge {
      top: -10px;
      right: -10px;
    }

    /* Start/end state of the slide-fade transition; no element in this file uses it */
    .slide-fade-enter-from,
    .slide-fade-leave-to {
      opacity: 0;
      transform: translateY(10px);
    }
  </style>
</head>
| <body class="bg-gray-50 min-h-screen"> |
| <div class="container mx-auto px-4 py-8" id="app"> |
| |
<!-- Page banner: application title and a one-line description -->
<header class="mb-10 text-center">
  <h1 class="text-4xl font-bold text-indigo-700 mb-2">
    <!-- The icon is decorative, so it is hidden from assistive technology -->
    <i class="fas fa-database mr-2" aria-hidden="true"></i>Code Dataset Generator
  </h1>
  <p class="text-lg text-gray-600 max-w-2xl mx-auto">
    Transform your source code into fine-tuning datasets for Falcon 40B and other LLMs
  </p>
</header>
|
|
| |
| <div class="grid grid-cols-1 lg:grid-cols-3 gap-8"> |
| |
| <div class="lg:col-span-2 bg-white rounded-xl shadow-md overflow-hidden"> |
| <div class="p-6"> |
<h2 class="text-2xl font-semibold text-gray-800 mb-4">
  <i class="fas fa-upload mr-2 text-indigo-600" aria-hidden="true"></i>Upload Source Files
</h2>

<!--
  Drop zone. dragOver toggles the "active" class while a drag hovers the area;
  dropped files are passed to handleDrop and picker selections to handleFileSelect.
-->
<div
  id="dropArea"
  class="file-drop-area rounded-lg p-8 text-center cursor-pointer mb-6"
  :class="{ 'active': dragOver }"
  @dragover.prevent="dragOver = true"
  @dragleave="dragOver = false"
  @drop.prevent="handleDrop"
>
  <div class="flex flex-col items-center justify-center space-y-3">
    <i class="fas fa-cloud-upload-alt text-4xl text-indigo-500" aria-hidden="true"></i>
    <p class="text-lg font-medium text-gray-700">Drag &amp; drop your source files here</p>
    <p class="text-sm text-gray-500">or</p>
    <!-- Hidden native picker; it is opened programmatically by the button below -->
    <input
      ref="fileInput"
      type="file"
      id="fileInput"
      class="hidden"
      multiple
      accept=".py,.cpp,.c,.go,.rs,.js,.java,.php,.rb,.ts"
      @change="handleFileSelect"
    >
    <!--
      Vue template expressions cannot reach the global document object, so the
      picker is opened through its template ref instead of getElementById.
    -->
    <button
      type="button"
      class="px-4 py-2 bg-indigo-600 text-white rounded-md hover:bg-indigo-700 transition"
      @click="$refs.fileInput.click()"
    >
      Browse Files
    </button>
    <p class="text-xs text-gray-400 mt-2">Supported: Python, C/C++, Rust, Go, JavaScript, Java, PHP, Ruby, TypeScript</p>
  </div>
</div>
|
|
| |
<!-- Selected files: rendered only when at least one file has been accepted -->
<div v-if="selectedFiles.length > 0" class="mb-6">
  <h3 class="text-lg font-medium text-gray-700 mb-3">Selected Files</h3>
  <ul class="space-y-2">
    <!-- Keyed by position: the list may hold the same file twice, so no unique id exists -->
    <li
      v-for="(file, index) in selectedFiles"
      :key="index"
      class="flex items-center justify-between bg-gray-50 p-3 rounded-md"
    >
      <div class="flex items-center space-x-3">
        <i :class="getFileIcon(file.name)" class="text-indigo-500" aria-hidden="true"></i>
        <span class="text-sm font-medium text-gray-700 truncate max-w-xs">{{ file.name }}</span>
        <span class="text-xs text-gray-500">{{ formatFileSize(file.size) }}</span>
      </div>
      <!-- Icon-only control: the aria-label supplies its accessible name -->
      <button
        type="button"
        class="text-red-500 hover:text-red-700"
        :aria-label="'Remove ' + file.name"
        @click="removeFile(index)"
      >
        <i class="fas fa-times" aria-hidden="true"></i>
      </button>
    </li>
  </ul>
</div>
|
|
| |
<!-- Processing options: each control is bound with v-model to a field returned by data() -->
<div class="mb-6">
  <h3 class="text-lg font-medium text-gray-700 mb-3">Processing Options</h3>
  <div class="grid grid-cols-1 md:grid-cols-2 gap-4">
    <div>
      <!-- for/id pairing names the select for assistive technology and makes the label clickable -->
      <label for="outputFormatSelect" class="block text-sm font-medium text-gray-700 mb-1">Output Format</label>
      <select id="outputFormatSelect" v-model="outputFormat" class="w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500">
        <option value="jsonl">JSONL (Alpaca format)</option>
        <option value="hf">HuggingFace Dataset</option>
        <option value="openchat">OpenChat Format</option>
      </select>
    </div>
    <div>
      <label for="languageDetectionSelect" class="block text-sm font-medium text-gray-700 mb-1">Language Detection</label>
      <select id="languageDetectionSelect" v-model="languageDetection" class="w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500">
        <option value="auto">Auto-detect (recommended)</option>
        <option value="manual">Manual selection</option>
      </select>
    </div>
  </div>
  <!-- Checkboxes are wrapped by their labels, so no for/id pairing is needed here -->
  <div class="mt-4 space-y-2">
    <label class="inline-flex items-center">
      <input type="checkbox" v-model="extractFunctions" class="rounded border-gray-300 text-indigo-600 shadow-sm focus:border-indigo-500 focus:ring-indigo-500">
      <span class="ml-2 text-sm text-gray-700">Extract functions/methods</span>
    </label>
    <label class="inline-flex items-center">
      <input type="checkbox" v-model="extractClasses" class="rounded border-gray-300 text-indigo-600 shadow-sm focus:border-indigo-500 focus:ring-indigo-500">
      <span class="ml-2 text-sm text-gray-700">Extract classes</span>
    </label>
    <label class="inline-flex items-center">
      <input type="checkbox" v-model="includeComments" class="rounded border-gray-300 text-indigo-600 shadow-sm focus:border-indigo-500 focus:ring-indigo-500">
      <span class="ml-2 text-sm text-gray-700">Include comments/docstrings</span>
    </label>
    <label class="inline-flex items-center">
      <input type="checkbox" v-model="cleanCode" class="rounded border-gray-300 text-indigo-600 shadow-sm focus:border-indigo-500 focus:ring-indigo-500">
      <span class="ml-2 text-sm text-gray-700">Clean code (remove noise)</span>
    </label>
  </div>
</div>
|
|
| |
<!-- Primary action: calls processFiles; disabled and dimmed while processing or when no file is selected -->
<button
  class="w-full py-3 px-4 bg-indigo-600 hover:bg-indigo-700 text-white font-medium rounded-md transition flex items-center justify-center"
  :class="processing || selectedFiles.length === 0 ? 'opacity-50 cursor-not-allowed' : ''"
  :disabled="processing || selectedFiles.length === 0"
  @click="processFiles"
>
  <span v-if="processing">
    <i class="fas fa-spinner fa-spin mr-2"></i>Processing...
  </span>
  <span v-else>
    <i class="fas fa-cogs mr-2"></i>Generate Dataset
  </span>
</button>
| </div> |
| </div> |
|
|
| |
| <div class="bg-white rounded-xl shadow-md overflow-hidden"> |
| <div class="p-6"> |
<h2 class="text-2xl font-semibold text-gray-800 mb-4">
  <i class="fas fa-eye mr-2 text-indigo-600" aria-hidden="true"></i>Dataset Preview
</h2>

<!-- Progress indicator: shown only while processing is true -->
<div v-if="processing" class="mb-6">
  <div class="flex justify-between text-sm text-gray-600 mb-1">
    <span>Processing files...</span>
    <span>{{ processedFiles }} / {{ selectedFiles.length }}</span>
  </div>
  <div
    class="w-full bg-gray-200 rounded-full h-2.5"
    role="progressbar"
    aria-label="File processing progress"
    aria-valuemin="0"
    :aria-valuemax="selectedFiles.length"
    :aria-valuenow="Math.min(processedFiles, selectedFiles.length)"
  >
    <!--
      Files can still be removed while processing runs, so the ratio is capped at
      100% and an empty list yields 0% instead of dividing by zero.
    -->
    <div
      class="progress-bar bg-indigo-600 h-2.5 rounded-full"
      :style="{ width: (selectedFiles.length ? Math.min(100, (processedFiles / selectedFiles.length) * 100) : 0) + '%' }"
    ></div>
  </div>
</div>
|
|
| |
<div v-if="previewData" class="space-y-4">
  <!-- Results header with the number of generated samples -->
  <div class="flex justify-between items-center">
    <h3 class="text-lg font-medium text-gray-700">Generated Samples</h3>
    <span class="text-xs bg-indigo-100 text-indigo-800 px-2 py-1 rounded-full">
      {{ previewData.length }} items
    </span>
  </div>

  <!-- Sample picker: the option value is the index into previewData -->
  <div class="relative">
    <select
      v-model="selectedSampleIndex"
      class="w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500"
      aria-label="Sample to preview"
    >
      <!-- The ellipsis is appended only when text beyond the first 30 characters exists -->
      <option v-for="(item, index) in previewData" :key="index" :value="index">
        Sample {{ index + 1 }} - {{ item.instruction.substring(0, 30) + (item.instruction.substring(30) ? '...' : '') }}
      </option>
    </select>
  </div>

  <!-- Detail view: rendered only when the selected index resolves to a sample -->
  <div v-if="previewData[selectedSampleIndex]" class="space-y-3">
    <div>
      <!-- Captions are headings, not labels: none of them belongs to a form control -->
      <h4 class="block text-sm font-medium text-gray-700 mb-1">Instruction</h4>
      <div class="bg-gray-50 p-3 rounded-md text-sm">
        {{ previewData[selectedSampleIndex].instruction }}
      </div>
    </div>

    <div>
      <h4 class="block text-sm font-medium text-gray-700 mb-1">Input Code</h4>
      <div class="code-preview p-3 rounded-md text-sm overflow-x-auto">
        <pre>{{ previewData[selectedSampleIndex].input }}</pre>
      </div>
    </div>

    <div>
      <h4 class="block text-sm font-medium text-gray-700 mb-1">Output</h4>
      <!-- whitespace-pre-wrap keeps the line breaks of multi-line outputs; the text sits flush to avoid stray padding -->
      <div class="bg-gray-50 p-3 rounded-md text-sm whitespace-pre-wrap">{{ previewData[selectedSampleIndex].output }}</div>
    </div>
  </div>

  <button
    v-if="previewData.length > 0"
    type="button"
    class="w-full py-2 px-4 bg-green-600 hover:bg-green-700 text-white font-medium rounded-md transition flex items-center justify-center mt-4"
    @click="downloadDataset"
  >
    <i class="fas fa-download mr-2" aria-hidden="true"></i>Download Dataset
  </button>
</div>

<div v-else class="text-center py-10">
  <!-- Empty state: shown while previewData is null -->
  <i class="fas fa-code text-4xl text-gray-300 mb-3" aria-hidden="true"></i>
  <p class="text-gray-500">Your processed dataset will appear here</p>
</div>
| </div> |
| </div> |
| </div> |
|
|
| |
<!-- Static marketing section: three feature cards; contains no Vue bindings -->
<section class="mt-16">
  <h2 class="text-2xl font-bold text-center text-gray-800 mb-8">Key Features</h2>
  <div class="grid grid-cols-1 md:grid-cols-3 gap-6">
    <div class="bg-white p-6 rounded-xl shadow-sm border border-gray-100">
      <div class="text-indigo-600 mb-3">
        <i class="fas fa-language text-3xl" aria-hidden="true"></i>
      </div>
      <h3 class="text-lg font-semibold mb-2">Multi-language Support</h3>
      <p class="text-gray-600">Automatically detects and processes code in Python, C/C++, Rust, Go, JavaScript and more.</p>
    </div>
    <div class="bg-white p-6 rounded-xl shadow-sm border border-gray-100">
      <div class="text-indigo-600 mb-3">
        <i class="fas fa-robot text-3xl" aria-hidden="true"></i>
      </div>
      <h3 class="text-lg font-semibold mb-2">Smart Prompt Engineering</h3>
      <p class="text-gray-600">Automatically generates meaningful instruction-output pairs from your source code.</p>
    </div>
    <div class="bg-white p-6 rounded-xl shadow-sm border border-gray-100">
      <div class="text-indigo-600 mb-3">
        <i class="fas fa-database text-3xl" aria-hidden="true"></i>
      </div>
      <h3 class="text-lg font-semibold mb-2">Falcon 40B Optimized</h3>
      <p class="text-gray-600">Tokenization and formatting specifically optimized for Falcon 40B model fine-tuning.</p>
    </div>
  </div>
</section>
|
|
| |
<!-- Page footer: tagline followed by the copyright notice -->
<footer class="mt-16 text-sm text-center text-gray-500">
  <p>Code Dataset Generator - Transform your source code into LLM training data</p>
  <p class="mt-1">© 2023 AI Engineering Team. All rights reserved.</p>
</footer>
| </div> |
|
|
| <script> |
/**
 * Root Vue component options for the dataset generator page.
 *
 * The component keeps the list of selected source files, simulates a
 * processing run on a timer, exposes a fixed set of preview samples and lets
 * the user download those samples. No file content is read anywhere in this
 * object.
 */
const app = {
  data() {
    return {
      dragOver: false,            // true while a drag hovers the drop zone
      selectedFiles: [],          // File objects accepted by addFiles()
      outputFormat: 'jsonl',      // read by downloadDataset()
      // The five option fields below are bound to form controls in the
      // template but are not read by any method in this object.
      languageDetection: 'auto',
      extractFunctions: true,
      extractClasses: true,
      includeComments: true,
      cleanCode: false,
      processing: false,          // true while the simulated run is active
      processedFiles: 0,          // progress counter for the current run
      previewData: null,          // null until a run completes, then an array of samples
      selectedSampleIndex: 0      // index into previewData shown in the detail view
    };
  },
  methods: {
    /**
     * Drop handler for the upload area.
     * @param {DragEvent} e - drop event carrying the dragged files
     */
    handleDrop(e) {
      this.dragOver = false;
      const files = Array.from(e.dataTransfer.files);
      this.addFiles(files);
    },

    /**
     * Change handler for the hidden file input.
     * @param {Event} e - change event of the file input
     */
    handleFileSelect(e) {
      const files = Array.from(e.target.files);
      this.addFiles(files);
      // Reset the input so choosing the same file again fires "change" again.
      e.target.value = '';
    },

    /**
     * Appends the files whose extension is supported to selectedFiles and
     * alerts the user when at least one file was rejected.
     * @param {Array<{name: string}>} files - candidate files
     */
    addFiles(files) {
      const validExtensions = ['.py', '.cpp', '.c', '.go', '.rs', '.js', '.java', '.php', '.rb', '.ts'];
      const filteredFiles = files.filter(file => {
        const dot = file.name.lastIndexOf('.');
        // A name without any dot has no extension; without this check a file
        // literally named "js" would be treated as ".js" and accepted.
        if (dot === -1) return false;
        return validExtensions.includes(file.name.slice(dot).toLowerCase());
      });

      if (filteredFiles.length < files.length) {
        alert('Some files were ignored. Only source code files are supported.');
      }

      this.selectedFiles = [...this.selectedFiles, ...filteredFiles];
    },

    /**
     * Removes one entry from selectedFiles.
     * @param {number} index - position of the file to remove
     */
    removeFile(index) {
      this.selectedFiles.splice(index, 1);
    },

    /**
     * Maps a file name to Font Awesome class names for its icon.
     * @param {string} filename
     * @returns {string} icon classes; a generic code-file icon for unknown extensions
     */
    getFileIcon(filename) {
      const fallback = 'fas fa-file-code';
      const icons = {
        py: 'fab fa-python',
        cpp: fallback,
        c: fallback,
        h: fallback,
        go: 'fab fa-golang',
        rs: 'fab fa-rust', // rust is a brand icon, so it needs the "fab" prefix
        js: 'fab fa-js-square',
        java: 'fab fa-java',
        php: 'fab fa-php',
        rb: 'fas fa-gem',
        ts: fallback
      };
      const ext = filename.split('.').pop().toLowerCase();
      // Own-property check so an extension such as "constructor" cannot
      // resolve to a member inherited from Object.prototype.
      return Object.prototype.hasOwnProperty.call(icons, ext) ? icons[ext] : fallback;
    },

    /**
     * Formats a byte count with a binary (1024-based) unit.
     * @param {number} bytes
     * @returns {string} e.g. "1.5 KB"; "0 Bytes" for zero, negative or non-finite input
     */
    formatFileSize(bytes) {
      if (!Number.isFinite(bytes) || bytes <= 0) return '0 Bytes';
      const k = 1024;
      const sizes = ['Bytes', 'KB', 'MB', 'GB', 'TB'];
      const exponent = Math.floor(Math.log(bytes) / Math.log(k));
      // Clamp to the unit table so very large (or fractional) values never
      // index past it and print "undefined" as the unit.
      const i = Math.max(0, Math.min(exponent, sizes.length - 1));
      return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i];
    },

    /**
     * Simulates a processing run: processedFiles advances by one every 800 ms
     * and, once it reaches the number of files selected at start, the preview
     * samples are generated.
     */
    processFiles() {
      // The second condition prevents a second timer from being started while
      // a run is already active.
      if (this.selectedFiles.length === 0 || this.processing) return;

      this.processing = true;
      this.processedFiles = 0;
      this.previewData = null;

      // Captured once: the list may change while the timer is running.
      const totalFiles = this.selectedFiles.length;
      const interval = setInterval(() => {
        this.processedFiles += 1;

        if (this.processedFiles >= totalFiles) {
          clearInterval(interval);
          this.processing = false;
          this.generatePreviewData();
        }
      }, 800);
    },

    /**
     * Fills previewData with four fixed example samples and selects the first.
     * The samples are hard-coded and do not depend on the selected files.
     */
    generatePreviewData() {
      this.previewData = [
        {
          "instruction": "Explain the purpose of this Python function.",
          "input": "def get_user_by_id(id):\n return db.query(User).filter(User.id == id).first()",
          "output": "This function retrieves a user from the database by their ID."
        },
        {
          "instruction": "What does this C++ function calculate?",
          "input": "int factorial(int n) {\n if (n <= 1) return 1;\n return n * factorial(n - 1);\n}",
          "output": "This function calculates the factorial of a given integer n using recursion."
        },
        {
          "instruction": "Convert this JavaScript function to TypeScript with proper typing.",
          "input": "function greet(name) {\n return `Hello, ${name}!`;\n}",
          "output": "function greet(name: string): string {\n return `Hello, ${name}!`;\n}"
        },
        {
          "instruction": "Optimize this Go function for better performance.",
          "input": "func sum(numbers []int) int {\n total := 0\n for _, num := range numbers {\n total += num\n }\n return total\n}",
          "output": "The function is already quite optimized. For very large slices, you might consider parallel processing using goroutines."
        }
      ];
      this.selectedSampleIndex = 0;
    },

    /**
     * Downloads previewData as a file. When outputFormat is "jsonl" the file
     * holds one JSON object per line (code_dataset.jsonl); for every other
     * format it holds a pretty-printed JSON array (code_dataset.json).
     */
    downloadDataset() {
      if (!this.previewData) return;

      const asJsonl = this.outputFormat === 'jsonl';
      const dataStr = asJsonl
        ? this.previewData.map(item => JSON.stringify(item)).join('\n') + '\n'
        : JSON.stringify(this.previewData, null, 2);
      const dataBlob = new Blob([dataStr], {
        type: asJsonl ? 'application/x-ndjson' : 'application/json'
      });
      const url = URL.createObjectURL(dataBlob);

      // A temporary anchor triggers the download, then is removed again.
      const link = document.createElement('a');
      link.href = url;
      link.download = asJsonl ? 'code_dataset.jsonl' : 'code_dataset.json';
      document.body.appendChild(link);
      link.click();
      document.body.removeChild(link);
      URL.revokeObjectURL(url);
    }
  }
};
| |
</script>
<!-- Vue must be loaded before createApp is called; the app object above is a global const, so the script below can still see it -->
<script src="https://unpkg.com/vue@3/dist/vue.global.js"></script>
<script>
  // Mount the component onto the #app container now that the Vue global exists.
  Vue.createApp(app).mount('#app');
</script>
<!-- Attribution badge pinned to the bottom-left corner by its inline styles; it sits outside the Vue-mounted #app container -->
<p style="border-radius: 8px; text-align: center; font-size: 12px; color: #fff; margin-top: 16px;position: fixed; left: 8px; bottom: 8px; z-index: 10; background: rgba(0, 0, 0, 0.8); padding: 4px 8px;">
  Made with
  <img src="https://enzostvs-deepsite.hf.space/logo.svg" alt="DeepSite Logo" style="width: 16px; height: 16px; vertical-align: middle;display:inline-block;margin-right:3px;filter:brightness(0) invert(1);"><a href="https://enzostvs-deepsite.hf.space" style="color: #fff;text-decoration: underline;" target="_blank" rel="noopener noreferrer">DeepSite</a>
  -
  <a href="https://enzostvs-deepsite.hf.space?remix=Barbuuuuuuuu/code-dataset-generator" style="color: #fff;text-decoration: underline;" target="_blank" rel="noopener noreferrer">🧬 Remix</a>
</p>
</body>
| </html> |