|
|
<!DOCTYPE html> |
|
|
<html lang="en"> |
|
|
<head> |
|
|
<meta charset="UTF-8"> |
|
|
<meta name="viewport" content="width=device-width, initial-scale=1.0"> |
|
|
<title>Code Dataset Generator for Falcon 40B</title> |
|
|
<script src="https://cdn.tailwindcss.com"></script> |
|
|
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css"> |
|
|
<style>
  /* Upload drop target: dashed outline that animates when its state changes. */
  .file-drop-area {
    transition: all 0.3s ease;
    border: 2px dashed #6366f1;
  }

  /* Applied through the `active` class binding while `dragOver` is true. */
  .file-drop-area.active {
    background-color: #eef2ff;
    border-color: #4f46e5;
  }

  /* Animate width changes of the processing progress indicator. */
  .progress-bar {
    transition: width 0.3s ease;
  }

  /* Dark, monospaced panel used for the sample's input code. */
  .code-preview {
    color: #f8fafc;
    background-color: #1e293b;
    font-family: 'Courier New', monospace;
  }

  /* NOTE(review): no element in the visible markup uses this class; confirm before removing. */
  .language-badge {
    right: -10px;
    top: -10px;
  }

  /* NOTE(review): transition classes with no matching <transition name="slide-fade"> in the visible markup. */
  .slide-fade-enter-active,
  .slide-fade-leave-active {
    transition: all 0.3s ease;
  }

  .slide-fade-enter-from,
  .slide-fade-leave-to {
    transform: translateY(10px);
    opacity: 0;
  }
</style>
|
|
</head> |
|
|
<body class="bg-gray-50 min-h-screen"> |
|
|
<div class="container mx-auto px-4 py-8" id="app"> |
|
|
|
|
|
<!-- Page banner: title and one-line description of the tool. -->
<header class="mb-10 text-center">
  <h1 class="text-4xl font-bold text-indigo-700 mb-2">
    <!-- Decorative icon: hidden from assistive technology. -->
    <i class="fas fa-database mr-2" aria-hidden="true"></i>Code Dataset Generator
  </h1>
  <p class="text-lg text-gray-600 max-w-2xl mx-auto">
    Transform your source code into fine-tuning datasets for Falcon 40B and other LLMs
  </p>
</header>
|
|
|
|
|
|
|
|
<div class="grid grid-cols-1 lg:grid-cols-3 gap-8"> |
|
|
|
|
|
<div class="lg:col-span-2 bg-white rounded-xl shadow-md overflow-hidden"> |
|
|
<div class="p-6"> |
|
|
<h2 class="text-2xl font-semibold text-gray-800 mb-4"> |
|
|
<i class="fas fa-upload mr-2 text-indigo-600"></i>Upload Source Files |
|
|
</h2> |
|
|
|
|
|
|
|
|
<!-- Drag-and-drop target; `dragOver` toggles the highlighted `active` style. -->
<div
  id="dropArea"
  class="file-drop-area rounded-lg p-8 text-center cursor-pointer mb-6"
  :class="{ 'active': dragOver }"
  @dragover.prevent="dragOver = true"
  @dragleave="dragOver = false"
  @drop.prevent="handleDrop"
>
  <div class="flex flex-col items-center justify-center space-y-3">
    <i class="fas fa-cloud-upload-alt text-4xl text-indigo-500" aria-hidden="true"></i>
    <p class="text-lg font-medium text-gray-700">Drag &amp; drop your source files here</p>
    <p class="text-sm text-gray-500">or</p>
    <!-- Hidden native file picker, opened by the button below through its template ref. -->
    <input
      ref="fileInput"
      type="file"
      id="fileInput"
      class="hidden"
      multiple
      @change="handleFileSelect"
      accept=".py,.cpp,.c,.go,.rs,.js,.java,.php,.rb,.ts"
    >
    <!--
      Vue 3 template expressions only see a restricted set of globals, and
      `document` is not one of them, so the picker is reached via $refs.
    -->
    <button
      type="button"
      @click="$refs.fileInput.click()"
      class="px-4 py-2 bg-indigo-600 text-white rounded-md hover:bg-indigo-700 transition"
    >
      Browse Files
    </button>
    <p class="text-xs text-gray-400 mt-2">Supported: Python, C/C++, Rust, Go, JavaScript, Java, PHP, Ruby, TypeScript</p>
  </div>
</div>
|
|
|
|
|
|
|
|
<!-- List of queued files; rendered only once at least one file has been added. -->
<div v-if="selectedFiles.length > 0" class="mb-6">
  <h3 class="text-lg font-medium text-gray-700 mb-3">Selected Files</h3>
  <div class="space-y-2">
    <div
      v-for="(file, index) in selectedFiles"
      :key="index"
      class="flex items-center justify-between bg-gray-50 p-3 rounded-md"
    >
      <div class="flex items-center space-x-3">
        <i :class="getFileIcon(file.name)" class="text-indigo-500" aria-hidden="true"></i>
        <span class="text-sm font-medium text-gray-700 truncate max-w-xs">{{ file.name }}</span>
        <span class="text-xs text-gray-500">{{ formatFileSize(file.size) }}</span>
      </div>
      <!-- Icon-only control: the aria-label supplies its accessible name. -->
      <button
        type="button"
        @click="removeFile(index)"
        class="text-red-500 hover:text-red-700"
        :aria-label="'Remove ' + file.name"
      >
        <i class="fas fa-times" aria-hidden="true"></i>
      </button>
    </div>
  </div>
</div>
|
|
|
|
|
|
|
|
<!-- Processing options, each bound to a field of the Vue app's data. -->
<div class="mb-6">
  <h3 class="text-lg font-medium text-gray-700 mb-3">Processing Options</h3>
  <div class="grid grid-cols-1 md:grid-cols-2 gap-4">
    <div>
      <!-- for/id pairs tie each label to its select for click and screen-reader support. -->
      <label for="outputFormat" class="block text-sm font-medium text-gray-700 mb-1">Output Format</label>
      <select id="outputFormat" v-model="outputFormat" class="w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500">
        <option value="jsonl">JSONL (Alpaca format)</option>
        <option value="hf">HuggingFace Dataset</option>
        <option value="openchat">OpenChat Format</option>
      </select>
    </div>
    <div>
      <label for="languageDetection" class="block text-sm font-medium text-gray-700 mb-1">Language Detection</label>
      <select id="languageDetection" v-model="languageDetection" class="w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500">
        <option value="auto">Auto-detect (recommended)</option>
        <option value="manual">Manual selection</option>
      </select>
    </div>
  </div>
  <!-- Checkbox toggles; each input is labelled by its wrapping <label>. -->
  <div class="mt-4 space-y-2">
    <label class="inline-flex items-center">
      <input type="checkbox" v-model="extractFunctions" class="rounded border-gray-300 text-indigo-600 shadow-sm focus:border-indigo-500 focus:ring-indigo-500">
      <span class="ml-2 text-sm text-gray-700">Extract functions/methods</span>
    </label>
    <label class="inline-flex items-center">
      <input type="checkbox" v-model="extractClasses" class="rounded border-gray-300 text-indigo-600 shadow-sm focus:border-indigo-500 focus:ring-indigo-500">
      <span class="ml-2 text-sm text-gray-700">Extract classes</span>
    </label>
    <label class="inline-flex items-center">
      <input type="checkbox" v-model="includeComments" class="rounded border-gray-300 text-indigo-600 shadow-sm focus:border-indigo-500 focus:ring-indigo-500">
      <span class="ml-2 text-sm text-gray-700">Include comments/docstrings</span>
    </label>
    <label class="inline-flex items-center">
      <input type="checkbox" v-model="cleanCode" class="rounded border-gray-300 text-indigo-600 shadow-sm focus:border-indigo-500 focus:ring-indigo-500">
      <span class="ml-2 text-sm text-gray-700">Clean code (remove noise)</span>
    </label>
  </div>
</div>
|
|
|
|
|
|
|
|
<!-- Starts processing; disabled while the queue is empty or a run is in progress. -->
<button
  type="button"
  @click="processFiles"
  :disabled="selectedFiles.length === 0 || processing"
  class="w-full py-3 px-4 bg-indigo-600 hover:bg-indigo-700 text-white font-medium rounded-md transition flex items-center justify-center"
  :class="{ 'opacity-50 cursor-not-allowed': selectedFiles.length === 0 || processing }"
>
  <span v-if="!processing">
    <i class="fas fa-cogs mr-2" aria-hidden="true"></i>Generate Dataset
  </span>
  <span v-else>
    <i class="fas fa-spinner fa-spin mr-2" aria-hidden="true"></i>Processing...
  </span>
</button>
|
|
</div> |
|
|
</div> |
|
|
|
|
|
|
|
|
<div class="bg-white rounded-xl shadow-md overflow-hidden"> |
|
|
<div class="p-6"> |
|
|
<h2 class="text-2xl font-semibold text-gray-800 mb-4"> |
|
|
<i class="fas fa-eye mr-2 text-indigo-600"></i>Dataset Preview |
|
|
</h2> |
|
|
|
|
|
|
|
|
<!-- Progress indicator, shown only while `processing` is true. -->
<div v-if="processing" class="mb-6">
  <div class="flex justify-between text-sm text-gray-600 mb-1">
    <span>Processing files...</span>
    <span>{{ processedFiles }} / {{ selectedFiles.length }}</span>
  </div>
  <div
    class="w-full bg-gray-200 rounded-full h-2.5"
    role="progressbar"
    aria-label="Processing progress"
    aria-valuemin="0"
    :aria-valuemax="selectedFiles.length"
    :aria-valuenow="processedFiles"
  >
    <!--
      Width is clamped to 100% and falls back to 0% for an empty list, so a
      list that shrinks mid-run cannot yield "NaN%" or an overflowing bar.
    -->
    <div
      class="progress-bar bg-indigo-600 h-2.5 rounded-full"
      :style="{ width: (selectedFiles.length > 0 ? Math.min(100, (processedFiles / selectedFiles.length) * 100) : 0) + '%' }"
    ></div>
  </div>
</div>
|
|
|
|
|
|
|
|
<!-- Generated samples; rendered once `previewData` has been populated. -->
<div v-if="previewData" class="space-y-4">
  <div class="flex justify-between items-center">
    <h3 class="text-lg font-medium text-gray-700">Generated Samples</h3>
    <span class="text-xs bg-indigo-100 text-indigo-800 px-2 py-1 rounded-full">
      {{ previewData.length }} items
    </span>
  </div>

  <!-- Sample picker: the option value is the index into `previewData`. -->
  <div class="relative">
    <select
      v-model="selectedSampleIndex"
      aria-label="Sample to preview"
      class="w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500"
    >
      <option v-for="(item, index) in previewData" :key="index" :value="index">
        Sample {{ index + 1 }} - {{ item.instruction.substring(0, 30) }}...
      </option>
    </select>
  </div>

  <!-- Detail view; guarded so a missing sample never triggers a property access on undefined. -->
  <div v-if="selectedSampleIndex !== null && previewData[selectedSampleIndex]" class="space-y-3">
    <div>
      <!-- Captions are headings rather than <label>s because they label no form control. -->
      <h4 class="block text-sm font-medium text-gray-700 mb-1">Instruction</h4>
      <div class="bg-gray-50 p-3 rounded-md text-sm">
        {{ previewData[selectedSampleIndex].instruction }}
      </div>
    </div>

    <div>
      <h4 class="block text-sm font-medium text-gray-700 mb-1">Input Code</h4>
      <div class="code-preview p-3 rounded-md text-sm overflow-x-auto">
        <pre>{{ previewData[selectedSampleIndex].input }}</pre>
      </div>
    </div>

    <div>
      <h4 class="block text-sm font-medium text-gray-700 mb-1">Output</h4>
      <div class="bg-gray-50 p-3 rounded-md text-sm">
        {{ previewData[selectedSampleIndex].output }}
      </div>
    </div>
  </div>

  <button
    v-if="previewData.length > 0"
    type="button"
    @click="downloadDataset"
    class="w-full py-2 px-4 bg-green-600 hover:bg-green-700 text-white font-medium rounded-md transition flex items-center justify-center mt-4"
  >
    <i class="fas fa-download mr-2" aria-hidden="true"></i>Download Dataset
  </button>
</div>
<!-- Placeholder shown until a dataset has been generated. -->
<div v-else class="text-center py-10">
  <i class="fas fa-code text-4xl text-gray-300 mb-3" aria-hidden="true"></i>
  <p class="text-gray-500">Your processed dataset will appear here</p>
</div>
|
|
</div> |
|
|
</div> |
|
|
</div> |
|
|
|
|
|
|
|
|
<!-- Marketing section: three static feature cards in a responsive grid. -->
<div class="mt-16">
  <h2 class="text-2xl font-bold text-center text-gray-800 mb-8">Key Features</h2>
  <div class="grid grid-cols-1 md:grid-cols-3 gap-6">

    <!-- Card 1: language coverage -->
    <div class="bg-white p-6 rounded-xl shadow-sm border border-gray-100">
      <div class="text-indigo-600 mb-3"><i class="fas fa-language text-3xl"></i></div>
      <h3 class="text-lg font-semibold mb-2">Multi-language Support</h3>
      <p class="text-gray-600">Automatically detects and processes code in Python, C/C++, Rust, Go, JavaScript and more.</p>
    </div>

    <!-- Card 2: prompt generation -->
    <div class="bg-white p-6 rounded-xl shadow-sm border border-gray-100">
      <div class="text-indigo-600 mb-3"><i class="fas fa-robot text-3xl"></i></div>
      <h3 class="text-lg font-semibold mb-2">Smart Prompt Engineering</h3>
      <p class="text-gray-600">Automatically generates meaningful instruction-output pairs from your source code.</p>
    </div>

    <!-- Card 3: target model -->
    <div class="bg-white p-6 rounded-xl shadow-sm border border-gray-100">
      <div class="text-indigo-600 mb-3"><i class="fas fa-database text-3xl"></i></div>
      <h3 class="text-lg font-semibold mb-2">Falcon 40B Optimized</h3>
      <p class="text-gray-600">Tokenization and formatting specifically optimized for Falcon 40B model fine-tuning.</p>
    </div>

  </div>
</div>
|
|
|
|
|
|
|
|
<!-- Page footer: tagline and copyright notice. -->
<footer class="mt-16 text-center text-gray-500 text-sm">
  <p>Code Dataset Generator - Transform your source code into LLM training data</p>
  <p class="mt-1">© 2023 AI Engineering Team. All rights reserved.</p>
</footer>
|
|
</div> |
|
|
|
|
|
<script> |
|
|
const app = { |
|
|
data() { |
|
|
return { |
|
|
dragOver: false, |
|
|
selectedFiles: [], |
|
|
outputFormat: 'jsonl', |
|
|
languageDetection: 'auto', |
|
|
extractFunctions: true, |
|
|
extractClasses: true, |
|
|
includeComments: true, |
|
|
cleanCode: false, |
|
|
processing: false, |
|
|
processedFiles: 0, |
|
|
previewData: null, |
|
|
selectedSampleIndex: 0 |
|
|
} |
|
|
}, |
|
|
methods: { |
|
|
handleDrop(e) { |
|
|
this.dragOver = false; |
|
|
const files = Array.from(e.dataTransfer.files); |
|
|
this.addFiles(files); |
|
|
}, |
|
|
handleFileSelect(e) { |
|
|
const files = Array.from(e.target.files); |
|
|
this.addFiles(files); |
|
|
e.target.value = ''; |
|
|
}, |
|
|
addFiles(files) { |
|
|
const validExtensions = ['.py', '.cpp', '.c', '.go', '.rs', '.js', '.java', '.php', '.rb', '.ts']; |
|
|
const filteredFiles = files.filter(file => { |
|
|
const ext = '.' + file.name.split('.').pop().toLowerCase(); |
|
|
return validExtensions.includes(ext); |
|
|
}); |
|
|
|
|
|
if (filteredFiles.length < files.length) { |
|
|
alert('Some files were ignored. Only source code files are supported.'); |
|
|
} |
|
|
|
|
|
this.selectedFiles = [...this.selectedFiles, ...filteredFiles]; |
|
|
}, |
|
|
removeFile(index) { |
|
|
this.selectedFiles.splice(index, 1); |
|
|
}, |
|
|
getFileIcon(filename) { |
|
|
const ext = filename.split('.').pop().toLowerCase(); |
|
|
const icons = { |
|
|
py: 'fab fa-python', |
|
|
cpp: 'fas fa-file-code', |
|
|
c: 'fas fa-file-code', |
|
|
h: 'fas fa-file-code', |
|
|
go: 'fab fa-golang', |
|
|
rs: 'fas fa-rust', |
|
|
js: 'fab fa-js-square', |
|
|
java: 'fab fa-java', |
|
|
php: 'fab fa-php', |
|
|
rb: 'fas fa-gem', |
|
|
ts: 'fas fa-file-code' |
|
|
}; |
|
|
return icons[ext] || 'fas fa-file-code'; |
|
|
}, |
|
|
formatFileSize(bytes) { |
|
|
if (bytes === 0) return '0 Bytes'; |
|
|
const k = 1024; |
|
|
const sizes = ['Bytes', 'KB', 'MB', 'GB']; |
|
|
const i = Math.floor(Math.log(bytes) / Math.log(k)); |
|
|
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + ' ' + sizes[i]; |
|
|
}, |
|
|
processFiles() { |
|
|
if (this.selectedFiles.length === 0) return; |
|
|
|
|
|
this.processing = true; |
|
|
this.processedFiles = 0; |
|
|
this.previewData = null; |
|
|
|
|
|
|
|
|
const totalFiles = this.selectedFiles.length; |
|
|
const interval = setInterval(() => { |
|
|
this.processedFiles += 1; |
|
|
|
|
|
if (this.processedFiles >= totalFiles) { |
|
|
clearInterval(interval); |
|
|
this.processing = false; |
|
|
this.generatePreviewData(); |
|
|
} |
|
|
}, 800); |
|
|
}, |
|
|
generatePreviewData() { |
|
|
|
|
|
this.previewData = [ |
|
|
{ |
|
|
"instruction": "Explain the purpose of this Python function.", |
|
|
"input": "def get_user_by_id(id):\n return db.query(User).filter(User.id == id).first()", |
|
|
"output": "This function retrieves a user from the database by their ID." |
|
|
}, |
|
|
{ |
|
|
"instruction": "What does this C++ function calculate?", |
|
|
"input": "int factorial(int n) {\n if (n <= 1) return 1;\n return n * factorial(n - 1);\n}", |
|
|
"output": "This function calculates the factorial of a given integer n using recursion." |
|
|
}, |
|
|
{ |
|
|
"instruction": "Convert this JavaScript function to TypeScript with proper typing.", |
|
|
"input": "function greet(name) {\n return `Hello, ${name}!`;\n}", |
|
|
"output": "function greet(name: string): string {\n return `Hello, ${name}!`;\n}" |
|
|
}, |
|
|
{ |
|
|
"instruction": "Optimize this Go function for better performance.", |
|
|
"input": "func sum(numbers []int) int {\n total := 0\n for _, num := range numbers {\n total += num\n }\n return total\n}", |
|
|
"output": "The function is already quite optimized. For very large slices, you might consider parallel processing using goroutines." |
|
|
} |
|
|
]; |
|
|
this.selectedSampleIndex = 0; |
|
|
}, |
|
|
downloadDataset() { |
|
|
if (!this.previewData) return; |
|
|
|
|
|
|
|
|
const dataStr = JSON.stringify(this.previewData, null, 2); |
|
|
const dataBlob = new Blob([dataStr], { type: 'application/json' }); |
|
|
const url = URL.createObjectURL(dataBlob); |
|
|
|
|
|
const link = document.createElement('a'); |
|
|
link.href = url; |
|
|
link.download = 'code_dataset.json'; |
|
|
document.body.appendChild(link); |
|
|
link.click(); |
|
|
document.body.removeChild(link); |
|
|
URL.revokeObjectURL(url); |
|
|
} |
|
|
} |
|
|
}; |
|
|
|
|
|
// The Vue library <script> comes after this inline script in the document,
// so `Vue` is not defined yet when this line runs. Mounting is deferred to
// DOMContentLoaded, by which point every synchronous script has executed.
document.addEventListener('DOMContentLoaded', () => {
  Vue.createApp(app).mount('#app');
});
|
|
</script> |
|
|
<script src="https://unpkg.com/vue@3/dist/vue.global.js"></script> |
|
|
<!-- Attribution badge; rel="noopener noreferrer" keeps the new tabs from reaching back into this page. -->
<p style="border-radius: 8px; text-align: center; font-size: 12px; color: #fff; margin-top: 16px;position: fixed; left: 8px; bottom: 8px; z-index: 10; background: rgba(0, 0, 0, 0.8); padding: 4px 8px;">Made with <img src="https://enzostvs-deepsite.hf.space/logo.svg" alt="DeepSite Logo" style="width: 16px; height: 16px; vertical-align: middle;display:inline-block;margin-right:3px;filter:brightness(0) invert(1);"><a href="https://enzostvs-deepsite.hf.space" style="color: #fff;text-decoration: underline;" target="_blank" rel="noopener noreferrer">DeepSite</a> - <a href="https://enzostvs-deepsite.hf.space?remix=Barbuuuuuuuu/code-dataset-generator" style="color: #fff;text-decoration: underline;" target="_blank" rel="noopener noreferrer">🧬 Remix</a></p></body>
|
|
</html> |