<!-- Repository page residue (not part of the document):
Barbuuuuuuuu's picture
Add 2 files
62ad90e verified
-->
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Code Dataset Generator for Falcon 40B</title>
<script src="https://cdn.tailwindcss.com"></script>
<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.4.0/css/all.min.css">
<style>
/* Dashed outline for the drag-and-drop upload zone (#dropArea). */
.file-drop-area {
border: 2px dashed #6366f1;
transition: all 0.3s ease;
}
/* Highlight applied while the `active` class is present; the template binds it to `dragOver`. */
.file-drop-area.active {
border-color: #4f46e5;
background-color: #eef2ff;
}
/* Animates changes to the inline width that the template sets on the progress indicator. */
.progress-bar {
transition: width 0.3s ease;
}
/* Dark, monospaced panel used for the "Input Code" sample preview. */
.code-preview {
font-family: 'Courier New', monospace;
background-color: #1e293b;
color: #f8fafc;
}
/* NOTE(review): no element in the visible markup uses this class; the offsets only take
   effect on a positioned element. Confirm whether it is still needed. */
.language-badge {
top: -10px;
right: -10px;
}
/* NOTE(review): these selectors follow Vue's transition class naming for name="slide-fade",
   but no such transition is visible in the markup. Confirm usage. */
.slide-fade-enter-active, .slide-fade-leave-active {
transition: all 0.3s ease;
}
.slide-fade-enter-from, .slide-fade-leave-to {
opacity: 0;
transform: translateY(10px);
}
</style>
</head>
<body class="bg-gray-50 min-h-screen">
<div class="container mx-auto px-4 py-8" id="app">
<!-- Header -->
<header class="mb-10 text-center">
<h1 class="text-4xl font-bold text-indigo-700 mb-2">
<i class="fas fa-database mr-2"></i>Code Dataset Generator
</h1>
<p class="text-lg text-gray-600 max-w-2xl mx-auto">
Transform your source code into fine-tuning datasets for Falcon 40B and other LLMs
</p>
</header>
<!-- Main Content -->
<div class="grid grid-cols-1 lg:grid-cols-3 gap-8">
<!-- Upload Section -->
<div class="lg:col-span-2 bg-white rounded-xl shadow-md overflow-hidden">
<div class="p-6">
<h2 class="text-2xl font-semibold text-gray-800 mb-4">
<i class="fas fa-upload mr-2 text-indigo-600"></i>Upload Source Files
</h2>
<!-- File Drop Area -->
<div
id="dropArea"
class="file-drop-area rounded-lg p-8 text-center cursor-pointer mb-6"
@dragover.prevent="dragOver = true"
@dragleave="dragOver = false"
@drop.prevent="handleDrop"
:class="{ 'active': dragOver }"
>
<div class="flex flex-col items-center justify-center space-y-3">
<i class="fas fa-cloud-upload-alt text-4xl text-indigo-500"></i>
<p class="text-lg font-medium text-gray-700">Drag & drop your source files here</p>
<p class="text-sm text-gray-500">or</p>
<!-- Hidden native file picker. It is opened through a template ref because Vue 3
     template expressions cannot access globals such as `document`. -->
<input
  ref="fileInput"
  type="file"
  id="fileInput"
  class="hidden"
  multiple
  @change="handleFileSelect"
  accept=".py,.cpp,.c,.go,.rs,.js,.java,.php,.rb,.ts"
>
<button
  type="button"
  @click="$refs.fileInput.click()"
  class="px-4 py-2 bg-indigo-600 text-white rounded-md hover:bg-indigo-700 transition"
>
Browse Files
</button>
<p class="text-xs text-gray-400 mt-2">Supported: Python, C/C++, Rust, Go, JavaScript, Java, PHP, Ruby, TypeScript</p>
</div>
</div>
<!-- Selected Files -->
<div v-if="selectedFiles.length > 0" class="mb-6">
<h3 class="text-lg font-medium text-gray-700 mb-3">Selected Files</h3>
<div class="space-y-2">
<!-- One row per selected file: icon, name, size, and a remove button. -->
<div
  v-for="(file, index) in selectedFiles"
  :key="index"
  class="flex items-center justify-between bg-gray-50 p-3 rounded-md"
>
<div class="flex items-center space-x-3">
<i :class="getFileIcon(file.name)" class="text-indigo-500" aria-hidden="true"></i>
<span class="text-sm font-medium text-gray-700 truncate max-w-xs">{{ file.name }}</span>
<span class="text-xs text-gray-500">{{ formatFileSize(file.size) }}</span>
</div>
<!-- Icon-only button: the accessible name comes from aria-label. -->
<button
  type="button"
  @click="removeFile(index)"
  class="text-red-500 hover:text-red-700"
  :aria-label="'Remove ' + file.name"
>
<i class="fas fa-times" aria-hidden="true"></i>
</button>
</div>
</div>
</div>
<!-- Processing Options -->
<div class="mb-6">
<h3 class="text-lg font-medium text-gray-700 mb-3">Processing Options</h3>
<div class="grid grid-cols-1 md:grid-cols-2 gap-4">
<div>
<label class="block text-sm font-medium text-gray-700 mb-1">Output Format</label>
<select v-model="outputFormat" class="w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500">
<option value="jsonl">JSONL (Alpaca format)</option>
<option value="hf">HuggingFace Dataset</option>
<option value="openchat">OpenChat Format</option>
</select>
</div>
<div>
<label class="block text-sm font-medium text-gray-700 mb-1">Language Detection</label>
<select v-model="languageDetection" class="w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500">
<option value="auto">Auto-detect (recommended)</option>
<option value="manual">Manual selection</option>
</select>
</div>
</div>
<div class="mt-4 space-y-2">
<label class="inline-flex items-center">
<input type="checkbox" v-model="extractFunctions" class="rounded border-gray-300 text-indigo-600 shadow-sm focus:border-indigo-500 focus:ring-indigo-500">
<span class="ml-2 text-sm text-gray-700">Extract functions/methods</span>
</label>
<label class="inline-flex items-center">
<input type="checkbox" v-model="extractClasses" class="rounded border-gray-300 text-indigo-600 shadow-sm focus:border-indigo-500 focus:ring-indigo-500">
<span class="ml-2 text-sm text-gray-700">Extract classes</span>
</label>
<label class="inline-flex items-center">
<input type="checkbox" v-model="includeComments" class="rounded border-gray-300 text-indigo-600 shadow-sm focus:border-indigo-500 focus:ring-indigo-500">
<span class="ml-2 text-sm text-gray-700">Include comments/docstrings</span>
</label>
<label class="inline-flex items-center">
<input type="checkbox" v-model="cleanCode" class="rounded border-gray-300 text-indigo-600 shadow-sm focus:border-indigo-500 focus:ring-indigo-500">
<span class="ml-2 text-sm text-gray-700">Clean code (remove noise)</span>
</label>
</div>
</div>
<!-- Process Button -->
<button
@click="processFiles"
:disabled="selectedFiles.length === 0 || processing"
class="w-full py-3 px-4 bg-indigo-600 hover:bg-indigo-700 text-white font-medium rounded-md transition flex items-center justify-center"
:class="{ 'opacity-50 cursor-not-allowed': selectedFiles.length === 0 || processing }"
>
<span v-if="!processing">
<i class="fas fa-cogs mr-2"></i>Generate Dataset
</span>
<span v-else>
<i class="fas fa-spinner fa-spin mr-2"></i>Processing...
</span>
</button>
</div>
</div>
<!-- Preview Section -->
<div class="bg-white rounded-xl shadow-md overflow-hidden">
<div class="p-6">
<h2 class="text-2xl font-semibold text-gray-800 mb-4">
<i class="fas fa-eye mr-2 text-indigo-600"></i>Dataset Preview
</h2>
<!-- Progress Bar -->
<div v-if="processing" class="mb-6">
<div class="flex justify-between text-sm text-gray-600 mb-1">
<span>Processing files...</span>
<span>{{ processedFiles }} / {{ selectedFiles.length }}</span>
</div>
<div class="w-full bg-gray-200 rounded-full h-2.5">
<div
class="progress-bar bg-indigo-600 h-2.5 rounded-full"
:style="{ width: (processedFiles / selectedFiles.length) * 100 + '%' }"
></div>
</div>
</div>
<!-- Result Preview -->
<div v-if="previewData" class="space-y-4">
<div class="flex justify-between items-center">
<h3 class="text-lg font-medium text-gray-700">Generated Samples</h3>
<span class="text-xs bg-indigo-100 text-indigo-800 px-2 py-1 rounded-full">
{{ previewData.length }} items
</span>
</div>
<!-- Sample Selector -->
<div class="relative">
<select
v-model="selectedSampleIndex"
class="w-full rounded-md border-gray-300 shadow-sm focus:border-indigo-500 focus:ring-indigo-500"
>
<option v-for="(item, index) in previewData" :value="index">
Sample {{ index + 1 }} - {{ item.instruction.substring(0, 30) }}...
</option>
</select>
</div>
<!-- Sample Preview -->
<div v-if="selectedSampleIndex !== null" class="space-y-3">
<div>
<label class="block text-sm font-medium text-gray-700 mb-1">Instruction</label>
<div class="bg-gray-50 p-3 rounded-md text-sm">
{{ previewData[selectedSampleIndex].instruction }}
</div>
</div>
<div>
<label class="block text-sm font-medium text-gray-700 mb-1">Input Code</label>
<div class="code-preview p-3 rounded-md text-sm overflow-x-auto">
<pre>{{ previewData[selectedSampleIndex].input }}</pre>
</div>
</div>
<div>
<label class="block text-sm font-medium text-gray-700 mb-1">Output</label>
<div class="bg-gray-50 p-3 rounded-md text-sm">
{{ previewData[selectedSampleIndex].output }}
</div>
</div>
</div>
<!-- Download Button -->
<button
v-if="previewData.length > 0"
@click="downloadDataset"
class="w-full py-2 px-4 bg-green-600 hover:bg-green-700 text-white font-medium rounded-md transition flex items-center justify-center mt-4"
>
<i class="fas fa-download mr-2"></i>Download Dataset
</button>
</div>
<!-- Empty State -->
<div v-else class="text-center py-10">
<i class="fas fa-code text-4xl text-gray-300 mb-3"></i>
<p class="text-gray-500">Your processed dataset will appear here</p>
</div>
</div>
</div>
</div>
<!-- Features Section -->
<div class="mt-16">
<h2 class="text-2xl font-bold text-center text-gray-800 mb-8">Key Features</h2>
<div class="grid grid-cols-1 md:grid-cols-3 gap-6">
<div class="bg-white p-6 rounded-xl shadow-sm border border-gray-100">
<div class="text-indigo-600 mb-3">
<i class="fas fa-language text-3xl"></i>
</div>
<h3 class="text-lg font-semibold mb-2">Multi-language Support</h3>
<p class="text-gray-600">Automatically detects and processes code in Python, C/C++, Rust, Go, JavaScript and more.</p>
</div>
<div class="bg-white p-6 rounded-xl shadow-sm border border-gray-100">
<div class="text-indigo-600 mb-3">
<i class="fas fa-robot text-3xl"></i>
</div>
<h3 class="text-lg font-semibold mb-2">Smart Prompt Engineering</h3>
<p class="text-gray-600">Automatically generates meaningful instruction-output pairs from your source code.</p>
</div>
<div class="bg-white p-6 rounded-xl shadow-sm border border-gray-100">
<div class="text-indigo-600 mb-3">
<i class="fas fa-database text-3xl"></i>
</div>
<h3 class="text-lg font-semibold mb-2">Falcon 40B Optimized</h3>
<p class="text-gray-600">Tokenization and formatting specifically optimized for Falcon 40B model fine-tuning.</p>
</div>
</div>
</div>
<!-- Footer -->
<footer class="mt-16 text-center text-gray-500 text-sm">
<p>Code Dataset Generator - Transform your source code into LLM training data</p>
<p class="mt-1">© 2023 AI Engineering Team. All rights reserved.</p>
</footer>
</div>
<script>
/**
 * Vue options object for the Code Dataset Generator page.
 *
 * File processing is simulated: `processFiles` advances a counter on a timer and
 * `generatePreviewData` fills the preview with fixed sample records. The option
 * fields `languageDetection`, `extractFunctions`, `extractClasses`,
 * `includeComments` and `cleanCode` are bound to form controls in the template
 * but are not read by any method in this object.
 */
const app = {
  /**
   * Initial reactive state.
   * @returns {object} the state object consumed by the template.
   */
  data() {
    return {
      dragOver: false,          // true while a drag hovers the drop area
      selectedFiles: [],        // File objects accepted by addFiles()
      outputFormat: 'jsonl',    // 'jsonl' | 'hf' | 'openchat' (see the Output Format select)
      languageDetection: 'auto',
      extractFunctions: true,
      extractClasses: true,
      includeComments: true,
      cleanCode: false,
      processing: false,        // true while the simulated run is in progress
      processedFiles: 0,        // progress counter shown next to the progress bar
      previewData: null,        // null until generatePreviewData() has run
      selectedSampleIndex: 0    // index into previewData shown in the preview pane
    };
  },
  methods: {
    /**
     * Drop handler for the upload area: clears the hover state and queues the dropped files.
     * @param {DragEvent} e - the drop event.
     */
    handleDrop(e) {
      this.dragOver = false;
      const files = Array.from(e.dataTransfer.files);
      this.addFiles(files);
    },

    /**
     * Change handler for the hidden file input.
     * @param {Event} e - the change event of the file input.
     */
    handleFileSelect(e) {
      const files = Array.from(e.target.files);
      this.addFiles(files);
      e.target.value = ''; // Reset input to allow selecting same file again
    },

    /**
     * Appends the files with a supported source-code extension to `selectedFiles`
     * and alerts the user if any file was rejected.
     * @param {Array<{name: string}>} files - candidate files (File objects in the browser).
     */
    addFiles(files) {
      const validExtensions = ['.py', '.cpp', '.c', '.go', '.rs', '.js', '.java', '.php', '.rb', '.ts'];
      const filteredFiles = files.filter(file => {
        // The text after the last dot, lower-cased; a name without a dot yields the whole name.
        const ext = '.' + file.name.split('.').pop().toLowerCase();
        return validExtensions.includes(ext);
      });
      if (filteredFiles.length < files.length) {
        alert('Some files were ignored. Only source code files are supported.');
      }
      this.selectedFiles = [...this.selectedFiles, ...filteredFiles];
    },

    /**
     * Removes one entry from `selectedFiles`.
     * Ignored while a run is in progress: the progress bar divides by
     * `selectedFiles.length`, so shrinking the list mid-run would show a value
     * above 100% (or NaN for an empty list).
     * @param {number} index - position of the file to remove.
     */
    removeFile(index) {
      if (this.processing) return;
      this.selectedFiles.splice(index, 1);
    },

    /**
     * Maps a file name to Font Awesome icon classes based on its extension.
     * @param {string} filename
     * @returns {string} space-separated icon classes; a generic code-file icon when the
     *   extension is unknown.
     */
    getFileIcon(filename) {
      const ext = filename.split('.').pop().toLowerCase();
      const icons = {
        py: 'fab fa-python',
        cpp: 'fas fa-file-code',
        c: 'fas fa-file-code',
        h: 'fas fa-file-code',
        go: 'fab fa-golang',
        rs: 'fab fa-rust', // brand icon: needs the `fab` style prefix to render
        js: 'fab fa-js-square',
        java: 'fab fa-java',
        php: 'fab fa-php',
        rb: 'fas fa-gem',
        ts: 'fas fa-file-code'
      };
      // Own-property check so names like "constructor" do not resolve to inherited members.
      return Object.prototype.hasOwnProperty.call(icons, ext) ? icons[ext] : 'fas fa-file-code';
    },

    /**
     * Formats a byte count with a binary unit (1 KB = 1024 Bytes), at most two decimals.
     * Values beyond the largest unit are expressed in that unit instead of indexing
     * past the end of the unit table.
     * @param {number} bytes
     * @returns {string} e.g. "0 Bytes", "1.5 KB".
     */
    formatFileSize(bytes) {
      if (bytes === 0) return '0 Bytes';
      const k = 1024;
      const sizes = ['Bytes', 'KB', 'MB', 'GB', 'TB'];
      // Repeated division by 1024 is exact for doubles, so the unit is chosen without
      // the rounding error a log-ratio can introduce at unit boundaries.
      let value = bytes;
      let unit = 0;
      while (value >= k && unit < sizes.length - 1) {
        value /= k;
        unit += 1;
      }
      return parseFloat(value.toFixed(2)) + ' ' + sizes[unit];
    },

    /**
     * Starts the simulated processing run: one file is counted every 800 ms, and the
     * preview is generated once all files present at start have been counted.
     */
    processFiles() {
      // Also refuse to start a second timer while one is already running.
      if (this.processing || this.selectedFiles.length === 0) return;
      this.processing = true;
      this.processedFiles = 0;
      this.previewData = null;
      // Simulate processing (in a real app, this would be API calls)
      const totalFiles = this.selectedFiles.length;
      const interval = setInterval(() => {
        this.processedFiles += 1;
        if (this.processedFiles >= totalFiles) {
          clearInterval(interval);
          this.processing = false;
          this.generatePreviewData();
        }
      }, 800);
    },

    /**
     * Fills `previewData` with fixed mock samples and selects the first one.
     */
    generatePreviewData() {
      // Generate mock preview data
      this.previewData = [
        {
          "instruction": "Explain the purpose of this Python function.",
          "input": "def get_user_by_id(id):\n return db.query(User).filter(User.id == id).first()",
          "output": "This function retrieves a user from the database by their ID."
        },
        {
          "instruction": "What does this C++ function calculate?",
          "input": "int factorial(int n) {\n if (n <= 1) return 1;\n return n * factorial(n - 1);\n}",
          "output": "This function calculates the factorial of a given integer n using recursion."
        },
        {
          "instruction": "Convert this JavaScript function to TypeScript with proper typing.",
          "input": "function greet(name) {\n return `Hello, ${name}!`;\n}",
          "output": "function greet(name: string): string {\n return `Hello, ${name}!`;\n}"
        },
        {
          "instruction": "Optimize this Go function for better performance.",
          "input": "func sum(numbers []int) int {\n total := 0\n for _, num := range numbers {\n total += num\n }\n return total\n}",
          "output": "The function is already quite optimized. For very large slices, you might consider parallel processing using goroutines."
        }
      ];
      this.selectedSampleIndex = 0;
    },

    /**
     * Serializes `previewData` according to `outputFormat`.
     * 'jsonl' produces one JSON object per line; every other format keeps the
     * pretty-printed JSON array.
     * @returns {{content: string, filename: string, mimeType: string}}
     */
    buildDatasetFile() {
      const samples = this.previewData || [];
      if (this.outputFormat === 'jsonl') {
        return {
          content: samples.map(sample => JSON.stringify(sample) + '\n').join(''),
          filename: 'code_dataset.jsonl',
          mimeType: 'application/x-ndjson'
        };
      }
      return {
        content: JSON.stringify(samples, null, 2),
        filename: 'code_dataset.json',
        mimeType: 'application/json'
      };
    },

    /**
     * Triggers a browser download of the current preview data.
     * Does nothing when there is no data to download.
     */
    downloadDataset() {
      if (!this.previewData || this.previewData.length === 0) return;
      // In a real app, this would download the actual processed dataset
      const { content, filename, mimeType } = this.buildDatasetFile();
      const url = URL.createObjectURL(new Blob([content], { type: mimeType }));
      const link = document.createElement('a');
      link.href = url;
      link.download = filename;
      document.body.appendChild(link);
      link.click();
      document.body.removeChild(link);
      // Revoke on the next task so the download triggered by click() has started.
      setTimeout(() => URL.revokeObjectURL(url), 0);
    }
  }
};
</script>
<!-- Vue has to be loaded before the mount call; otherwise `Vue` is undefined when it runs. -->
<script src="https://unpkg.com/vue@3/dist/vue.global.js"></script>
<script>
// `app` is declared at the top level of the previous classic script, so it is visible here.
Vue.createApp(app).mount('#app');
</script>
<p style="border-radius: 8px; text-align: center; font-size: 12px; color: #fff; margin-top: 16px;position: fixed; left: 8px; bottom: 8px; z-index: 10; background: rgba(0, 0, 0, 0.8); padding: 4px 8px;">Made with <img src="https://enzostvs-deepsite.hf.space/logo.svg" alt="DeepSite Logo" style="width: 16px; height: 16px; vertical-align: middle;display:inline-block;margin-right:3px;filter:brightness(0) invert(1);"><a href="https://enzostvs-deepsite.hf.space" style="color: #fff;text-decoration: underline;" target="_blank" >DeepSite</a> - <a href="https://enzostvs-deepsite.hf.space?remix=Barbuuuuuuuu/code-dataset-generator" style="color: #fff;text-decoration: underline;" target="_blank" >🧬 Remix</a></p></body>
</html>