advanced-tokenizer-system / file_upload_interface.py
9x25dillon's picture
Upload folder using huggingface_hub
968c919 verified
#!/usr/bin/env python3
"""
File Upload Interface
====================
Web-based file upload interface for high capacity input processing.
"""
import json
import os
import threading
import time
import uuid
from pathlib import Path

from flask import Flask, request, jsonify, render_template_string, redirect, url_for, send_file
from werkzeug.utils import secure_filename

from high_capacity_input_processor import HighCapacityInputProcessor
# Flask application object; MAX_CONTENT_LENGTH caps the request size at 100 MB.
app = Flask(__name__)
app.config.update(MAX_CONTENT_LENGTH=100 * 1024 * 1024)
# Single processor instance shared by the route handlers in this module.
processor = HighCapacityInputProcessor()
# HTML template for the upload interface
UPLOAD_TEMPLATE = """
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>LiMp High Capacity Input Processor</title>
<style>
body {
font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
max-width: 1200px;
margin: 0 auto;
padding: 20px;
background-color: #f5f5f5;
}
.container {
background: white;
padding: 30px;
border-radius: 10px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
.header {
text-align: center;
margin-bottom: 30px;
}
.header h1 {
color: #333;
margin-bottom: 10px;
}
.header p {
color: #666;
font-size: 16px;
}
.upload-section {
border: 2px dashed #ddd;
border-radius: 10px;
padding: 40px;
text-align: center;
margin-bottom: 30px;
transition: border-color 0.3s;
}
.upload-section:hover {
border-color: #4CAF50;
}
.upload-section.dragover {
border-color: #4CAF50;
background-color: #f0f8f0;
}
.file-input {
margin: 20px 0;
}
.file-input input[type="file"] {
display: none;
}
.file-input label {
display: inline-block;
padding: 12px 24px;
background-color: #4CAF50;
color: white;
border-radius: 5px;
cursor: pointer;
font-size: 16px;
transition: background-color 0.3s;
}
.file-input label:hover {
background-color: #45a049;
}
.text-input {
margin: 20px 0;
}
.text-input textarea {
width: 100%;
height: 200px;
padding: 15px;
border: 1px solid #ddd;
border-radius: 5px;
font-family: monospace;
font-size: 14px;
resize: vertical;
}
.submit-btn {
background-color: #2196F3;
color: white;
padding: 15px 30px;
border: none;
border-radius: 5px;
font-size: 16px;
cursor: pointer;
margin: 10px 5px;
transition: background-color 0.3s;
}
.submit-btn:hover {
background-color: #1976D2;
}
.submit-btn:disabled {
background-color: #ccc;
cursor: not-allowed;
}
.progress {
width: 100%;
height: 20px;
background-color: #f0f0f0;
border-radius: 10px;
overflow: hidden;
margin: 20px 0;
display: none;
}
.progress-bar {
height: 100%;
background-color: #4CAF50;
width: 0%;
transition: width 0.3s;
}
.results {
margin-top: 30px;
padding: 20px;
background-color: #f9f9f9;
border-radius: 5px;
display: none;
}
.stats {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 20px;
margin: 20px 0;
}
.stat-card {
background: white;
padding: 20px;
border-radius: 5px;
text-align: center;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
}
.stat-number {
font-size: 24px;
font-weight: bold;
color: #4CAF50;
}
.stat-label {
color: #666;
margin-top: 5px;
}
.file-list {
margin-top: 20px;
}
.file-item {
background: white;
padding: 15px;
margin: 10px 0;
border-radius: 5px;
box-shadow: 0 2px 5px rgba(0,0,0,0.1);
}
.file-name {
font-weight: bold;
color: #333;
}
.file-info {
color: #666;
font-size: 14px;
margin-top: 5px;
}
.chunk-info {
color: #888;
font-size: 12px;
margin-top: 5px;
}
.error {
color: #f44336;
background-color: #ffebee;
padding: 15px;
border-radius: 5px;
margin: 20px 0;
}
.success {
color: #4CAF50;
background-color: #e8f5e8;
padding: 15px;
border-radius: 5px;
margin: 20px 0;
}
.download-btn {
background-color: #FF9800;
color: white;
padding: 8px 16px;
border: none;
border-radius: 3px;
font-size: 12px;
cursor: pointer;
margin-top: 10px;
}
.download-btn:hover {
background-color: #F57C00;
}
</style>
</head>
<body>
<div class="container">
<div class="header">
<h1>🧠 LiMp High Capacity Input Processor</h1>
<p>Upload files or enter large text for intelligent chunking and training data generation</p>
</div>
<form id="uploadForm" enctype="multipart/form-data">
<div class="upload-section" id="uploadSection">
<h3>📁 File Upload</h3>
<p>Drag and drop files here or click to select</p>
<div class="file-input">
<label for="fileInput">Choose Files</label>
<input type="file" id="fileInput" name="files" multiple accept=".txt,.md,.py,.js,.html,.css,.json,.jsonl,.csv,.pdf,.doc,.docx,.xml,.yaml,.yml">
</div>
<p><small>Supported formats: TXT, MD, PY, JS, HTML, CSS, JSON, CSV, PDF, DOC, DOCX, XML, YAML</small></p>
</div>
<div class="text-input">
<h3>📝 Large Text Input</h3>
<textarea id="textInput" name="text" placeholder="Enter large text content here (up to 100MB)..."></textarea>
<p><small>Character count: <span id="charCount">0</span></small></p>
</div>
<div>
<button type="submit" class="submit-btn" id="submitBtn">Process Input</button>
<button type="button" class="submit-btn" onclick="generateTrainingData()">Generate Training Data</button>
<button type="button" class="submit-btn" onclick="clearAll()">Clear All</button>
</div>
<div class="progress" id="progress">
<div class="progress-bar" id="progressBar"></div>
</div>
</form>
<div class="results" id="results">
<h3>📊 Processing Results</h3>
<div id="resultsContent"></div>
</div>
</div>
<script>
// Summaries returned by the last successful /upload call (set by the submit
// handler, read by generateTrainingData, reset by clearAll).
let uploads = [];
// Page elements shared by the handlers below
const fileInput = document.getElementById('fileInput');
const uploadSection = document.getElementById('uploadSection');
const textInput = document.getElementById('textInput');
const charCount = document.getElementById('charCount');
const submitBtn = document.getElementById('submitBtn');
const progress = document.getElementById('progress');
const progressBar = document.getElementById('progressBar');
const results = document.getElementById('results');
const resultsContent = document.getElementById('resultsContent');
// Keep the character counter in sync with the textarea
textInput.addEventListener('input', () => {
    const typedLength = textInput.value.length;
    charCount.textContent = typedLength.toLocaleString();
});
// Highlight the drop zone while something is dragged over it
uploadSection.addEventListener('dragover', (event) => {
    event.preventDefault();
    uploadSection.classList.add('dragover');
});
uploadSection.addEventListener('dragleave', (event) => {
    event.preventDefault();
    uploadSection.classList.remove('dragover');
});
// Hand dropped files to the file input, then refresh the selection list
uploadSection.addEventListener('drop', (event) => {
    event.preventDefault();
    uploadSection.classList.remove('dragover');
    fileInput.files = event.dataTransfer.files;
    updateFileList();
});
fileInput.addEventListener('change', updateFileList);
// Render the files currently held by `fileInput` at the top of the drop zone.
//
// The list is kept in its own container instead of rewriting
// uploadSection.innerHTML: re-serialising the section would re-create the
// <input>, leaving `fileInput` detached and the visible copy without a change
// listener. Each call replaces the previous list rather than stacking a new
// one, and file names are inserted as text so they are never parsed as HTML.
function updateFileList() {
    const files = fileInput.files;
    let list = document.getElementById('selectedFiles');
    if (files.length === 0) {
        // Nothing selected any more: drop a stale list if one is showing.
        if (list) {
            list.remove();
        }
        return;
    }
    if (!list) {
        list = document.createElement('div');
        list.id = 'selectedFiles';
        uploadSection.insertBefore(list, uploadSection.firstChild);
    }
    list.textContent = '';
    const heading = document.createElement('h4');
    heading.textContent = 'Selected Files:';
    list.appendChild(heading);
    for (const file of files) {
        const item = document.createElement('div');
        item.className = 'file-item';
        const name = document.createElement('div');
        name.className = 'file-name';
        name.textContent = file.name;
        const info = document.createElement('div');
        info.className = 'file-info';
        info.textContent = `Size: ${(file.size / 1024 / 1024).toFixed(2)} MB, Type: ${file.type}`;
        item.appendChild(name);
        item.appendChild(info);
        list.appendChild(item);
    }
}
// Form submission: send the selected files and/or the entered text to /upload
// and render the outcome. The progress bar is simulated (the request itself
// reports no progress), so the timer driving it must always be stopped, even
// when the request fails.
document.getElementById('uploadForm').addEventListener('submit', async function(e) {
    e.preventDefault();
    const formData = new FormData();
    const files = fileInput.files;
    const text = textInput.value.trim();
    if (files.length === 0 && text === '') {
        alert('Please select files or enter text content');
        return;
    }
    // Add files
    for (const file of files) {
        formData.append('files', file);
    }
    // Add text
    if (text) {
        formData.append('text', text);
    }
    submitBtn.disabled = true;
    progress.style.display = 'block';
    progressBar.style.width = '0%';
    // Simulate progress; capped at 90% until the server has answered.
    let progressValue = 0;
    const progressInterval = setInterval(() => {
        progressValue += Math.random() * 15;
        if (progressValue > 90) progressValue = 90;
        progressBar.style.width = progressValue + '%';
    }, 200);
    try {
        const response = await fetch('/upload', {
            method: 'POST',
            body: formData
        });
        // Stop the timer before showing 100% so a late tick cannot pull the bar back.
        clearInterval(progressInterval);
        progressBar.style.width = '100%';
        let result;
        try {
            result = await response.json();
        } catch (parseError) {
            // e.g. an HTML error page for an oversized request body
            throw new Error(`server returned HTTP ${response.status}`);
        }
        if (result.success) {
            uploads = result.uploads || [];
            showResults(result);
        } else {
            showError(result.error);
        }
    } catch (error) {
        showError('Upload failed: ' + error.message);
    } finally {
        // Also reached when fetch itself throws; clearing twice is harmless.
        clearInterval(progressInterval);
        submitBtn.disabled = false;
        setTimeout(() => {
            progress.style.display = 'none';
            progressBar.style.width = '0%';
        }, 1000);
    }
});
// Ask the server to build a training-data file from the uploads of the last
// successful submission, then show a download link for it.
async function generateTrainingData() {
    if (uploads.length === 0) {
        alert('No uploads available. Please upload files or enter text first.');
        return;
    }
    try {
        const response = await fetch('/generate_training_data', {
            method: 'POST',
            headers: {
                'Content-Type': 'application/json',
            },
            body: JSON.stringify({ file_ids: uploads.map(u => u.file_id) })
        });
        let result;
        try {
            result = await response.json();
        } catch (parseError) {
            // The server answered with something other than JSON (e.g. an error page).
            throw new Error(`server returned HTTP ${response.status}`);
        }
        if (!result.success) {
            showError(result.error);
            return;
        }
        showSuccess(`Training data generated: ${result.training_data_file}`);
        // Add download link; the name is URL-encoded so characters such as
        // spaces, '#' or '?' cannot truncate or alter the request path.
        const downloadLink = document.createElement('a');
        downloadLink.href = `/download/${encodeURIComponent(result.training_data_file)}`;
        downloadLink.textContent = 'Download Training Data';
        downloadLink.className = 'download-btn';
        resultsContent.appendChild(downloadLink);
    } catch (error) {
        showError('Training data generation failed: ' + error.message);
    }
}
// Render the statistics cards and the per-file summary for an /upload response.
//
// data.stats   - object with total_files, total_chunks, total_size_mb
// data.uploads - array of file_id / filename / file_size / mime_type / chunks
//                entries, where `chunks` is the chunk COUNT sent by the server
function showResults(data) {
    // Server-supplied strings are escaped before being placed into markup.
    const escapeHtml = (value) => {
        const holder = document.createElement('div');
        holder.textContent = String(value);
        return holder.innerHTML;
    };
    // Missing or non-numeric values are shown as 0 instead of throwing.
    const asNumber = (value) => Number(value) || 0;
    const stats = data.stats || {};
    let html = '<div class="success">Processing completed successfully!</div>';
    // Statistics
    html += '<div class="stats">';
    html += `<div class="stat-card">
<div class="stat-number">${asNumber(stats.total_files)}</div>
<div class="stat-label">Files Processed</div>
</div>`;
    html += `<div class="stat-card">
<div class="stat-number">${asNumber(stats.total_chunks)}</div>
<div class="stat-label">Chunks Created</div>
</div>`;
    html += `<div class="stat-card">
<div class="stat-number">${asNumber(stats.total_size_mb).toFixed(2)}</div>
<div class="stat-label">Total Size (MB)</div>
</div>`;
    html += '</div>';
    // File list
    if (data.uploads && data.uploads.length > 0) {
        html += '<div class="file-list">';
        html += '<h4>Processed Files:</h4>';
        for (const upload of data.uploads) {
            // `chunks` arrives as a number; an array is tolerated as well.
            const chunkCount = Array.isArray(upload.chunks)
                ? upload.chunks.length
                : asNumber(upload.chunks);
            const sizeMb = (asNumber(upload.file_size) / 1024 / 1024).toFixed(2);
            html += `<div class="file-item">
<div class="file-name">${escapeHtml(upload.filename)}</div>
<div class="file-info">Size: ${sizeMb} MB, Type: ${escapeHtml(upload.mime_type)}</div>
<div class="chunk-info">Chunks: ${chunkCount}</div>
</div>`;
        }
        html += '</div>';
    }
    resultsContent.innerHTML = html;
    results.style.display = 'block';
}
// Replace the results panel content with an error banner.
// The message may contain server- or user-derived text (e.g. a file name
// inside an exception message), so it is inserted as text, never as HTML.
function showError(message) {
    const banner = document.createElement('div');
    banner.className = 'error';
    banner.textContent = `Error: ${message}`;
    resultsContent.textContent = '';
    resultsContent.appendChild(banner);
    results.style.display = 'block';
}
// Replace the results panel content with a success banner.
// The message is inserted as text so server-supplied values (such as a
// generated file name) can never be interpreted as markup.
function showSuccess(message) {
    const banner = document.createElement('div');
    banner.className = 'success';
    banner.textContent = message;
    resultsContent.textContent = '';
    resultsContent.appendChild(banner);
    results.style.display = 'block';
}
// Reset the form: forget the selection, the text, the remembered uploads and
// the results panel.
//
// The drop zone is cleaned up in place instead of being rebuilt from a copy of
// its markup. Rebuilding creates a new <input> without a change listener while
// `fileInput` (the element the submit handler reads) stays detached, so files
// chosen afterwards would be ignored.
function clearAll() {
    fileInput.value = '';
    textInput.value = '';
    charCount.textContent = '0';
    uploads = [];
    results.style.display = 'none';
    // The only h4 / .file-item nodes inside the drop zone belong to the
    // "Selected Files" listing; the static heading there is an h3.
    uploadSection.querySelectorAll('h4, .file-item').forEach((node) => node.remove());
    // If the visible input is a re-created copy, swap the original element
    // (which carries the change listener) back into the page.
    const visibleInput = uploadSection.querySelector('input[type="file"]');
    if (visibleInput && visibleInput !== fileInput) {
        visibleInput.replaceWith(fileInput);
    }
}
</script>
</body>
</html>
"""
@app.route('/')
def index():
    """Serve the upload page rendered from ``UPLOAD_TEMPLATE``."""
    page = render_template_string(UPLOAD_TEMPLATE)
    return page
def _unique_upload_path(directory, filename):
    """Return a path for *filename* inside *directory* that does not exist yet."""
    candidate = Path(directory) / filename
    if not candidate.exists():
        return candidate
    # Keep the original name recognisable; only add a short random suffix
    # when an earlier upload already uses it.
    return candidate.with_name(
        f"{candidate.stem}_{uuid.uuid4().hex[:8]}{candidate.suffix}"
    )


@app.route('/upload', methods=['POST'])
def upload_files():
    """Handle file uploads and text input.

    Reads every part named ``files`` from the multipart body and the optional
    ``text`` form field, passes each to the shared processor and reports one
    summary entry per input.

    Returns:
        JSON ``{'success': True, 'uploads': [...], 'stats': {...}}`` on
        success, or ``{'success': False, 'error': <message>}`` with HTTP 400
        when anything raises.
    """
    try:
        uploads = []
        # Process uploaded files (getlist returns [] when the field is absent)
        for file in request.files.getlist('files'):
            if not file.filename:
                continue
            filename = secure_filename(file.filename)
            if not filename:
                # secure_filename() can strip a name down to nothing; saving
                # would then target the upload directory itself.
                raise ValueError(f"Invalid filename: {file.filename!r}")
            # Never overwrite a previous upload that used the same name.
            temp_path = _unique_upload_path(processor.upload_dir, filename)
            file.save(str(temp_path))
            # Process file
            file_upload = processor.process_file_upload(temp_path)
            uploads.append({
                'file_id': file_upload.file_id,
                'filename': file_upload.filename,
                'file_size': file_upload.file_size,
                'mime_type': file_upload.mime_type,
                'chunks': len(file_upload.chunks),
            })
        # Process text input
        text_content = request.form.get('text', '').strip()
        if text_content:
            chunks = processor.process_high_capacity_input(text_content)
            uploads.append({
                # Timestamp alone collides for two submissions in the same second.
                'file_id': f"text_input_{int(time.time())}_{uuid.uuid4().hex[:8]}",
                'filename': 'text_input.txt',
                # Report bytes, matching what the page displays for files.
                'file_size': len(text_content.encode('utf-8')),
                'mime_type': 'text/plain',
                'chunks': len(chunks),
            })
        # Get processing stats
        stats = processor.get_processing_stats()
        return jsonify({
            'success': True,
            'uploads': uploads,
            'stats': stats,
        })
    except Exception as e:
        # Top-level boundary: keep the JSON error contract, but leave a
        # traceback in the server log instead of discarding it.
        app.logger.exception("Upload processing failed")
        return jsonify({
            'success': False,
            'error': str(e),
        }), 400
@app.route('/generate_training_data', methods=['POST'])
def generate_training_data():
    """Generate training data from processed uploads.

    Expects an optional JSON body ``{"file_ids": [...]}``. When ``file_ids``
    is missing or empty, all uploads known to the processor are used.

    Returns:
        JSON with ``success``, ``training_data_file`` (file name only),
        ``file_path`` and ``total_examples``; or ``{'success': False,
        'error': <message>}`` with HTTP 400 when anything raises.
    """
    try:
        # silent=True yields None instead of raising for a missing or
        # malformed JSON body; treat that as "no filter".
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            data = {}
        file_ids = data.get('file_ids') or []
        if not isinstance(file_ids, (list, tuple)):
            raise ValueError("'file_ids' must be a list")
        # Get all uploads
        all_uploads = processor.get_all_uploads()
        # Filter by file IDs if provided
        if file_ids:
            filtered_uploads = [
                upload for upload in all_uploads if upload.file_id in file_ids
            ]
        else:
            filtered_uploads = all_uploads
        # Generate training data
        training_data_file = processor.create_training_data_from_chunks(
            filtered_uploads,
            output_format='jsonl',
            include_metadata=True,
        )
        return jsonify({
            'success': True,
            'training_data_file': Path(training_data_file).name,
            # str() keeps the response serialisable if the processor returns a Path.
            'file_path': str(training_data_file),
            'total_examples': sum(len(upload.chunks) for upload in filtered_uploads),
        })
    except Exception as e:
        # Top-level boundary: keep the JSON error contract, but log the traceback.
        app.logger.exception("Training data generation failed")
        return jsonify({
            'success': False,
            'error': str(e),
        }), 400
@app.route('/download/<filename>')
def download_file(filename):
    """Download generated training data file.

    Args:
        filename: Name of a file located directly inside the processor's
            training data directory.

    Returns:
        The file as an attachment, or ``("File not found", 404)`` when the
        name does not resolve to a regular file in that directory.
    """
    base_dir = Path(processor.training_data_dir).resolve()
    file_path = (base_dir / filename).resolve()
    # Reject anything that resolves outside the directory (e.g. "..") and
    # anything that is not a regular file (exists() is also true for folders).
    if file_path.parent != base_dir or not file_path.is_file():
        return "File not found", 404
    return send_file(str(file_path), as_attachment=True)
@app.route('/stats')
def get_stats():
    """Return the processor's current statistics as a JSON response."""
    return jsonify(processor.get_processing_stats())
@app.route('/uploads')
def list_uploads():
    """Return a JSON array with one summary object per known upload."""
    summaries = []
    for record in processor.get_all_uploads():
        summaries.append({
            'file_id': record.file_id,
            'filename': record.filename,
            'file_size': record.file_size,
            'mime_type': record.mime_type,
            'upload_timestamp': record.upload_timestamp,
            # Only the number of chunks is exposed, not their content.
            'chunks': len(record.chunks),
        })
    return jsonify(summaries)
if __name__ == '__main__':
    # The Werkzeug debugger lets anyone who can reach it execute code, so it
    # is opt-in (FLASK_DEBUG=1) and, when enabled, the server binds to
    # loopback only instead of every interface.
    debug_enabled = os.environ.get('FLASK_DEBUG', '').lower() in ('1', 'true', 'yes')
    host = '127.0.0.1' if debug_enabled else '0.0.0.0'
    print("🚀 Starting LiMp High Capacity Input Processor")
    print("📁 Upload directory:", processor.upload_dir)
    print("📁 Chunk directory:", processor.chunk_dir)
    print("📁 Training data directory:", processor.training_data_dir)
    print("🌐 Web interface: http://localhost:5000")
    app.run(debug=debug_enabled, host=host, port=5000)