Samfredoly commited on
Commit
1a4ef63
·
verified ·
1 Parent(s): bd044f5

Create main.py

Browse files
Files changed (1) hide show
  1. main.py +385 -0
main.py ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI application for HF Uploader Dashboard
3
+ """
4
+ import os
5
+ import asyncio
6
+ import logging
7
+ from contextlib import asynccontextmanager
8
+ from fastapi import FastAPI, Depends, HTTPException, UploadFile, File
9
+ from fastapi.responses import FileResponse, JSONResponse
10
+ from fastapi.staticfiles import StaticFiles
11
+ from sqlalchemy.orm import Session
12
+ from datetime import datetime
13
+
14
+ from database import init_db, get_db, SessionLocal
15
+ from models import (
16
+ HFConfig,
17
+ UploadQueue,
18
+ UploadStatusEnum,
19
+ ProcessingState,
20
+ ProcessingStatusEnum,
21
+ UploadErrorLog,
22
+ )
23
+ from schemas import (
24
+ HFConfigUpdate,
25
+ HFConfigResponse,
26
+ UploadQueueItem,
27
+ UploadQueueStats,
28
+ ProcessingStateResponse,
29
+ ProcessRequest,
30
+ UploadRequest,
31
+ FilePreviewRequest,
32
+ FilePreviewResponse,
33
+ RateLimitStatus,
34
+ UploadStatus,
35
+ ErrorLogResponse,
36
+ )
37
+ from file_processor import FileProcessor
38
+ from hf_uploader import HFUploader
39
+
40
+ # Configure logging
41
+ logging.basicConfig(level=logging.INFO)
42
+ logger = logging.getLogger(__name__)
43
+
44
+ # Global instances
45
+ file_processor = FileProcessor(processed_files_dir="processed_files")
46
+ hf_uploader_instance = None
47
+
48
+
49
+ @asynccontextmanager
50
+ async def lifespan(app: FastAPI):
51
+ """Lifespan context manager for startup and shutdown"""
52
+ # Startup
53
+ logger.info("Initializing database...")
54
+ init_db()
55
+ logger.info("Application started")
56
+ yield
57
+ # Shutdown
58
+ logger.info("Application shutting down...")
59
+
60
+
61
+ # Create FastAPI app
62
+ app = FastAPI(
63
+ title="HF Uploader Dashboard",
64
+ description="Hugging Face Dataset Uploader with Rate Limiting",
65
+ version="1.0.0",
66
+ lifespan=lifespan,
67
+ )
68
+
69
+
70
+ # ============================================================================
71
+ # Configuration Endpoints
72
+ # ============================================================================
73
+
74
+
75
+ @app.get("/api/config", response_model=HFConfigResponse)
76
+ async def get_config(db: Session = Depends(get_db)):
77
+ """Get current HF configuration"""
78
+ config = db.query(HFConfig).first()
79
+ if not config:
80
+ # Create default config
81
+ config = HFConfig()
82
+ db.add(config)
83
+ db.commit()
84
+ db.refresh(config)
85
+ return config
86
+
87
+
88
+ @app.post("/api/config", response_model=HFConfigResponse)
89
+ async def update_config(
90
+ config_update: HFConfigUpdate, db: Session = Depends(get_db)
91
+ ):
92
+ """Update HF configuration"""
93
+ config = db.query(HFConfig).first()
94
+ if not config:
95
+ config = HFConfig()
96
+ db.add(config)
97
+
98
+ # Update fields
99
+ update_data = config_update.model_dump(exclude_unset=True)
100
+ for field, value in update_data.items():
101
+ setattr(config, field, value)
102
+
103
+ config.updated_at = datetime.utcnow()
104
+ db.commit()
105
+ db.refresh(config)
106
+
107
+ # Update global uploader instance
108
+ global hf_uploader_instance
109
+ if config.hf_token:
110
+ hf_uploader_instance = HFUploader(config.hf_token, config.target_repo)
111
+
112
+ return config
113
+
114
+
115
+ # ============================================================================
116
+ # Upload Queue Endpoints
117
+ # ============================================================================
118
+
119
+
120
+ @app.get("/api/queue", response_model=list[UploadQueueItem])
121
+ async def get_upload_queue(
122
+ status: str = None, db: Session = Depends(get_db)
123
+ ):
124
+ """Get upload queue items"""
125
+ query = db.query(UploadQueue)
126
+ if status:
127
+ query = query.filter(UploadQueue.status == status)
128
+ return query.all()
129
+
130
+
131
+ @app.get("/api/queue/stats", response_model=UploadQueueStats)
132
+ async def get_queue_stats(db: Session = Depends(get_db)):
133
+ """Get upload queue statistics"""
134
+ all_items = db.query(UploadQueue).all()
135
+ return UploadQueueStats(
136
+ pending=len([i for i in all_items if i.status == UploadStatusEnum.PENDING]),
137
+ uploading=len(
138
+ [i for i in all_items if i.status == UploadStatusEnum.UPLOADING]
139
+ ),
140
+ completed=len(
141
+ [i for i in all_items if i.status == UploadStatusEnum.COMPLETED]
142
+ ),
143
+ failed=len([i for i in all_items if i.status == UploadStatusEnum.FAILED]),
144
+ total=len(all_items),
145
+ )
146
+
147
+
148
+ @app.post("/api/queue/add")
149
+ async def add_to_queue(db: Session = Depends(get_db)):
150
+ """Add processed files to upload queue"""
151
+ processed_files = file_processor.get_processed_files()
152
+ added_count = 0
153
+
154
+ for filename in processed_files:
155
+ # Check if already in queue
156
+ existing = (
157
+ db.query(UploadQueue)
158
+ .filter(UploadQueue.file_name == filename)
159
+ .first()
160
+ )
161
+ if existing:
162
+ continue
163
+
164
+ file_path = str(file_processor.processed_files_dir / filename)
165
+ file_size = file_processor.get_file_size(filename) or 0
166
+
167
+ queue_item = UploadQueue(
168
+ file_name=filename,
169
+ file_path=file_path,
170
+ file_size=file_size,
171
+ status=UploadStatusEnum.PENDING,
172
+ )
173
+ db.add(queue_item)
174
+ added_count += 1
175
+
176
+ db.commit()
177
+ return {"added": added_count, "total": len(processed_files)}
178
+
179
+
180
+ @app.post("/api/queue/clear")
181
+ async def clear_queue(db: Session = Depends(get_db)):
182
+ """Clear upload queue"""
183
+ db.query(UploadQueue).delete()
184
+ db.commit()
185
+ return {"message": "Queue cleared"}
186
+
187
+
188
+ # ============================================================================
189
+ # Processing Endpoints
190
+ # ============================================================================
191
+
192
+
193
+ @app.get("/api/processing/state", response_model=ProcessingStateResponse)
194
+ async def get_processing_state(db: Session = Depends(get_db)):
195
+ """Get current processing state"""
196
+ state = db.query(ProcessingState).first()
197
+ if not state:
198
+ state = ProcessingState(status=ProcessingStatusEnum.IDLE)
199
+ db.add(state)
200
+ db.commit()
201
+ db.refresh(state)
202
+ return state
203
+
204
+
205
+ @app.post("/api/processing/start")
206
+ async def start_processing(
207
+ request: ProcessRequest, db: Session = Depends(get_db)
208
+ ):
209
+ """Start dataset processing"""
210
+ config = db.query(HFConfig).first()
211
+ if not config:
212
+ raise HTTPException(status_code=400, detail="Configuration not set")
213
+
214
+ if not config.hf_token:
215
+ raise HTTPException(status_code=400, detail="HF token not configured")
216
+
217
+ # Run processing in background
218
+ asyncio.create_task(
219
+ file_processor.process_datasets(
220
+ all_repo_id=config.source_all_repo,
221
+ ato_repo_id=config.source_ato_repo,
222
+ hf_token=config.hf_token,
223
+ max_files=request.max_files,
224
+ db=db,
225
+ )
226
+ )
227
+
228
+ return {"message": "Processing started"}
229
+
230
+
231
+ @app.get("/api/processing/files")
232
+ async def get_processed_files():
233
+ """Get list of processed files"""
234
+ files = file_processor.get_processed_files()
235
+ return {"files": files, "count": len(files)}
236
+
237
+
238
+ # ============================================================================
239
+ # Upload Endpoints
240
+ # ============================================================================
241
+
242
+
243
+ @app.post("/api/upload/start")
244
+ async def start_upload(request: UploadRequest, db: Session = Depends(get_db)):
245
+ """Start uploading files"""
246
+ global hf_uploader_instance
247
+
248
+ config = db.query(HFConfig).first()
249
+ if not config or not config.hf_token:
250
+ raise HTTPException(status_code=400, detail="HF token not configured")
251
+
252
+ if not hf_uploader_instance:
253
+ hf_uploader_instance = HFUploader(config.hf_token, config.target_repo)
254
+
255
+ # Get files to upload
256
+ files_to_upload = (
257
+ db.query(UploadQueue)
258
+ .filter(UploadQueue.id.in_(request.file_ids))
259
+ .all()
260
+ )
261
+
262
+ if not files_to_upload:
263
+ raise HTTPException(status_code=400, detail="No files found")
264
+
265
+ # Convert to upload format
266
+ upload_files = [
267
+ {
268
+ "id": f.id,
269
+ "file_name": f.file_name,
270
+ "file_path": f.file_path,
271
+ }
272
+ for f in files_to_upload
273
+ ]
274
+
275
+ # Run upload in background
276
+ asyncio.create_task(
277
+ hf_uploader_instance.upload_files_batch(
278
+ files=upload_files,
279
+ db=db,
280
+ batch_size=config.upload_batch_size,
281
+ )
282
+ )
283
+
284
+ return {"message": "Upload started", "file_count": len(upload_files)}
285
+
286
+
287
+ @app.get("/api/upload/status", response_model=UploadStatus)
288
+ async def get_upload_status(db: Session = Depends(get_db)):
289
+ """Get current upload status"""
290
+ global hf_uploader_instance
291
+
292
+ config = db.query(HFConfig).first()
293
+ if not config or not config.hf_token:
294
+ raise HTTPException(status_code=400, detail="HF token not configured")
295
+
296
+ if not hf_uploader_instance:
297
+ hf_uploader_instance = HFUploader(config.hf_token, config.target_repo)
298
+
299
+ return await hf_uploader_instance.get_upload_status(db)
300
+
301
+
302
+ @app.post("/api/upload/retry/{item_id}")
303
+ async def retry_upload(item_id: int, db: Session = Depends(get_db)):
304
+ """Retry uploading a failed file"""
305
+ queue_item = db.query(UploadQueue).filter(UploadQueue.id == item_id).first()
306
+ if not queue_item:
307
+ raise HTTPException(status_code=404, detail="Item not found")
308
+
309
+ queue_item.status = UploadStatusEnum.PENDING
310
+ queue_item.retry_count = 0
311
+ db.commit()
312
+
313
+ return {"message": "Retry scheduled"}
314
+
315
+
316
+ # ============================================================================
317
+ # File Preview Endpoints
318
+ # ============================================================================
319
+
320
+
321
+ @app.post("/api/file/preview", response_model=FilePreviewResponse)
322
+ async def preview_file(request: FilePreviewRequest):
323
+ """Get file preview"""
324
+ content = file_processor.get_file_content(request.filename)
325
+ if not content:
326
+ raise HTTPException(status_code=404, detail="File not found")
327
+
328
+ file_size = file_processor.get_file_size(request.filename) or 0
329
+
330
+ return FilePreviewResponse(
331
+ filename=request.filename, size=file_size, content=content
332
+ )
333
+
334
+
335
+ # ============================================================================
336
+ # Error Log Endpoints
337
+ # ============================================================================
338
+
339
+
340
+ @app.get("/api/errors", response_model=list[ErrorLogResponse])
341
+ async def get_error_logs(limit: int = 50, db: Session = Depends(get_db)):
342
+ """Get error logs"""
343
+ return (
344
+ db.query(UploadErrorLog)
345
+ .order_by(UploadErrorLog.created_at.desc())
346
+ .limit(limit)
347
+ .all()
348
+ )
349
+
350
+
351
+ @app.delete("/api/errors")
352
+ async def clear_error_logs(db: Session = Depends(get_db)):
353
+ """Clear error logs"""
354
+ db.query(UploadErrorLog).delete()
355
+ db.commit()
356
+ return {"message": "Error logs cleared"}
357
+
358
+
359
+ # ============================================================================
360
+ # Static Files
361
+ # ============================================================================
362
+
363
+
364
+ # Create static directory if it doesn't exist
365
+ os.makedirs("static", exist_ok=True)
366
+
367
+ # Mount static files
368
+ app.mount("/static", StaticFiles(directory="static"), name="static")
369
+
370
+
371
+ # ============================================================================
372
+ # Root Endpoint
373
+ # ============================================================================
374
+
375
+
376
+ @app.get("/")
377
+ async def root():
378
+ """Root endpoint - serve dashboard"""
379
+ return FileResponse("static/index.html")
380
+
381
+
382
+ if __name__ == "__main__":
383
+ import uvicorn
384
+
385
+ uvicorn.run(app, host="0.0.0.0", port=8000)