paijo77 commited on
Commit
91310c5
·
verified ·
1 Parent(s): 19d1ee9

update app/admin/scraping_admin.py

Browse files
Files changed (1) hide show
  1. app/admin/scraping_admin.py +780 -0
app/admin/scraping_admin.py ADDED
@@ -0,0 +1,780 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Admin Panel untuk Enhanced Scraping Configuration
3
+
4
+ Panel admin lengkap untuk mengelola konfigurasi scraping module,
5
+ monitoring statistik, dan mengatur scraping operations.
6
+ """
7
+
8
import asyncio
from datetime import datetime, timedelta
from typing import Any, Dict, List, Optional
from urllib.parse import urlparse

from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, Query, Request
from pydantic import BaseModel, Field
from sqlalchemy import select, func, and_
from sqlalchemy.ext.asyncio import AsyncSession

from app.database import get_db
from app.db_models import (
    ProxySource,
    ScrapingSession,
    SourceType,  # used by create_proxy_source; was referenced but never imported
    User,
    ValidationHistory,
)
from app.db_storage import db_storage  # referenced throughout but never imported -- TODO confirm module path
from app.db_storage_extended import extended_db_storage
from app.dependencies import require_admin
from app.grabber.scraping_enhancements import (
    EnhancedScrapingService,
    ScrapingEnhancementConfig,
)
from app.grabber.scraping_utils import (
    RequestQueue,
    RateLimiter,
    ExponentialBackoff,
    ProxyRotator,
    calculate_proxy_score,
    format_bytes,
    generate_session_id,
)
34
+
35
+
36
# Admin-only router: every endpoint requires the admin dependency and a DB
# session dependency at the router level.
# NOTE(review): the prefix already ends in "scraping" while every route path
# below also begins with "/scraping/...", producing URLs like
# /api/v1/admin/scraping/scraping/config -- confirm this doubling is intended.
router = APIRouter(
    prefix="/api/v1/admin/scraping",
    tags=["admin", "scraping"],
    dependencies=[Depends(require_admin), Depends(get_db)],
)
41
+
42
+
43
# Pydantic models for the admin panel
class ScrapingConfigResponse(BaseModel):
    """Response model for the scraping configuration overview.

    Aggregates the global config, per-module configs, a summary of the
    currently active sessions, rate-limiter state and performance stats.
    """

    global_config: Dict[str, Any]
    module_configs: Dict[str, Any]
    active_sessions: List[Dict[str, Any]]
    rate_limiter_status: Dict[str, Any]
    # The original declared this field twice; the duplicate was removed.
    performance_stats: Dict[str, Any]
53
+
54
+
55
class ScrapingConfigRequest(BaseModel):
    """Request model for updating a module's scraping configuration.

    ``settings`` maps setting keys to their new values; the keys are
    validated against a whitelist inside ``update_scraping_config``.
    """

    module_name: str
    settings: Dict[str, Any]
60
+
61
+
62
class SessionResponse(BaseModel):
    """Response model describing a single scraping session."""

    session_id: str
    start_time: datetime
    end_time: Optional[datetime]
    duration: Optional[float]
    requests_made: int
    successful_requests: int
    failed_requests: int
    success_rate: float
    total_data_bytes: int
    avg_response_time: float
    proxies_used: List[str]
    # Defaulted to 0: construction sites such as get_scraping_session do not
    # supply this count, and a required field made those responses fail
    # validation with a 500.
    proxies_tested: int = 0
77
+
78
+
79
class SessionStatsResponse(BaseModel):
    """Response model for aggregate session statistics."""

    # Number of sessions ever recorded.
    total_sessions: int
    # Number of sessions currently running.
    active_sessions: int
    total_requests: int
    successful_requests: int
    # Average session duration in seconds.
    avg_session_duration: float
    total_data_bytes: int
    success_rate: float
89
+
90
+
91
class ProxySourceManagementResponse(BaseModel):
    """Response model for proxy-source management listings."""

    sources: List[Dict[str, Any]]
    total: int
    pending_approval: int
    # Defaulted to 0: get_proxy_sources never supplied this required field,
    # so every listing response failed validation.
    auto_discovered: int = 0
98
+
99
+
100
class AdvancedScrapingRequest(BaseModel):
    """Request/response model for the advanced scraping configuration.

    Also used as the response model of ``get_advanced_config``; all fields
    are optional so a partial update can be submitted.
    """

    enable_scheduler: bool = False
    schedule: Optional[Dict[str, Any]] = None
    enable_proxy_testing: bool = False
    proxy_rotation_enabled: bool = False
    max_proxies_per_source: Optional[int] = None
    test_urls: Optional[List[str]] = None
109
+
110
+
111
# Background tasks
# NOTE(review): fastapi.BackgroundTasks is request-scoped -- FastAPI injects a
# fresh instance per request and runs its tasks after the response is sent. A
# module-level instance like this never executes the tasks added to it, and it
# has no cancel_task/list_tasks API (both are called elsewhere in this file).
# Confirm and replace with asyncio.create_task or a real task manager.
background_tasks = BackgroundTasks()
113
+
114
+
115
# Global scraping service instance (lazily created on first use)
enhanced_service: Optional[EnhancedScrapingService] = None


def get_enhanced_service() -> EnhancedScrapingService:
    """Return the shared EnhancedScrapingService, creating it on first use."""
    global enhanced_service

    if enhanced_service is None:
        # First call: build the service with its default enhancement config.
        default_config = ScrapingEnhancementConfig(
            enable_proxy_rotation=True,
            max_concurrent_requests=10,
            rate_limit_per_second=5,
            enable_retry=True,
            max_retries=3,
            timeout_seconds=30,
        )
        enhanced_service = EnhancedScrapingService(config=default_config)

    return enhanced_service
134
+
135
+
136
# Global session manager
# Maps session_id -> in-flight ScrapingSession; entries are added by
# start_scraping_session and removed by stop_scraping_session.
active_sessions: Dict[str, ScrapingSession] = {}
138
+
139
+
140
@router.get("/scraping/config", response_model=ScrapingConfigResponse)
async def get_scraping_config() -> ScrapingConfigResponse:
    """Return the global and per-module scraping configuration."""
    service = get_enhanced_service()

    global_config = service.config_manager.get_global_config()

    module_configs = {
        module_name: service.config_manager.get_config(module_name)
        for module_name in (
            "github_grabber",
            "subscription_grabber",
            "advanced_grabber",
        )
    }

    # The response model declares ``active_sessions`` as List[Dict]; the
    # original built a ``{session_id: session_id}`` mapping, which failed
    # response validation.
    session_summaries = [
        {"session_id": session_id} for session_id in active_sessions
    ]

    rate_limiter_status = service.config_manager.config.get("global", {})

    performance_stats = (
        service.performance_monitor.get_overall_stats()
        if service.performance_monitor
        else {}
    )

    return ScrapingConfigResponse(
        global_config=global_config,
        module_configs=module_configs,
        active_sessions=session_summaries,
        rate_limiter_status=rate_limiter_status,
        performance_stats=performance_stats,
    )
171
+
172
+
173
@router.post("/scraping/config/{module_name}", response_model=dict)
async def update_scraping_config(
    request: ScrapingConfigRequest,
    module_name: str,
    db: AsyncSession = Depends(get_db),
) -> Dict[str, Any]:
    """Update the scraping configuration for a specific module.

    Raises:
        HTTPException 400: unknown module name or unknown setting key.
        HTTPException 500: the config manager rejected or failed the update.
    """
    service = get_enhanced_service()

    known_modules = {"github_grabber", "subscription_grabber", "advanced_grabber"}
    if not module_name or module_name not in known_modules:
        raise HTTPException(status_code=400, detail="Invalid module name")

    # Whitelist of setting keys admins may change.
    allowed_settings = {
        "max_concurrent_requests",
        "default_timeout",
        "max_retries",
        "retry_delay",
        "github_timeout",
        "github_max_retries",
        "subscription_timeout",
        "subscription_max_retries",
        "enable_base64_padding_fix",
        "supported_formats",
        "enable_batching",
        "batch_size",
        "min_proxy_quality",
        "enable_duplicate_filtering",
        "enable_user_agent_rotation",
        "enable_proxy_rotation",
        "max_proxies_per_source",
        "success_rate_threshold",
    }
    for key in request.settings:
        if key not in allowed_settings:
            raise HTTPException(status_code=400, detail=f"Invalid setting: {key}")

    try:
        success = await service.config_manager.update_config(
            module_name, request.settings
        )
        if not success:
            raise HTTPException(
                status_code=500, detail=f"Gagal memperbarui konfigurasi {module_name}"
            )
    except HTTPException:
        # Re-raise our own HTTP errors; the original broad ``except Exception``
        # re-wrapped them into a generic 500.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error memperbarui konfigurasi: {str(e)}"
        )

    return {
        "message": f"Konfigurasi {module_name} berhasil diperbarui",
        "success": True,
    }
233
+
234
+
235
@router.post("/scraping/start-session", response_model=dict)
async def start_scraping_session(
    request: Dict[str, Any],
    db: AsyncSession = Depends(get_db),
) -> Dict[str, Any]:
    """Start a new scraping session in the background.

    Raises:
        HTTPException 400: when ``source_config`` is missing from the body.
    """
    service = get_enhanced_service()

    module_configs = {
        module_name: service.config_manager.get_config(module_name)
        for module_name in (
            "github_grabber",
            "subscription_grabber",
            "advanced_grabber",
        )
    }

    source_config = request.get("source_config", {})
    if not source_config:
        raise HTTPException(status_code=400, detail="Source config required")

    session_id = generate_session_id()
    session = ScrapingSession(session_id=session_id, start_time=datetime.now())

    # The original passed a coroutine (not func + args) to a module-level
    # fastapi.BackgroundTasks, which is request-scoped and never executes
    # tasks added outside a request; schedule on the running loop instead.
    task = asyncio.create_task(
        service.run_scraping_session(
            session=session, source_config=source_config, module_configs=module_configs
        ),
        name=f"scraping-session-{session_id[:8]}",
    )
    # Keep a handle so stop_scraping_session can cancel the task later.
    session.task = task

    active_sessions[session_id] = session

    return {
        "message": f"Sesi scraping dimulai dengan ID: {session_id}",
        "session_id": session_id,
        "task_id": task.get_name(),
    }
273
+
274
+
275
@router.get("/scraping/sessions/{session_id}", response_model=SessionResponse)
async def get_scraping_session(
    session_id: str,
    db: AsyncSession = Depends(get_db),
) -> SessionResponse:
    """Return the details of an active scraping session.

    Raises:
        HTTPException 404: when the session id is not currently active.
    """
    if session_id not in active_sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = active_sessions[session_id]

    duration = (
        (session.end_time - session.start_time).total_seconds()
        if session.end_time
        else None
    )

    # Pass datetimes directly: the model declares ``datetime`` fields, so the
    # original isoformat() round-trip was redundant.
    return SessionResponse(
        session_id=session_id,
        start_time=session.start_time,
        end_time=session.end_time,
        duration=duration,
        requests_made=session.requests_made,
        successful_requests=session.successful_requests,
        failed_requests=session.failed_requests,
        success_rate=session.success_rate,
        total_data_bytes=session.total_data_bytes,
        avg_response_time=session.avg_response_time,
        proxies_used=session.proxies_used,
        # Required by the model; the original omitted it and the response
        # failed validation.
        proxies_tested=session.proxies_tested,
    )
303
+
304
+
305
@router.post("/scraping/sessions/{session_id}/stop", response_model=dict)
async def stop_scraping_session(
    session_id: str,
    db: AsyncSession = Depends(get_db),
) -> Dict[str, Any]:
    """Stop an active scraping session and persist its final state.

    Raises:
        HTTPException 404: when the session id is not currently active.
    """
    if session_id not in active_sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = active_sessions[session_id]

    # Cancel the background asyncio task if one was recorded for this
    # session. (fastapi.BackgroundTasks has no cancel API; the original
    # ``background_tasks.cancel_task`` call would have raised
    # AttributeError.)
    task = getattr(session, "task", None)
    if task is not None and not task.done():
        task.cancel()

    # Mark session as ended.
    session.end_time = datetime.now()

    # Persist the final session state.
    await extended_db_storage.update_scraping_session(
        session=db,
        session_id=session.session_id,
        status="completed",
        proxies_found=len(session.proxies_used),
        proxies_valid=session.proxies_tested,
    )

    # Remove from the active set.
    del active_sessions[session_id]

    return {"message": f"Sesi {session_id} dihentikan", "session_id": session_id}
339
+
340
+
341
@router.get("/scraping/stats/overview", response_model=dict)
async def get_scraping_overview(db: AsyncSession = Depends(get_db)) -> Dict[str, Any]:
    """Return an overview of scraping statistics."""
    service = get_enhanced_service()

    overview_stats = service.performance_monitor.get_overall_stats()

    total_sources = await db_storage.count_sources(db)
    hunter_stats = await extended_db_storage.get_hunter_statistics(db)
    pending_approval = hunter_stats.get("total_candidates", 0)

    # TODO(review): total session count is not tracked anywhere yet.
    total_sessions = 0
    # ``active_sessions`` is the module-level dict; the original used the
    # dict itself in arithmetic (``total_sessions - active_sessions``),
    # which raised TypeError.
    active_count = len(active_sessions)

    total_proxies = await db_storage.count_proxies(db)
    validated_proxies = await db_storage.count_proxies(
        db, validation_status="validated"
    )

    validation_rate = (
        (validated_proxies / total_proxies) * 100 if total_proxies > 0 else 0
    )

    return {
        "proxy_sources": {
            "total": total_sources,
            "auto_discovered": pending_approval,
            "manual": total_sources - pending_approval,
        },
        "sessions": {
            "total": total_sessions,
            "active": active_count,
            # Clamp: total_sessions is a placeholder 0, so the difference
            # could otherwise go negative.
            "completed": max(total_sessions - active_count, 0),
        },
        "validation": {
            "total_proxies": total_proxies,
            "validated": validated_proxies,
            "validation_rate": validation_rate,
        },
        "performance": overview_stats,
    }
381
+
382
+
383
@router.get(
    "/scraping/stats/sessions/{session_id}", response_model=SessionStatsResponse
)
async def get_session_stats(session_id: str) -> SessionStatsResponse:
    """Return detailed statistics for a specific active session.

    Raises:
        HTTPException 404: when the session id is not currently active.
    """
    if session_id not in active_sessions:
        raise HTTPException(status_code=404, detail="Session not found")

    session = active_sessions[session_id]

    session_start = session.start_time
    session_end = session.end_time or datetime.now()
    duration = (session_end - session_start).total_seconds()

    # Build the response with the fields SessionStatsResponse actually
    # declares. The original passed per-session keywords (session_id,
    # start_time, ...) the model does not define, so every required field
    # was missing and validation always failed.
    return SessionStatsResponse(
        total_sessions=1,
        active_sessions=0 if session.end_time else 1,
        total_requests=session.requests_made,
        successful_requests=session.successful_requests,
        avg_session_duration=duration,
        total_data_bytes=session.total_data_bytes,
        success_rate=session.success_rate,
    )
413
+
414
+
415
@router.get("/scraping/proxy-sources", response_model=ProxySourceManagementResponse)
async def get_proxy_sources(
    db: AsyncSession = Depends(get_db),
    status: Optional[str] = None,
    limit: int = 50,
    offset: int = 0,
) -> ProxySourceManagementResponse:
    """List proxy sources with management metadata.

    ``limit``/``offset`` paginate the listing (the original accepted both
    parameters but ignored them); ``total`` still reflects the full count.
    """
    sources = await db_storage.get_sources(db, enabled_only=(status == "enabled"))

    if status == "pending":
        hunter_stats = await extended_db_storage.get_hunter_statistics(db)
        pending_approval = (
            hunter_stats.get("status_stats", {}).get("pending", {}).get("count", 0)
        )
    else:
        pending_approval = 0

    total = len(sources)
    page = sources[offset : offset + limit]

    return ProxySourceManagementResponse(
        sources=[
            {
                "id": source.id,
                "url": source.url,
                "type": source.type.value,
                "name": source.name,
                "description": source.description,
                "enabled": source.enabled,
                "validated": source.validated,
                "validation_error": source.validation_error,
                "total_scraped": source.total_scraped,
                "success_rate": source.success_rate,
                "created_at": source.created_at.isoformat(),
                "updated_at": source.updated_at.isoformat(),
                "is_admin_source": source.is_admin_source,
                "candidate_id": source.candidate_id,
            }
            for source in page
        ],
        total=total,
        pending_approval=pending_approval,
        # Required by the model; the original omitted it. Heuristic: a
        # source linked to a hunter candidate was auto-discovered -- TODO
        # confirm this mapping.
        auto_discovered=sum(1 for source in sources if source.candidate_id),
    )
458
+
459
+
460
@router.post("/scraping/proxy-sources", response_model=dict)
async def create_proxy_source(
    request: Dict[str, Any],
    db: AsyncSession = Depends(get_db),
) -> Dict[str, Any]:
    """Create a new manually-added proxy source.

    Raises:
        HTTPException 400: missing/invalid URL or blocked domain.
        HTTPException 500: database failure while persisting the source.
    """
    # Input validation runs outside the try block: the original's broad
    # ``except Exception`` re-wrapped these 400 responses as generic 500s.
    url = request.get("url")
    if not url:
        raise HTTPException(status_code=400, detail="URL is required")

    if not url.startswith(("http://", "https://")):
        raise HTTPException(
            status_code=400, detail="URL must start with http:// or https://"
        )

    domain = urlparse(url).netloc

    # Reject known-bogus domains.
    blocked_domains = ["example.com", "test.com"]
    if any(blocked in domain for blocked in blocked_domains):
        raise HTTPException(status_code=400, detail=f"Domain {domain} is blocked")

    source = ProxySource(
        url=url,
        type=SourceType.MANUAL,
        name=request.get("name", f"Source-{domain}"),
        description=request.get("description", ""),
        enabled=True,
        is_admin_source=False,
    )

    try:
        # Persist via the storage layer.
        await db_storage.create_proxy_source(db, source)
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error creating proxy source: {str(e)}"
        )

    return {"message": "Proxy source created successfully", "source_id": source.id}
506
+
507
+
508
@router.put("/scraping/proxy-sources/{source_id}", response_model=dict)
async def update_proxy_source(
    source_id: int,
    request: Dict[str, Any],
    db: AsyncSession = Depends(get_db),
) -> Dict[str, Any]:
    """Update editable fields of a proxy source.

    Only ``name``, ``description``, ``enabled`` and ``validated`` may be
    changed; any other keys in the body are ignored.

    Raises:
        HTTPException 404: unknown source id.
        HTTPException 500: database failure while committing.
    """
    result = await db.execute(select(ProxySource).where(ProxySource.id == source_id))
    source = result.scalar_one_or_none()

    if not source:
        raise HTTPException(status_code=404, detail="Source not found")

    for field in ("name", "description", "enabled", "validated"):
        if field in request:
            setattr(source, field, request[field])

    try:
        # The original referenced an undefined name ``session``; the
        # AsyncSession dependency here is ``db``.
        db.add(source)
        await db.commit()
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error updating proxy source: {str(e)}"
        )

    return {
        "message": f"Proxy source {source_id} updated successfully",
        "source_id": source.id,
    }
543
+
544
+
545
@router.delete("/scraping/proxy-sources/{source_id}", response_model=dict)
async def delete_proxy_source(
    source_id: int,
    db: AsyncSession = Depends(get_db),
) -> Dict[str, Any]:
    """Delete a proxy source.

    Raises:
        HTTPException 404: unknown source id.
        HTTPException 500: database failure while deleting.
    """
    result = await db.execute(select(ProxySource).where(ProxySource.id == source_id))
    source = result.scalar_one_or_none()

    if not source:
        raise HTTPException(status_code=404, detail="Source not found")

    try:
        # The original referenced an undefined name ``session``; the
        # AsyncSession dependency here is ``db``.
        await db.delete(source)
        await db.commit()
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error deleting proxy source: {str(e)}"
        )

    return {
        "message": f"Proxy source {source_id} deleted successfully",
        "source_id": source_id,
    }
574
+
575
+
576
@router.post("/scraping/proxy-sources/{source_id}/validate", response_model=dict)
async def validate_proxy_source(
    source_id: int,
    db: AsyncSession = Depends(get_db),
) -> Dict[str, Any]:
    """Trigger asynchronous validation of a proxy source.

    Raises:
        HTTPException 404: unknown source id.
        HTTPException 500: failure while scheduling validation.
    """
    service = get_enhanced_service()

    result = await db.execute(select(ProxySource).where(ProxySource.id == source_id))
    source = result.scalar_one_or_none()

    if not source:
        raise HTTPException(status_code=404, detail="Source not found")

    try:
        # Schedule on the running event loop; the module-level
        # fastapi.BackgroundTasks instance never executes tasks and its
        # add_task() returns None, so the original task_id was always None.
        # NOTE(review): the request-scoped ``db`` session is handed to a
        # task that outlives the request -- confirm the service opens its
        # own session instead.
        task = asyncio.create_task(
            service.validate_proxy_source(db=db, source_id=source_id),
            name=f"validate-source-{source_id}",
        )
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error starting validation: {str(e)}"
        )

    return {
        "message": f"Validation started for proxy source {source_id}",
        "task_id": task.get_name(),
    }
608
+
609
+
610
@router.get("/scraping/hunter", response_model=dict)
async def get_hunter_status(db: AsyncSession = Depends(get_db)) -> Dict[str, Any]:
    """Return the current Hunter Protocol status and statistics."""
    hunter_stats = await extended_db_storage.get_hunter_statistics(db)

    # fastapi.BackgroundTasks has no ``list_tasks`` (the original call would
    # have raised AttributeError); report the pending asyncio tasks spawned
    # by this module instead, identified by their name prefixes.
    active_tasks = [
        t.get_name()
        for t in asyncio.all_tasks()
        if not t.done()
        and t.get_name().startswith(("hunter-", "validate-", "scraping-"))
    ]

    return {
        "active_tasks": active_tasks,
        "statistics": hunter_stats,
        "candidates_found": hunter_stats.get("candidates_found", 0),
        "sources_approved": hunter_stats.get("sources_approved", 0),
        "approval_rate": hunter_stats.get("approval_rate", 0.0),
        "last_run": hunter_stats.get("last_run"),
    }
629
+
630
+
631
@router.post("/scraping/hunter/trigger", response_model=dict)
async def trigger_hunter_manual(
    db: AsyncSession = Depends(get_db),
) -> Dict[str, Any]:
    """Manually trigger the Hunter Protocol.

    Raises:
        HTTPException 500: failure while scheduling the hunter run.
    """
    service = get_enhanced_service()

    try:
        # Schedule on the running loop; the module-level BackgroundTasks
        # instance never executes tasks and its add_task() returns None.
        task = asyncio.create_task(
            service.run_hunter(),
            name=f"hunter-manual-trigger-{datetime.now().isoformat()}",
        )
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error triggering Hunter: {str(e)}"
        )

    return {"message": "Hunter Protocol triggered manually", "task_id": task.get_name()}
650
+
651
+
652
@router.get("/scraping/queue", response_model=dict)
async def get_request_queue(db: AsyncSession = Depends(get_db)) -> Dict[str, Any]:
    """Return a snapshot of the request queue and its rate limiter."""
    queue = get_enhanced_service().request_queue
    limiter = queue.limiter

    limiter_stats = {
        "domain_requests": limiter.domain_requests,
        "ip_requests": limiter.ip_requests,
        "global_rate_limiting": limiter.global_rate_limiting,
    }

    return {
        "queue_size": queue.size(),
        "active_requests": len(queue.active_requests),
        "completed_requests": len(queue.completed_requests),
        "failed_requests": len(queue.failed_requests),
        "global_requests": queue.global_requests,
        "rate_limiter_stats": limiter_stats,
    }
670
+
671
+
672
@router.post("/scraping/queue/clear", response_model=dict)
async def clear_request_queue(db: AsyncSession = Depends(get_db)) -> Dict[str, Any]:
    """Clear the pending request queue.

    Raises:
        HTTPException 500: when clearing the queue fails.
    """
    queue = get_enhanced_service().request_queue

    try:
        await queue.clear()
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error clearing queue: {str(e)}")

    return {"message": "Request queue cleared successfully"}
684
+
685
+
686
@router.get("/scraping/advanced-config", response_model=AdvancedScrapingRequest)
async def get_advanced_config() -> AdvancedScrapingRequest:
    """Return the advanced scraping configuration."""
    service = get_enhanced_service()

    # The advanced settings live under the "enhanced" key of the global config.
    enhanced = service.config_manager.get_global_config().get("enhanced", {})

    return AdvancedScrapingRequest(
        enable_scheduler=enhanced.get("enable_scheduler", False),
        schedule=enhanced.get("schedule"),
        enable_proxy_testing=enhanced.get("enable_proxy_testing", False),
        proxy_rotation_enabled=enhanced.get("enable_proxy_rotation", False),
        max_proxies_per_source=enhanced.get("max_proxies_per_source"),
        test_urls=enhanced.get("test_urls", []),
    )
702
+
703
+
704
@router.post("/scraping/advanced-config", response_model=dict)
async def update_advanced_config(
    request: AdvancedScrapingRequest,
    db: AsyncSession = Depends(get_db),
) -> Dict[str, Any]:
    """Update the advanced scraping configuration.

    Raises:
        HTTPException 500: the config manager rejected or failed the update.
    """
    service = get_enhanced_service()

    # ``AdvancedScrapingRequest`` has no ``settings`` attribute -- the
    # original ``request.settings`` raised AttributeError on every call.
    # Serialize only the fields the client actually submitted.
    settings = request.dict(exclude_unset=True)

    try:
        success = await service.config_manager.update_config("enhanced", settings)
        if not success:
            raise HTTPException(
                status_code=500, detail="Gagal memperbarui konfigurasi advanced"
            )
    except HTTPException:
        # Do not re-wrap our own HTTP errors in a generic 500.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error memperbarui konfigurasi advanced: {str(e)}"
        )

    return {"message": "Konfigurasi advanced berhasil diperbarui", "success": True}
727
+
728
+
729
# Module: app/db_storage extension
@router.get("/scraping/operations", response_model=dict)
async def get_scraping_operations(db: AsyncSession = Depends(get_db)) -> Dict[str, Any]:
    """Return the catalogue of available scraping operations."""
    # Operation name -> human-readable description.
    return dict(
        validate_proxy_source="validasi proxy source",
        run_hunter_protocol="jalankan Hunter Protocol",
        start_scraping_session="memulai sesi scraping",
        stop_scraping_session="hentikan sesi scraping",
        manage_proxy_sources="kelola proxy sources",
        view_scraping_stats="lihat statistik scraping",
        advanced_configuration="konfigurasi advanced",
        queue_management="kelola antrian request queue",
    )
746
+
747
+
748
@router.post("/scraping/operations/{operation}", response_model=dict)
async def execute_operation(
    operation: str,
    params: Dict[str, Any],
    db: AsyncSession = Depends(get_db),
) -> Dict[str, Any]:
    """Dispatch a named scraping operation to its endpoint handler.

    Raises:
        HTTPException 400: unknown operation name.
        HTTPException 500: the underlying handler failed unexpectedly.
    """
    try:
        # Call the handlers with their actual signatures; the original
        # passed an extra ``background_tasks`` argument that none of these
        # endpoints accept, which raised TypeError on every dispatch.
        if operation == "validate_proxy_source":
            return await validate_proxy_source(params["source_id"], db)
        elif operation == "run_hunter_protocol":
            return await trigger_hunter_manual(db)
        elif operation == "start_scraping_session":
            return await start_scraping_session(params, db)
        elif operation == "stop_scraping_session":
            return await stop_scraping_session(params["session_id"], db)
        elif operation == "clear_request_queue":
            return await clear_request_queue(db)
        else:
            raise HTTPException(
                status_code=400, detail=f"Invalid operation: {operation}"
            )
    except HTTPException:
        # Preserve 400/404s from the handlers instead of wrapping them.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500, detail=f"Error executing operation {operation}: {str(e)}"
        )