rdsarjito committed on
Commit
c4aea7e
·
1 Parent(s): 6afe076
Files changed (1) hide show
  1. app.py +177 -125
app.py CHANGED
@@ -362,8 +362,8 @@ def predict_single_url(url):
362
  screenshot_path = take_screenshot(url)
363
  if not screenshot_path:
364
  error_label = {"Error": 1.0, "Non-Gambling": 0.0, "Gambling": 0.0}
365
- error_msg = f"**❌ Error:** Unable to capture screenshot for `{url}`\n\n**Possible reasons:**\n• Too many redirects\n• Website blocking automated access\n• Network connectivity issues\n• Invalid URL"
366
- return error_label, error_msg, None, "", "", "**Model:** Screenshot capture failed"
367
 
368
  text = extract_text_from_image(screenshot_path)
369
  raw_text = text # Store raw text before cleaning
@@ -389,9 +389,10 @@ def predict_single_url(url):
389
  }
390
 
391
  confidence = gambling_prob if is_gambling else non_gambling_prob
392
- confidence_md = f"**Confidence:** {confidence:.1%}\n\n**Model Used:** Image-Only Model (EfficientNet-B3)\n\n**Prediction:** {'🟥 Gambling' if is_gambling else '🟩 Non-Gambling'}"
 
393
 
394
- model_info = f"**Model Type:** Image-Only\n**Architecture:** EfficientNet-B3\n**Gambling Probability:** {gambling_prob:.1%}\n**Non-Gambling Probability:** {non_gambling_prob:.1%}"
395
 
396
  print(f"[Image-Only] URL: {url}")
397
  print(f"Prediction: {'Gambling' if is_gambling else 'Non-Gambling'} | Confidence: {confidence:.2f}\n")
@@ -431,16 +432,17 @@ def predict_single_url(url):
431
  image_weight = 0.5
432
  text_weight = 0.5
433
 
434
- confidence_md = f"**Confidence:** {confidence:.1%}\n\n**Model Used:** Fusion Model (Image + Text)\n\n**Prediction:** {'🟥 Gambling' if is_gambling else '🟩 Non-Gambling'}"
 
435
 
436
- model_info = f"""**Model Type:** Fusion Model (MLP)
437
  **Image Model:** EfficientNet-B3
438
  **Text Model:** IndoBERT
439
 
440
- **Individual Predictions:**
441
- - 🖼️ Image Model: {image_probs[0].item():.1%}
442
- - 📝 Text Model: {text_probs[0].item():.1%}
443
- - 🔗 Fusion Result: {gambling_prob:.1%}"""
444
 
445
  # ✨ Log detail
446
  print(f"[Fusion Model] URL: {url}")
@@ -471,143 +473,190 @@ def predict_batch_urls(file_obj):
471
 
472
  # --- Gradio App ---
473
 
474
- # Custom CSS for professional styling
475
  custom_css = """
476
- .main-header {
477
- text-align: center;
478
- padding: 2rem 0;
479
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
480
- color: white;
481
- border-radius: 10px;
482
- margin-bottom: 2rem;
483
  }
484
- .main-header h1 {
 
 
 
485
  margin: 0;
486
- font-size: 2.5rem;
487
- font-weight: 700;
488
  }
489
- .main-header p {
490
- margin: 0.5rem 0 0 0;
491
- font-size: 1.1rem;
492
- opacity: 0.9;
493
  }
494
- .result-card {
495
- background: #f8f9fa;
496
- padding: 1.5rem;
497
- border-radius: 10px;
498
- border: 2px solid #e9ecef;
499
- margin: 1rem 0;
500
  }
501
- .info-box {
502
- background: #e7f3ff;
503
- padding: 1rem;
504
  border-radius: 8px;
505
- border-left: 4px solid #2196F3;
506
- margin: 1rem 0;
507
  }
508
- .success-box {
509
- background: #d4edda;
510
- border-left-color: #28a745;
 
 
 
 
511
  }
512
- .warning-box {
513
- background: #fff3cd;
514
- border-left-color: #ffc107;
 
 
515
  }
516
- .gradio-container {
517
- max-width: 1200px;
518
- margin: 0 auto;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
519
  }
520
  """
521
 
522
- with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Gambling Website Detector") as app:
523
- # Header Section
524
  with gr.Row():
525
  gr.HTML("""
526
- <div class="main-header">
527
- <h1>🕵️ Gambling Website Detection System</h1>
528
- <p>AI-Powered URL Analysis using Deep Learning Fusion Model</p>
 
 
529
  </div>
530
  """)
531
 
532
- # Info Section
533
  with gr.Row():
534
- gr.Markdown("""
535
- ### 📋 About This Tool
536
-
537
- This advanced detection system uses a **fusion model** combining:
538
- - 🖼️ **Image Analysis**: EfficientNet-B3 for visual content detection
539
- - 📝 **Text Analysis**: IndoBERT for Indonesian text understanding
540
- - 🔗 **Fusion Learning**: Intelligent combination of both modalities
541
-
542
- Simply enter a website URL to analyze whether it contains gambling-related content.
543
- """)
544
 
545
  with gr.Tabs():
546
- with gr.Tab("🔍 Single URL Analysis", id="single"):
547
  with gr.Row():
548
- with gr.Column(scale=2):
549
- gr.Markdown("### Enter Website URL")
 
 
 
 
 
 
 
 
 
 
550
  url_input = gr.Textbox(
551
- label="Website URL",
552
  placeholder="https://example.com",
553
- info="Enter the full URL of the website you want to analyze",
554
- lines=1
555
  )
556
  predict_button = gr.Button(
557
- "🔎 Analyze Website",
558
  variant="primary",
559
  size="lg"
560
  )
561
 
562
- gr.Markdown("---")
 
 
 
 
 
 
 
 
563
 
564
- # Results Section
565
  with gr.Row():
566
  with gr.Column(scale=1):
567
- gr.Markdown("### 📊 Detection Results")
568
  label_output = gr.Label(
569
- label="Prediction Result",
570
  value={"Gambling": 0.0, "Non-Gambling": 0.0},
571
- num_top_classes=2
 
572
  )
573
  confidence_output = gr.Markdown(
574
- value="**Confidence:** Waiting for analysis...",
575
- label="Confidence Score"
 
576
  )
577
  model_info_output = gr.Markdown(
578
  value="",
579
- label="Model Information"
 
580
  )
581
 
582
  with gr.Column(scale=1):
583
- gr.Markdown("### 📸 Website Screenshot")
584
  screenshot_output = gr.Image(
585
- label="Captured Screenshot",
586
  type="filepath",
587
- height=400
 
588
  )
589
 
590
- gr.Markdown("---")
591
-
592
- # Text Analysis Section
593
- with gr.Accordion("📝 Text Analysis Details", open=False):
594
- with gr.Row():
595
- with gr.Column():
596
- gr.Markdown("#### Raw OCR Text")
597
- raw_text_output = gr.Textbox(
598
- label="Extracted Text (Raw)",
599
- lines=8,
600
- interactive=False,
601
- placeholder="Raw text extracted from the screenshot will appear here..."
602
- )
603
- with gr.Column():
604
- gr.Markdown("#### Processed Text")
605
- cleaned_text_output = gr.Textbox(
606
- label="Cleaned Text (Processed)",
607
- lines=8,
608
- interactive=False,
609
- placeholder="Processed and cleaned text will appear here..."
610
- )
611
 
612
  predict_button.click(
613
  fn=predict_single_url,
@@ -622,36 +671,40 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Gambling Website D
622
  ]
623
  )
624
 
625
- with gr.Tab("📦 Batch URL Analysis", id="batch"):
626
- gr.Markdown("""
627
- ### Batch Processing
628
-
629
- Upload a text file containing multiple URLs (one per line) to analyze them all at once.
630
- The results will be displayed in a table format.
631
- """)
 
 
 
 
632
 
633
  with gr.Row():
634
  with gr.Column():
635
  file_input = gr.File(
636
- label="Upload URL File (.txt)",
637
- file_types=[".txt"]
 
638
  )
639
- gr.Markdown("💡 **Tip:** Upload a .txt file with one URL per line")
640
  batch_predict_button = gr.Button(
641
- "🚀 Process Batch",
642
  variant="primary",
643
  size="lg"
644
  )
645
 
646
- gr.Markdown("---")
647
-
648
  with gr.Row():
649
- gr.Markdown("### 📋 Batch Results")
650
- batch_output = gr.DataFrame(
651
- label="Analysis Results",
652
- wrap=True,
653
- interactive=False
654
- )
 
655
 
656
  batch_predict_button.click(
657
  fn=predict_batch_urls,
@@ -660,12 +713,11 @@ with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Gambling Website D
660
  )
661
 
662
  # Footer
663
- gr.Markdown("---")
664
- gr.Markdown("""
665
- <div style="text-align: center; color: #666; padding: 1rem;">
666
- <p>Powered by PyTorch Gradio EfficientNet IndoBERT</p>
667
- <p style="font-size: 0.9rem;">⚠️ This tool is for educational and research purposes only</p>
668
- </div>
669
  """)
670
 
671
  app.launch()
 
362
  screenshot_path = take_screenshot(url)
363
  if not screenshot_path:
364
  error_label = {"Error": 1.0, "Non-Gambling": 0.0, "Gambling": 0.0}
365
+ error_msg = f"**Gagal mengambil screenshot**\n\nURL: `{url}`\n\n**Kemungkinan penyebab:**\n• Terlalu banyak redirect\n• Website memblokir akses otomatis\n• Masalah koneksi jaringan\n• URL tidak valid"
366
+ return error_label, error_msg, None, "", "", "**Status:** Gagal mengambil screenshot"
367
 
368
  text = extract_text_from_image(screenshot_path)
369
  raw_text = text # Store raw text before cleaning
 
389
  }
390
 
391
  confidence = gambling_prob if is_gambling else non_gambling_prob
392
+ result_text = "Gambling" if is_gambling else "Non-Gambling"
393
+ confidence_md = f"**Tingkat Keyakinan:** {confidence:.1%}\n\n**Model:** Image-Only (EfficientNet-B3)\n\n**Hasil:** {result_text}"
394
 
395
+ model_info = f"**Tipe Model:** Image-Only\n**Arsitektur:** EfficientNet-B3\n**Probabilitas Gambling:** {gambling_prob:.1%}\n**Probabilitas Non-Gambling:** {non_gambling_prob:.1%}"
396
 
397
  print(f"[Image-Only] URL: {url}")
398
  print(f"Prediction: {'Gambling' if is_gambling else 'Non-Gambling'} | Confidence: {confidence:.2f}\n")
 
432
  image_weight = 0.5
433
  text_weight = 0.5
434
 
435
+ result_text = "Gambling" if is_gambling else "Non-Gambling"
436
+ confidence_md = f"**Tingkat Keyakinan:** {confidence:.1%}\n\n**Model:** Fusion Model (Image + Text)\n\n**Hasil:** {result_text}"
437
 
438
+ model_info = f"""**Tipe Model:** Fusion Model (MLP)
439
  **Image Model:** EfficientNet-B3
440
  **Text Model:** IndoBERT
441
 
442
+ **Prediksi Individual:**
443
+ - Image Model: {image_probs[0].item():.1%}
444
+ - Text Model: {text_probs[0].item():.1%}
445
+ - Hasil Fusion: {gambling_prob:.1%}"""
446
 
447
  # ✨ Log detail
448
  print(f"[Fusion Model] URL: {url}")
 
473
 
474
  # --- Gradio App ---
475
 
476
+ # Custom CSS - Tokopedia style
477
  custom_css = """
478
+ .header-container {
479
+ background: #fff;
480
+ border-bottom: 1px solid #e5e5e5;
481
+ padding: 20px 0;
482
+ margin-bottom: 30px;
 
 
483
  }
484
+ .header-title {
485
+ font-size: 24px;
486
+ font-weight: 600;
487
+ color: #333;
488
  margin: 0;
489
+ padding: 0;
 
490
  }
491
+ .header-subtitle {
492
+ font-size: 14px;
493
+ color: #666;
494
+ margin: 5px 0 0 0;
495
  }
496
+ .content-container {
497
+ max-width: 1200px;
498
+ margin: 0 auto;
499
+ padding: 0 20px;
 
 
500
  }
501
+ .card {
502
+ background: #fff;
503
+ border: 1px solid #e5e5e5;
504
  border-radius: 8px;
505
+ padding: 24px;
506
+ margin-bottom: 20px;
507
  }
508
+ .section-title {
509
+ font-size: 18px;
510
+ font-weight: 600;
511
+ color: #333;
512
+ margin: 0 0 20px 0;
513
+ padding-bottom: 12px;
514
+ border-bottom: 2px solid #42b549;
515
  }
516
+ .info-text {
517
+ font-size: 14px;
518
+ color: #666;
519
+ line-height: 1.6;
520
+ margin: 0;
521
  }
522
+ .button-primary {
523
+ background: #42b549;
524
+ color: #fff;
525
+ border: none;
526
+ padding: 12px 32px;
527
+ border-radius: 4px;
528
+ font-weight: 500;
529
+ cursor: pointer;
530
+ }
531
+ .button-primary:hover {
532
+ background: #3aa040;
533
+ }
534
+ .result-box {
535
+ background: #f8f9fa;
536
+ border: 1px solid #e5e5e5;
537
+ border-radius: 8px;
538
+ padding: 20px;
539
+ margin: 15px 0;
540
+ }
541
+ .footer-text {
542
+ text-align: center;
543
+ color: #999;
544
+ font-size: 12px;
545
+ padding: 20px 0;
546
+ border-top: 1px solid #e5e5e5;
547
+ margin-top: 40px;
548
  }
549
  """
550
 
551
+ with gr.Blocks(theme=gr.themes.Default(), css=custom_css, title="Gambling Website Detector") as app:
552
+ # Header
553
  with gr.Row():
554
  gr.HTML("""
555
+ <div class="header-container">
556
+ <div class="content-container">
557
+ <h1 class="header-title">Gambling Website Detector</h1>
558
+ <p class="header-subtitle">Analisis website untuk mendeteksi konten perjudian menggunakan teknologi deep learning</p>
559
+ </div>
560
  </div>
561
  """)
562
 
563
+ # Main Content
564
  with gr.Row():
565
+ with gr.Column():
566
+ gr.HTML("""
567
+ <div class="content-container">
568
+ <div class="card">
569
+ <p class="info-text">
570
+ Sistem ini menggunakan model fusion yang menggabungkan analisis gambar dan teks untuk mendeteksi konten perjudian pada website. Masukkan URL website yang ingin dianalisis.
571
+ </p>
572
+ </div>
573
+ </div>
574
+ """)
575
 
576
  with gr.Tabs():
577
+ with gr.Tab("Analisis URL", id="single"):
578
  with gr.Row():
579
+ with gr.Column():
580
+ gr.HTML("""
581
+ <div class="content-container">
582
+ <div class="card">
583
+ <h2 class="section-title">Masukkan URL Website</h2>
584
+ <p class="info-text" style="margin-bottom: 20px;">Masukkan URL lengkap website yang ingin dianalisis. Sistem akan mengambil screenshot dan menganalisis kontennya.</p>
585
+ </div>
586
+ </div>
587
+ """)
588
+
589
+ with gr.Row():
590
+ with gr.Column():
591
  url_input = gr.Textbox(
592
+ label="URL Website",
593
  placeholder="https://example.com",
594
+ lines=1,
595
+ container=False
596
  )
597
  predict_button = gr.Button(
598
+ "Analisis Website",
599
  variant="primary",
600
  size="lg"
601
  )
602
 
603
+ with gr.Row():
604
+ with gr.Column():
605
+ gr.HTML("""
606
+ <div class="content-container">
607
+ <div class="card">
608
+ <h2 class="section-title">Hasil Analisis</h2>
609
+ </div>
610
+ </div>
611
+ """)
612
 
 
613
  with gr.Row():
614
  with gr.Column(scale=1):
 
615
  label_output = gr.Label(
616
+ label="Hasil Prediksi",
617
  value={"Gambling": 0.0, "Non-Gambling": 0.0},
618
+ num_top_classes=2,
619
+ container=False
620
  )
621
  confidence_output = gr.Markdown(
622
+ value="",
623
+ label="Tingkat Keyakinan",
624
+ container=False
625
  )
626
  model_info_output = gr.Markdown(
627
  value="",
628
+ label="Informasi Model",
629
+ container=False
630
  )
631
 
632
  with gr.Column(scale=1):
 
633
  screenshot_output = gr.Image(
634
+ label="Screenshot Website",
635
  type="filepath",
636
+ height=400,
637
+ container=False
638
  )
639
 
640
+ with gr.Row():
641
+ with gr.Column():
642
+ with gr.Accordion("Detail Analisis Teks", open=False):
643
+ with gr.Row():
644
+ with gr.Column():
645
+ raw_text_output = gr.Textbox(
646
+ label="Teks Mentah (Raw OCR)",
647
+ lines=6,
648
+ interactive=False,
649
+ placeholder="Teks yang diekstrak dari screenshot akan muncul di sini...",
650
+ container=False
651
+ )
652
+ with gr.Column():
653
+ cleaned_text_output = gr.Textbox(
654
+ label="Teks yang Diproses",
655
+ lines=6,
656
+ interactive=False,
657
+ placeholder="Teks yang sudah dibersihkan akan muncul di sini...",
658
+ container=False
659
+ )
 
660
 
661
  predict_button.click(
662
  fn=predict_single_url,
 
671
  ]
672
  )
673
 
674
+ with gr.Tab("Analisis Batch", id="batch"):
675
+ with gr.Row():
676
+ with gr.Column():
677
+ gr.HTML("""
678
+ <div class="content-container">
679
+ <div class="card">
680
+ <h2 class="section-title">Analisis Multiple URL</h2>
681
+ <p class="info-text">Upload file teks (.txt) yang berisi beberapa URL (satu URL per baris) untuk dianalisis sekaligus. Hasil akan ditampilkan dalam format tabel.</p>
682
+ </div>
683
+ </div>
684
+ """)
685
 
686
  with gr.Row():
687
  with gr.Column():
688
  file_input = gr.File(
689
+ label="Upload File URL (.txt)",
690
+ file_types=[".txt"],
691
+ container=False
692
  )
693
+ gr.Markdown("**Format file:** Satu URL per baris", container=False)
694
  batch_predict_button = gr.Button(
695
+ "Proses Batch",
696
  variant="primary",
697
  size="lg"
698
  )
699
 
 
 
700
  with gr.Row():
701
+ with gr.Column():
702
+ batch_output = gr.DataFrame(
703
+ label="Hasil Analisis",
704
+ wrap=True,
705
+ interactive=False,
706
+ container=False
707
+ )
708
 
709
  batch_predict_button.click(
710
  fn=predict_batch_urls,
 
713
  )
714
 
715
  # Footer
716
+ gr.HTML("""
717
+ <div class="footer-text">
718
+ <p>Powered by PyTorch Gradio EfficientNet • IndoBERT</p>
719
+ <p style="margin-top: 8px;">Tool ini untuk keperluan edukasi dan penelitian</p>
720
+ </div>
 
721
  """)
722
 
723
  app.launch()