Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import os | |
| import re | |
| import time | |
| import torch | |
| import torch.nn as nn | |
| from PIL import Image | |
| import requests | |
| import easyocr | |
| from transformers import AutoTokenizer, AutoModel | |
| from torchvision import transforms | |
| from torchvision import models | |
| from torchvision.transforms import functional as F | |
| import pandas as pd | |
| from huggingface_hub import hf_hub_download | |
| import warnings | |
| warnings.filterwarnings("ignore") | |
# --- Setup ---
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(f"Using device: {device}")

# Tokenizer for the text branch, loaded from the IndoBERT checkpoint by name.
tokenizer = AutoTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
# Image transformation
class ResizePadToSquare:
    """Fit an image inside a square canvas and pad the remainder with black.

    The image is converted to RGB, shrunk with ``Image.thumbnail`` so that
    neither side exceeds ``target_size`` (aspect ratio is kept), and then
    padded with zeros so the result is exactly ``target_size`` on both sides.

    Args:
        target_size: Side length, in pixels, of the square output.
    """

    def __init__(self, target_size=300):
        self.target_size = target_size

    def __call__(self, img):
        """Return ``img`` as an RGB image of size ``target_size`` x ``target_size``."""
        side = self.target_size
        rgb = img.convert("RGB")
        # thumbnail() works in place on the converted image.
        rgb.thumbnail((side, side), Image.BILINEAR)
        width, height = rgb.size
        # Split the missing pixels as evenly as possible; any odd pixel goes
        # to the right / bottom edge.
        left = (side - width) // 2
        top = (side - height) // 2
        right = (side - width) - left
        bottom = (side - height) - top
        # torchvision's pad takes (left, top, right, bottom).
        return F.pad(rgb, (left, top, right, bottom), fill=0, padding_mode='constant')
# Per-channel normalisation constants applied after ToTensor().
# NOTE(review): these look like the usual ImageNet statistics - confirm they
# match what the image model was trained with.
_NORMALIZE_MEAN = [0.485, 0.456, 0.406]
_NORMALIZE_STD = [0.229, 0.224, 0.225]

# Preprocessing pipeline shared by every image fed to the models:
# square resize/pad -> tensor -> channel normalisation.
transform = transforms.Compose([
    ResizePadToSquare(300),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORMALIZE_MEAN, std=_NORMALIZE_STD),
])

# Screenshot folder (created on import if it does not exist yet)
SCREENSHOT_DIR = "screenshots"
os.makedirs(SCREENSHOT_DIR, exist_ok=True)

# Create OCR reader for Indonesian ('id'); used by the EasyOCR fallback.
reader = easyocr.Reader(['id'])
print("OCR reader initialized.")
# --- Model ---
class TextModelWithClassifier(nn.Module):
    """Encoder plus a single-logit linear head for binary text classification.

    Args:
        base_model: Encoder module. It must expose ``config.hidden_size`` and
            return an object with ``last_hidden_state`` (and optionally
            ``pooler_output``) when called with ``input_ids`` and
            ``attention_mask``.
    """

    def __init__(self, base_model):
        super().__init__()
        # The attribute must stay named 'bert' so parameter names match the
        # keys of the saved state_dict.
        self.bert = base_model
        self.classifier = nn.Linear(base_model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return an object whose ``logits`` attribute holds the head output.

        The pooled encoder output is used when the encoder provides one;
        otherwise the hidden state at position 0 of each sequence is used.
        """
        from types import SimpleNamespace

        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The attribute can exist and still be None (encoder built without a
        # pooling layer), so test the value rather than just its presence.
        pooled_output = getattr(outputs, 'pooler_output', None)
        if pooled_output is None:
            pooled_output = outputs.last_hidden_state[:, 0]
        logits = self.classifier(pooled_output)
        # Lightweight stand-in for a Hugging Face output object: callers only
        # read `.logits`.
        return SimpleNamespace(logits=logits)
class LateFusionModel(nn.Module):
    """Late-fusion classifier: a small MLP over the image and text logits.

    Args:
        image_model: Module mapping an image batch to one logit per sample.
        text_model: Module that, called with ``input_ids`` and
            ``attention_mask``, returns an object with a ``logits`` attribute
            holding one logit per sample.
    """

    def __init__(self, image_model, text_model):
        super().__init__()
        self.image_model = image_model
        self.text_model = text_model

        # MLP fusion head. Layer positions must not change: the saved
        # checkpoint addresses the two Linear layers as fusion_mlp.0
        # (weight [16, 2]) and fusion_mlp.3 (weight [1, 16]).
        fusion_width = 16
        fusion_layers = [
            nn.Linear(2, fusion_width),   # index 0
            nn.ReLU(),                    # index 1 (no params)
            nn.Dropout(0.1),              # index 2 (no params)
            nn.Linear(fusion_width, 1),   # index 3
        ]
        self.fusion_mlp = nn.Sequential(*fusion_layers)

    def forward(self, images, input_ids, attention_mask):
        """Return ``(fused_logits, image_logits, text_logits, weights)``.

        ``weights`` is a constant ``[0.5, 0.5]`` tensor kept only so callers
        that expect four return values keep working.
        """
        # The two branch models are evaluated without gradient tracking.
        with torch.no_grad():
            image_logits = self.image_model(images).squeeze(1)
            text_output = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
            text_logits = text_output.logits.squeeze(1)

        # One row per sample: [image_logit, text_logit].
        logit_pairs = torch.stack([image_logits, text_logits], dim=1)
        fused_logits = self.fusion_mlp(logit_pairs).squeeze(1)

        # For compatibility, create dummy weights
        weights = torch.tensor([0.5, 0.5], device=fused_logits.device)
        return fused_logits, image_logits, text_logits, weights
# Load Fusion Model
def _load_fusion_weights(model, checkpoint_path, success_message):
    """Load a state_dict file into ``model``, strictly if possible.

    A strict load is tried first. If it raises ``RuntimeError`` (key
    mismatch), the load is retried with ``strict=False`` and the keys that
    were skipped are printed so a partially initialised model is visible in
    the logs. The state dict is local to this function, so it is released as
    soon as the weights have been copied into the model.
    """
    # NOTE(security): torch.load unpickles the file, and the checkpoint may
    # have been downloaded from the Hub. If the installed PyTorch supports it,
    # passing weights_only=True would restrict what unpickling can do.
    state_dict = torch.load(checkpoint_path, map_location=device)
    try:
        model.load_state_dict(state_dict, strict=True)
        print(success_message)
    except RuntimeError as e:
        print("Warning: Some keys didn't match. Trying with strict=False...")
        print(f"Error details: {str(e)[:500]}")
        load_result = model.load_state_dict(state_dict, strict=False)
        print("Fusion model loaded with strict=False (some keys may be missing)")
        print(f"Missing keys: {list(load_result.missing_keys)}")
        print(f"Unexpected keys: {list(load_result.unexpected_keys)}")


# Create model architecture first
image_model_for_fusion = models.efficientnet_b3(weights=models.EfficientNet_B3_Weights.DEFAULT)
num_features = image_model_for_fusion.classifier[1].in_features
# Match saved model structure: the Linear head lives at classifier.1, so the
# head is a Sequential rather than a bare Linear.
image_model_for_fusion.classifier = nn.Sequential(
    nn.Dropout(p=0.3, inplace=True),
    nn.Linear(num_features, 1)
)
text_base_model = AutoModel.from_pretrained('indobenchmark/indobert-base-p1')
text_model = TextModelWithClassifier(text_base_model)
fusion_model = LateFusionModel(image_model_for_fusion, text_model)

# Load state_dict: prefer the local file, otherwise fetch it from the Hub.
model_path = "models/best_mlp_fusion_model_state_dict.pt"
if os.path.exists(model_path):
    _load_fusion_weights(
        fusion_model,
        model_path,
        "Fusion model loaded from local state_dict successfully!",
    )
else:
    print("Fusion model not found locally. Downloading from Hugging Face Hub...")
    model_path = hf_hub_download(repo_id="azzandr/gambling-fusion-model", filename="best_mlp_fusion_model_state_dict.pt")
    _load_fusion_weights(
        fusion_model,
        model_path,
        "Fusion model downloaded and loaded successfully!",
    )

fusion_model.to(device)
fusion_model.eval()
print("Fusion model ready!")
# Load Image-Only Model
# Resolve the checkpoint location first (local file, else Hub download), then
# build and load the model once instead of repeating the steps per branch.
image_model_path = "models/best_image_model_Adam_lr0.0001_bs32_state_dict.pt"
if os.path.exists(image_model_path):
    _image_model_ready_message = "Image-only model loaded from state_dict successfully!"
else:
    print("Image-only model not found locally. Downloading from Hugging Face Hub...")
    image_model_path = hf_hub_download(repo_id="azzandr/gambling-image-model", filename="best_image_model_Adam_lr0.0001_bs32_state_dict.pt")
    _image_model_ready_message = "Image-only model downloaded and loaded successfully!"

image_only_model = models.efficientnet_b3(weights=models.EfficientNet_B3_Weights.DEFAULT)
num_features = image_only_model.classifier[1].in_features
# Unlike the fusion image branch, this checkpoint uses a bare Linear head.
image_only_model.classifier = nn.Linear(num_features, 1)
# NOTE(security): torch.load unpickles the file, and the checkpoint may have
# been downloaded from the Hub. If the installed PyTorch supports it, passing
# weights_only=True would restrict what unpickling can do.
image_only_model.load_state_dict(torch.load(image_model_path, map_location=device))
image_only_model.to(device)
image_only_model.eval()
print(_image_model_ready_message)
# --- Functions ---
def clean_text(text):
    """Normalise OCR text and return it, or "" when too little remains.

    Steps, in order: remove URLs, turn newlines and every character other
    than ASCII letters/apostrophes into spaces, collapse whitespace, lowercase,
    drop words of one or two characters (except "di", "ke", "ya"), remove
    all-vowel words, all-consonant words and words of 20+ characters.

    Returns:
        The cleaned text, or an empty string when fewer than five words are
        left (the caller is expected to fall back to image-only prediction).
    """
    short_words_kept = {"di", "ke", "ya"}

    # ----- BASIC CLEANING -----
    basic_steps = (
        (r"http\S+", ""),         # remove URLs
        (r"\n", " "),             # replace newlines with spaces
        (r"[^a-zA-Z']", " "),     # keep only letters and apostrophes
        (r"\s{2,}", " "),         # collapse repeated whitespace
    )
    for pattern, replacement in basic_steps:
        text = re.sub(pattern, replacement, text)
    text = text.strip().lower()

    # ----- FILTERING -----
    # Keep words longer than two letters, plus the listed short words.
    kept_words = []
    for word in text.split():
        if word in short_words_kept or len(word) > 2:
            kept_words.append(word)
    text = ' '.join(kept_words)

    # ----- REMOVE UNWANTED PATTERNS -----
    noise_patterns = (
        r'\b[aeiou]+\b',        # words made only of vowels (any length)
        r'\b[^aeiou\s]+\b',     # words made only of consonants (any length)
        r'\b\w{20,}\b',         # very long words (20+ characters)
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, '', text)
    text = re.sub(r'\s+', ' ', text).strip()   # tidy leftover spaces

    # check words number
    word_count = len(text.split())
    if word_count < 5:
        print(f"Cleaned text too short ({word_count} words). Ignoring text.")
        return ""   # empty return to use image-only
    return text
# API key for the screenshot service, taken from the environment
# (None when the variable is not set).
SCREENSHOT_API_KEY = os.environ.get("SCREENSHOT_API_KEY")

# Constants for screenshot configuration.
# Phrases searched for (case-insensitively) in a failed screenshot response to
# decide whether to retry with different parameters.
CLOUDFLARE_CHECK_KEYWORDS = [
    "Checking your browser",
    "Just a moment",
    "Cloudflare",
]
def ensure_http(url):
    """Return ``url`` with a scheme, defaulting to ``http://``.

    Surrounding whitespace is removed first. The scheme check ignores case,
    so an input such as ``HTTPS://example.com`` is returned unchanged instead
    of being prefixed a second time.
    """
    url = url.strip()
    if url.lower().startswith(('http://', 'https://')):
        return url
    return 'http://' + url
def sanitize_filename(url, max_length=200):
    """Turn ``url`` into a string that is safe to use as a file name.

    Every character other than word characters, ``-``, ``_``, ``.`` and space
    is replaced by ``_``. Results longer than ``max_length`` characters are
    cut and suffixed with a short hash of the full URL, so long URLs neither
    exceed file-name length limits nor collide after truncation.

    Args:
        url: Text to sanitise.
        max_length: Maximum length, in characters, of the returned name.
            Values below 13 leave no room for anything but the hash suffix.

    Returns:
        The sanitised name (without extension).
    """
    import hashlib

    safe = re.sub(r'[^\w\-_\. ]', '_', url)
    if len(safe) <= max_length:
        return safe
    # Hash the original URL (not the sanitised text) so URLs that differ only
    # in replaced characters still get distinct names.
    digest = hashlib.sha1(url.encode('utf-8', errors='replace')).hexdigest()[:12]
    return f"{safe[:max_length - len(digest) - 1]}_{digest}"
def _save_image_response(response, filepath):
    """Write ``response`` to ``filepath`` if it is an HTTP 200 image; return success."""
    is_image = response.headers.get('content-type', '').startswith('image')
    if response.status_code != 200 or not is_image:
        return False
    with open(filepath, 'wb') as f:
        f.write(response.content)
    return True


def take_screenshot(url, timeout=90):
    """Capture ``url`` through the ApiFlash API and save it as a PNG.

    If the first attempt fails and the error body mentions one of
    ``CLOUDFLARE_CHECK_KEYWORDS``, one retry is made with a longer delay and
    ``wait_until="load"``.

    Args:
        url: Page to capture; a scheme is added when missing.
        timeout: Seconds to wait for each HTTP request. Without it a stalled
            connection would block the request handler indefinitely.

    Returns:
        Path of the saved screenshot inside ``SCREENSHOT_DIR``, or ``None``
        when the API key is missing, the API reports an error, the response is
        not an image, or any exception occurs.
    """
    url = ensure_http(url)
    filename = sanitize_filename(url) + '.png'
    filepath = os.path.join(SCREENSHOT_DIR, filename)
    # Best-effort boundary: any failure is logged and reported as None so the
    # UI can show an error instead of crashing.
    try:
        if not SCREENSHOT_API_KEY:
            print("SCREENSHOT_API_KEY not found in environment.")
            return None
        api_url = "https://api.apiflash.com/v1/urltoimage"
        # Base parameters - only using supported parameters
        params = {
            "access_key": SCREENSHOT_API_KEY,
            "url": url,
            "format": "png",
            "wait_until": "network_idle",
            "delay": 2,
            "fail_on_status": "400,401,402,403,404,500,502,503,504",
            "fresh": "true",  # Don't use cached version
            "response_type": "image",
            "wait_for": "body"  # Wait for body to be present
        }
        print(f"Taking screenshot of: {url}")
        response = requests.get(api_url, params=params, timeout=timeout)
        if response.status_code == 200:
            # A 200 that is not an image is treated as a failure.
            if _save_image_response(response, filepath):
                print(f"Screenshot taken successfully for URL: {url}")
                return filepath
            print("API returned non-image content")
            return None

        error_msg = response.text
        print(f"Screenshot failed: {error_msg}")
        # Check for Cloudflare detection
        if any(keyword.lower() in error_msg.lower() for keyword in CLOUDFLARE_CHECK_KEYWORDS):
            print("Cloudflare challenge detected, retrying with different parameters...")
            # Retry with different parameters for Cloudflare
            params.update({
                "wait_until": "load",
                "delay": 5
            })
            response = requests.get(api_url, params=params, timeout=timeout)
            if _save_image_response(response, filepath):
                print("Screenshot taken successfully after Cloudflare retry")
                return filepath
        return None
    except Exception as e:
        print(f"Error taking screenshot: {e}")
        return None
def resize_if_needed(image_path, max_mb=1, target_height=720):
    """Shrink an oversized image file in place.

    Nothing happens unless the file is larger than ``max_mb`` megabytes and
    the image is taller than ``target_height`` pixels. In that case it is
    resized to ``target_height`` (width scaled proportionally) and written
    back to the same path. Errors while opening, resizing or saving are
    printed, not raised.

    Args:
        image_path: Path of the image file; overwritten when resized.
        max_mb: Size threshold in megabytes.
        target_height: Height in pixels to resize to.
    """
    size_mb = os.path.getsize(image_path) / (1024 * 1024)   # size in MB
    if size_mb <= max_mb:
        return

    try:
        with Image.open(image_path) as img:
            width, height = img.size
            if height <= target_height:
                return
            scale = target_height / float(height)
            new_width = int(float(width) * scale)
            shrunk = img.resize((new_width, target_height), Image.Resampling.LANCZOS)
            shrunk.save(image_path, optimize=True, quality=85)
            print(f"Image resized to {new_width}x{target_height}")
    except Exception as e:
        print(f"Resize error: {e}")
def easyocr_extract(image_path):
    """Run the module-level EasyOCR ``reader`` on an image file.

    Returns:
        The recognised text fragments joined by single spaces and stripped,
        or "" if anything goes wrong (the error is printed).
    """
    try:
        # detail=0 makes readtext return plain strings.
        fragments = reader.readtext(image_path, detail=0)
        joined = " ".join(fragments)
        print(f"OCR text extracted from EasyOCR: {len(joined)} characters")
    except Exception as e:
        print(f"EasyOCR error: {e}")
        return ""
    else:
        return joined.strip()
| # def extract_text_from_image(image_path): | |
| # print("Skipping OCR. Forcing Image-Only prediction.") | |
| # return "" | |
def _ocr_space_extract(image_path, api_key, timeout):
    """Return the text OCR.Space parses from the image, or None on any failure."""
    try:
        with open(image_path, 'rb') as f:
            payload = {
                'isOverlayRequired': False,
                'apikey': api_key,
                'language': 'eng'
            }
            r = requests.post('https://api.ocr.space/parse/image',
                              files={'filename': f},
                              data=payload,
                              timeout=timeout)
        result = r.json()
        if result.get('IsErroredOnProcessing', False):
            print(f"OCR.Space API Error: {result.get('ErrorMessage')}")
            return None
        # A missing or null ParsedText is treated as "no text found".
        return result['ParsedResults'][0]['ParsedText'] or ""
    except (requests.RequestException, OSError, ValueError, KeyError,
            IndexError, TypeError, AttributeError) as e:
        # Network trouble, non-JSON body or an unexpected response layout.
        print(f"OCR.Space request failed: {e}")
        return None


def extract_text_from_image(image_path, timeout=60):
    """Extract text from a screenshot via OCR.Space, falling back to EasyOCR.

    The image is first shrunk if it is oversized. Files under 1 MB are sent to
    the OCR.Space API when ``OCR_SPACE_API_KEY`` is set; larger files, a
    missing key, an API-reported error, or a failed/malformed API response all
    fall back to the local EasyOCR reader.

    Args:
        image_path: Path of the image file (may be resized in place).
        timeout: Seconds to wait for the OCR.Space HTTP request.

    Returns:
        The extracted text with surrounding whitespace removed, or "" when
        nothing could be extracted.
    """
    try:
        resize_if_needed(image_path, max_mb=1, target_height=720)  # shrink first
        file_size = os.path.getsize(image_path) / (1024 * 1024)  # size in MB
        if file_size >= 1:
            print(f"Using EasyOCR for image ({file_size:.2f} MB)")
            return easyocr_extract(image_path)

        print(f"Using OCR.Space API for image ({file_size:.2f} MB)")
        api_key = os.getenv("OCR_SPACE_API_KEY")
        if not api_key:
            print("OCR_SPACE_API_KEY not found in environment. Using EasyOCR as fallback.")
            return easyocr_extract(image_path)

        text = _ocr_space_extract(image_path, api_key, timeout)
        if text is None:
            return easyocr_extract(image_path)
        print(f"OCR text extracted from OCR.Space: {len(text)} characters")
        return text.strip()
    except Exception as e:
        print(f"OCR error: {e}")
        return ""
def prepare_data_for_model(image_path, text):
    """Build the fusion model's inputs from a screenshot and its OCR text.

    Args:
        image_path: Path of the screenshot image.
        text: Raw OCR text; it is cleaned with ``clean_text`` before
            tokenisation.

    Returns:
        ``(image_tensor, input_ids, attention_mask)``, all moved to
        ``device``. The image tensor has a leading batch dimension of 1; the
        token tensors are padded/truncated to 128 tokens.
    """
    # Open inside a context manager so the file handle is released as soon as
    # the pixels have been converted to a tensor.
    with Image.open(image_path) as image:
        image_tensor = transform(image).unsqueeze(0).to(device)

    clean_text_data = clean_text(text)
    encoding = tokenizer.encode_plus(
        clean_text_data,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    return image_tensor, input_ids, attention_mask
def _image_only_prediction(url, screenshot_path, raw_text, threshold=0.6):
    """Classify a screenshot with the image-only model and build the UI outputs."""
    with Image.open(screenshot_path) as image:
        image_tensor = transform(image).unsqueeze(0).to(device)
    with torch.no_grad():
        image_logits = image_only_model(image_tensor).squeeze(1)
        image_probs = torch.sigmoid(image_logits)

    is_gambling = bool(image_probs[0] > threshold)
    gambling_prob = image_probs[0].item()
    non_gambling_prob = 1 - gambling_prob
    verdict = 'Gambling' if is_gambling else 'Non-Gambling'
    label_dict = {
        "Gambling": gambling_prob,
        "Non-Gambling": non_gambling_prob
    }
    confidence = gambling_prob if is_gambling else non_gambling_prob
    confidence_md = f"**Confidence:** {confidence:.1%}\n\n**Model Used:** Image-Only Model (EfficientNet-B3)\n\n**Prediction:** {verdict}"
    model_info = f"**Model Type:** Image-Only\n**Architecture:** EfficientNet-B3\n**Gambling Probability:** {gambling_prob:.1%}\n**Non-Gambling Probability:** {non_gambling_prob:.1%}"
    print(f"[Image-Only] URL: {url}")
    print(f"Prediction: {verdict} | Confidence: {confidence:.2f}\n")
    # No cleaned text is shown because none was used.
    return label_dict, confidence_md, screenshot_path, raw_text, "", model_info


def _fusion_prediction(url, screenshot_path, raw_text, cleaned_text, threshold=0.6):
    """Classify a screenshot plus its OCR text with the fusion model."""
    # prepare_data_for_model cleans the text itself, so it gets the raw text.
    image_tensor, input_ids, attention_mask = prepare_data_for_model(screenshot_path, raw_text)
    with torch.no_grad():
        fused_logits, image_logits, text_logits, _ = fusion_model(image_tensor, input_ids, attention_mask)
        fused_probs = torch.sigmoid(fused_logits)
        image_prob = torch.sigmoid(image_logits)[0].item()
        text_prob = torch.sigmoid(text_logits)[0].item()

    is_gambling = bool(fused_probs[0] > threshold)
    gambling_prob = fused_probs[0].item()
    non_gambling_prob = 1 - gambling_prob
    verdict = 'Gambling' if is_gambling else 'Non-Gambling'
    label_dict = {
        "Gambling": gambling_prob,
        "Non-Gambling": non_gambling_prob
    }
    confidence = gambling_prob if is_gambling else non_gambling_prob
    confidence_md = f"**Confidence:** {confidence:.1%}\n\n**Model Used:** Fusion Model (Image + Text)\n\n**Prediction:** {verdict}"
    model_info = (
        "**Model Type:** Fusion Model (MLP)\n"
        "**Image Model:** EfficientNet-B3\n"
        "**Text Model:** IndoBERT\n"
        "**Individual Predictions:**\n"
        f"- Image Model: {image_prob:.1%}\n"
        f"- Text Model: {text_prob:.1%}\n"
        f"- Fusion Result: {gambling_prob:.1%}"
    )
    # Log detail
    print(f"[Fusion Model] URL: {url}")
    print(f"Image Model Prediction Probability: {image_prob:.2f}")
    print(f"Text Model Prediction Probability: {text_prob:.2f}")
    print(f"Fusion Final Prediction: {verdict} | Confidence: {confidence:.2f}\n")
    return label_dict, confidence_md, screenshot_path, raw_text, cleaned_text, model_info


def predict_single_url(url):
    """Screenshot a URL, OCR it and classify it as gambling / non-gambling.

    The fusion model is used when the cleaned OCR text is non-empty. When OCR
    finds nothing, or ``clean_text`` rejects the text as too short, the
    image-only model is used instead. A probability above 0.6 is reported as
    "Gambling".

    Args:
        url: Website address entered by the user.

    Returns:
        A 6-tuple matching the Gradio outputs: label probabilities (dict),
        confidence markdown, screenshot path (``None`` on capture failure),
        raw OCR text, cleaned text, model-information markdown.
    """
    print(f"Processing URL: {url}")
    screenshot_path = take_screenshot(url)
    if not screenshot_path:
        error_label = {"Error": 1.0, "Non-Gambling": 0.0, "Gambling": 0.0}
        error_msg = f"**Error:** Unable to capture screenshot for `{url}`\n\n**Possible reasons:**\n• Too many redirects\n• Website blocking automated access\n• Network connectivity issues\n• Invalid URL"
        return error_label, error_msg, None, "", "", "**Model:** Screenshot capture failed"

    raw_text = extract_text_from_image(screenshot_path)  # kept for display
    # Decide on the cleaned text, not the raw text: clean_text returns ""
    # specifically to request the image-only path.
    cleaned_text = clean_text(raw_text) if raw_text.strip() else ""
    if not cleaned_text:
        print(f"No usable OCR text found for {url}. Using Image-Only Model.")
        return _image_only_prediction(url, screenshot_path, raw_text)
    return _fusion_prediction(url, screenshot_path, raw_text, cleaned_text)
def _read_uploaded_text(file_obj):
    """Return the text content of an uploaded file given as a path or a file object."""
    if isinstance(file_obj, (str, os.PathLike)):
        path = file_obj
    else:
        # Temp-file wrappers expose the on-disk location as `.name`.
        path = getattr(file_obj, "name", None)
    if isinstance(path, (str, os.PathLike)) and os.path.isfile(path):
        with open(path, "rb") as fh:
            raw = fh.read()
    else:
        raw = file_obj.read()
    # utf-8-sig also accepts plain UTF-8 and drops a leading BOM, which would
    # otherwise end up glued to the first URL.
    return raw.decode("utf-8-sig") if isinstance(raw, bytes) else raw


def predict_batch_urls(file_obj):
    """Run ``predict_single_url`` on every URL listed in an uploaded file.

    Args:
        file_obj: The uploaded file: a path string, an object with a ``name``
            attribute pointing at the file, or any object with ``read()``.
            ``None`` (no upload) yields an empty table.
            NOTE(review): which of these Gradio passes depends on the Gradio
            version and the File component's ``type`` setting - confirm.

    Returns:
        A DataFrame with one row per non-blank line and the columns ``url``,
        ``label``, ``confidence``, ``screenshot_path``, ``raw_text`` and
        ``cleaned_text``, holding the values returned by
        ``predict_single_url``.
    """
    columns = ["url", "label", "confidence", "screenshot_path", "raw_text", "cleaned_text"]
    if file_obj is None:
        print("No file provided for batch prediction.")
        return pd.DataFrame([], columns=columns)

    content = _read_uploaded_text(file_obj)
    urls = [line.strip() for line in content.splitlines() if line.strip()]

    results = []
    for url in urls:
        # predict_single_url returns six values; the last one (model info
        # markdown) is not part of the batch table.
        label, confidence, screenshot_path, raw_text, cleaned_text, _model_info = predict_single_url(url)
        results.append({
            "url": url,
            "label": label,
            "confidence": confidence,
            "screenshot_path": screenshot_path,
            "raw_text": raw_text,
            "cleaned_text": cleaned_text
        })

    df = pd.DataFrame(results, columns=columns)
    print(f"Batch prediction completed for {len(urls)} URLs.")
    return df
# --- Gradio App ---
# Custom CSS for professional styling.
# The string is handed to gr.Blocks(css=...) when the app is built. Of the
# classes defined here, only .main-header is referenced by markup in this file
# (the header HTML); .result-card, .info-box, .success-box and .warning-box
# are not referenced anywhere in the visible code.
# NOTE(review): .gradio-container presumably targets Gradio's own root
# element to cap and centre the page width - confirm against the Gradio
# version in use.
custom_css = """
.main-header {
text-align: center;
padding: 2rem 0;
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
color: white;
border-radius: 10px;
margin-bottom: 2rem;
}
.main-header h1 {
margin: 0;
font-size: 2.5rem;
font-weight: 700;
}
.main-header p {
margin: 0.5rem 0 0 0;
font-size: 1.1rem;
opacity: 0.9;
}
.result-card {
background: #f8f9fa;
padding: 1.5rem;
border-radius: 10px;
border: 2px solid #e9ecef;
margin: 1rem 0;
}
.info-box {
background: #e7f3ff;
padding: 1rem;
border-radius: 8px;
border-left: 4px solid #2196F3;
margin: 1rem 0;
}
.success-box {
background: #d4edda;
border-left-color: #28a745;
}
.warning-box {
background: #fff3cd;
border-left-color: #ffc107;
}
.gradio-container {
max-width: 1200px;
margin: 0 auto;
}
"""
# Build the UI. Components appear on the page in the order they are created
# inside the nested layout contexts, so statement order here is the layout.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css, title="Gambling Website Detector") as app:
    # Header Section (styled by the .main-header rules in custom_css)
    with gr.Row():
        gr.HTML("""
<div class="main-header">
<h1>Gambling Website Detection System</h1>
<p>AI-Powered URL Analysis using Deep Learning Fusion Model</p>
</div>
""")
    # Info Section
    with gr.Row():
        gr.Markdown("""
### About This Tool
This advanced detection system uses a **fusion model** combining:
- **Image Analysis**: EfficientNet-B3 for visual content detection
- **Text Analysis**: IndoBERT for Indonesian text understanding
- **Fusion Learning**: Intelligent combination of both modalities
Simply enter a website URL to analyze whether it contains gambling-related content.
""")
    with gr.Tabs():
        # Tab 1: analyse one URL typed by the user.
        with gr.Tab("Single URL Analysis", id="single"):
            with gr.Row():
                with gr.Column(scale=2):
                    gr.Markdown("### Enter Website URL")
                    url_input = gr.Textbox(
                        label="Website URL",
                        placeholder="https://example.com",
                        info="Enter the full URL of the website you want to analyze",
                        lines=1
                    )
                    predict_button = gr.Button(
                        "Analyze Website",
                        variant="primary",
                        size="lg"
                    )
            gr.Markdown("---")
            # Results Section
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Detection Results")
                    label_output = gr.Label(
                        label="Prediction Result",
                        value={"Gambling": 0.0, "Non-Gambling": 0.0},
                        num_top_classes=2
                    )
                    confidence_output = gr.Markdown(
                        value="**Confidence:** Waiting for analysis...",
                        label="Confidence Score"
                    )
                    model_info_output = gr.Markdown(
                        value="",
                        label="Model Information"
                    )
                with gr.Column(scale=1):
                    gr.Markdown("### Website Screenshot")
                    screenshot_output = gr.Image(
                        label="Captured Screenshot",
                        type="filepath",
                        height=400
                    )
            gr.Markdown("---")
            # Text Analysis Section (collapsed by default)
            with gr.Accordion("Text Analysis Details", open=False):
                with gr.Row():
                    with gr.Column():
                        gr.Markdown("#### Raw OCR Text")
                        raw_text_output = gr.Textbox(
                            label="Extracted Text (Raw)",
                            lines=8,
                            interactive=False,
                            placeholder="Raw text extracted from the screenshot will appear here..."
                        )
                    with gr.Column():
                        gr.Markdown("#### Processed Text")
                        cleaned_text_output = gr.Textbox(
                            label="Cleaned Text (Processed)",
                            lines=8,
                            interactive=False,
                            placeholder="Processed and cleaned text will appear here..."
                        )
            # The order of `outputs` must match the order of the six values
            # returned by predict_single_url: label, confidence, screenshot,
            # raw text, cleaned text, model info.
            predict_button.click(
                fn=predict_single_url,
                inputs=url_input,
                outputs=[
                    label_output,
                    confidence_output,
                    screenshot_output,
                    raw_text_output,
                    cleaned_text_output,
                    model_info_output
                ]
            )
        # Tab 2: analyse every URL listed in an uploaded .txt file.
        with gr.Tab("Batch URL Analysis", id="batch"):
            gr.Markdown("""
### Batch Processing
Upload a text file containing multiple URLs (one per line) to analyze them all at once.
The results will be displayed in a table format.
""")
            with gr.Row():
                with gr.Column():
                    file_input = gr.File(
                        label="Upload URL File (.txt)",
                        file_types=[".txt"]
                    )
                    gr.Markdown("**Tip:** Upload a .txt file with one URL per line")
                    batch_predict_button = gr.Button(
                        "Process Batch",
                        variant="primary",
                        size="lg"
                    )
            gr.Markdown("---")
            with gr.Row():
                gr.Markdown("### Batch Results")
            batch_output = gr.DataFrame(
                label="Analysis Results",
                wrap=True,
                interactive=False
            )
            # predict_batch_urls receives the uploaded file and returns the
            # DataFrame shown in the results table.
            batch_predict_button.click(
                fn=predict_batch_urls,
                inputs=file_input,
                outputs=batch_output
            )
    # Footer
    gr.Markdown("---")
    gr.Markdown("""
<div style="text-align: center; color: #666; padding: 1rem;">
<p>Powered by PyTorch • Gradio • EfficientNet • IndoBERT</p>
<p style="font-size: 0.9rem;">This tool is for educational and research purposes only</p>
</div>
""")
# Start the Gradio server as soon as the module is executed.
app.launch()