import time import torch import pandas as pd import gradio as gr from transformers import ( InstructBlipProcessor, InstructBlipForConditionalGeneration ) # ===================================================== # MODEL # ===================================================== MODEL_ID = "MRaudhatul/instructblip-coco-captioning" print("Loading processor...") processor = InstructBlipProcessor.from_pretrained(MODEL_ID) print("Loading model...") model = InstructBlipForConditionalGeneration.from_pretrained( MODEL_ID, torch_dtype=torch.float32 ) model.eval() print("Model loaded successfully") # ===================================================== # DATASET STATS # ===================================================== TRAIN_IMAGES = 26613 VALID_IMAGES = 2958 TRAIN_CAPTIONS = 133142 VALID_CAPTIONS = 14794 # ===================================================== # METRICS # ===================================================== BLEU1 = 0.7798 BLEU2 = 0.6066 BLEU3 = 0.4547 BLEU4 = 0.3290 ROUGE_L = 0.5909 METEOR = 0.5790 CIDER = 0.9931 metrics_df = pd.DataFrame({ "Metric": ["BLEU-1","BLEU-2","BLEU-3","BLEU-4","ROUGE-L","METEOR","CIDEr"], "Score": [BLEU1, BLEU2, BLEU3, BLEU4, ROUGE_L, METEOR, CIDER] }) # ===================================================== # TASK PROMPTS # ===================================================== PROMPTS = { "Generate Caption": "Describe this image.", "Detailed Caption": "Describe this image in detail.", "Identify Main Objects": "What are the main objects in this image?", "Explain Scene": "Explain what is happening in this image." } # ===================================================== # CSS — fully responsive, mobile-first # ===================================================== css = """ /* ── reset & base ── */ *, *::before, *::after { box-sizing: border-box; } html, body { width: 100% !important; overflow-x: hidden !important; } .gradio-container { min-width: unset !important; width: 100% !important; max-width: 960px !important; margin: 0 auto !important; padding: 0 12px !important; } /* paksa semua elemen tidak overflow */ .block, .form, .panel { min-width: unset !important; width: 100% !important; } footer { display: none !important; } /* ── title block ── */ .main-title { text-align: center; font-size: clamp(22px, 5vw, 38px); font-weight: 700; margin: 16px 0 6px; line-height: 1.2; word-break: break-word; } .subtitle { text-align: center; font-size: clamp(13px, 3vw, 16px); color: #666; margin-bottom: 20px; } /* ── tab labels ── */ .tab-nav button { font-size: clamp(12px, 2.5vw, 15px) !important; padding: 8px 10px !important; } /* ── generate tab: stack on mobile, side-by-side on desktop ── */ .img-row { display: flex; flex-wrap: wrap; gap: 12px; } .img-row > * { flex: 1 1 280px; min-width: 0; } /* ── stats row ── */ .stats-row { display: flex; flex-wrap: wrap; gap: 10px; } .stats-row > * { flex: 1 1 130px; min-width: 0; } /* ── metrics row ── */ .metrics-row { display: flex; flex-wrap: wrap; gap: 10px; margin-bottom: 12px; } .metrics-row > * { flex: 1 1 100px; min-width: 0; } /* ── inputs & buttons ── */ button.primary { width: 100% !important; font-size: clamp(14px, 3vw, 16px) !important; padding: 10px 16px !important; } /* ── labels: prevent overflow ── */ label span { white-space: normal !important; word-break: break-word !important; font-size: clamp(11px, 2.5vw, 14px) !important; } /* ── dataframe: horizontal scroll on small screens ── */ .gr-dataframe, .svelte-table-wrap, table { overflow-x: auto !important; display: block !important; width: 100% !important; font-size: clamp(11px, 2.5vw, 14px) !important; } /* ── confidence / time row ── */ .result-row { display: flex; flex-wrap: wrap; gap: 10px; } .result-row > * { flex: 1 1 140px; min-width: 0; } /* ── markdown content ── */ .gr-markdown p, .gr-markdown li { font-size: clamp(13px, 2.8vw, 15px) !important; line-height: 1.6 !important; } .gr-markdown h2 { font-size: clamp(16px, 4vw, 22px) !important; } .gr-markdown h3 { font-size: clamp(14px, 3vw, 18px) !important; } /* ── image components ── */ .gr-image img { max-width: 100% !important; height: auto !important; } /* ── small screens ── */ @media (max-width: 480px) { .gradio-container { padding: 0 8px !important; } .tab-nav button { font-size: 11px !important; padding: 6px 6px !important; } } """ # ===================================================== # SHOW/HIDE CUSTOM PROMPT # ===================================================== def toggle_prompt(task): if task == "Custom Prompt": return gr.update(visible=True) return gr.update(visible=False) # ===================================================== # INFERENCE # ===================================================== def generate_response(image, task, custom_prompt): if image is None: return (None, "Please upload an image.", "-", "-") if task == "Custom Prompt": prompt = custom_prompt.strip() if len(prompt) == 0: prompt = "Describe this image." else: prompt = PROMPTS[task] start = time.time() inputs = processor(images=image, text=prompt, return_tensors="pt") with torch.no_grad(): outputs = model.generate( **inputs, max_new_tokens=60, output_scores=True, return_dict_in_generate=True ) generated_text = processor.batch_decode( outputs.sequences, skip_special_tokens=True )[0] # ── confidence score ── token_confidences = [] for score in outputs.scores: probs = torch.softmax(score, dim=-1) max_prob = probs.max().item() token_confidences.append(max_prob) if len(token_confidences) > 0: confidence = (sum(token_confidences) / len(token_confidences)) * 100 else: confidence = 0 confidence = f"{confidence:.2f}%" inference_time = time.time() - start return (image, generated_text, confidence, f"{inference_time:.2f} sec") # ===================================================== # UI # ===================================================== with gr.Blocks( title="AI Image Captioning System", theme=gr.themes.Soft(primary_hue="blue", secondary_hue="sky"), css=css ) as demo: gr.HTML('