# app.py - Hugging Face Spaces app: gas-pipe testing compliance detector.
#
# Loads a LoRA-fine-tuned LLaVA v1.6 model once at startup, then serves a
# Gradio UI that samples 4 frames from an uploaded video, tiles them into a
# 2x2 grid image, and asks the model whether the testing procedure shows
# cheating (compliance violations).

import gradio as gr
import torch
import cv2
import numpy as np
from PIL import Image
from transformers import AutoProcessor, LlavaNextForConditionalGeneration, BitsAndBytesConfig
from peft import PeftModel
import tempfile
import os

# Model configuration
MODEL_ID = "llava-hf/llava-v1.6-mistral-7b-hf"
PEFT_MODEL_ID = "arjunanand13/gas_pipe_llava_finetunedv2"

# Frame-grid geometry: NUM_FRAMES tiles of TILE_SIZE x TILE_SIZE pixels
# arranged into a (2*TILE_SIZE) x (2*TILE_SIZE) grid image.
TILE_SIZE = 112
GRID_SIZE = 2 * TILE_SIZE
NUM_FRAMES = 4


def load_model():
    """Load the 4-bit-quantized base model and attach the LoRA adapters.

    Returns:
        tuple: (processor, model) — the HF processor and the PEFT-wrapped
        LLaVA model, ready for inference.
    """
    # NOTE: the original decorated this function with @torch.no_grad(),
    # which is misplaced — loading weights needs no grad context; it is
    # generation that should run without gradients (see predict_cheating).
    print("Loading PEFT model...")

    # NF4 double quantization keeps the 7B model within modest GPU memory.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_storage=torch.uint8,
    )

    # The processor (tokenizer + image preprocessor) is published alongside
    # the adapter repo, so load it from there.
    processor = AutoProcessor.from_pretrained(PEFT_MODEL_ID)

    # Load the quantized base model, sharded automatically across devices.
    base_model = LlavaNextForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
        device_map="auto",
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )

    # Apply the fine-tuned LoRA adapters on top of the quantized base.
    model = PeftModel.from_pretrained(base_model, PEFT_MODEL_ID)
    print("Model loaded successfully!")
    return processor, model


# Load model once at startup so every request reuses the same weights.
processor, model = load_model()


def _extract_frames(video_path, num_frames=NUM_FRAMES):
    """Sample `num_frames` evenly spaced RGB frames from a video file.

    Each frame is resized to TILE_SIZE x TILE_SIZE. If fewer frames can be
    decoded, the list is padded with copies of the last frame (or black
    frames if none decoded at all).

    Raises:
        ValueError: if the video cannot be opened.
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        raise ValueError(f"Cannot open video: {video_path}")
    frames = []
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # max(0, ...) guards against containers reporting 0 frames.
        frame_indices = np.linspace(0, max(0, total_frames - 1), num_frames, dtype=int)
        for frame_idx in frame_indices:
            cap.set(cv2.CAP_PROP_POS_FRAMES, int(frame_idx))
            ret, frame = cap.read()
            if ret:
                # OpenCV decodes BGR; PIL expects RGB.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame_rgb).resize((TILE_SIZE, TILE_SIZE)))
    finally:
        # Always release the capture, even if decoding raises
        # (the original leaked it on exceptions).
        cap.release()

    while len(frames) < num_frames:
        if frames:
            frames.append(frames[-1].copy())
        else:
            frames.append(Image.new('RGB', (TILE_SIZE, TILE_SIZE), color='black'))
    return frames[:num_frames]


def preprocess_video(video_path):
    """Extract 4 frames from the video and compose them into a 2x2 grid.

    Returns:
        PIL.Image.Image: a GRID_SIZE x GRID_SIZE RGB image (the model was
        fine-tuned on this 2x2 frame-grid representation).
    """
    frames = _extract_frames(video_path)
    grid_image = Image.new('RGB', (GRID_SIZE, GRID_SIZE))
    positions = [(0, 0), (TILE_SIZE, 0), (0, TILE_SIZE), (TILE_SIZE, TILE_SIZE)]
    for frame, pos in zip(frames, positions):
        grid_image.paste(frame, pos)
    return grid_image


def predict_cheating(video_file):
    """Analyze an uploaded video for gas-pipe-testing compliance.

    Args:
        video_file: path to the uploaded video (or None).

    Returns:
        tuple: (markdown result string, processed grid image or None).
        Errors are reported in the result string rather than raised, so the
        UI never crashes on a bad upload.
    """
    if video_file is None:
        return "Please upload a video file", None

    try:
        # Collapse the video into the 2x2 frame grid the model expects.
        grid_image = preprocess_video(video_file)

        # Forced-JSON prompt: seeding the answer with '{"cheating":' nudges
        # the model to complete with a bare true/false token.
        # NOTE(review): the <image> placeholder appears to have been stripped
        # from the original source by tag-removal; llava-v1.6-mistral prompts
        # require it for the image features to be inserted — confirm against
        # the training prompt.
        prompt = "[INST] <image>\nGas pipe test result? [/INST] {\"cheating\":"

        inputs = processor(text=prompt, images=grid_image, return_tensors="pt")
        # Move all tensors to the model's device; leave non-tensors alone.
        inputs = {k: v.to(model.device) if hasattr(v, 'to') else v for k, v in inputs.items()}

        # Greedy, short generation — we only need the true/false completion.
        # Inference runs without gradients (moved here from load_model).
        with torch.no_grad():
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=16,
                do_sample=False,
                pad_token_id=processor.tokenizer.eos_token_id,
            )

        # Decode only the newly generated tokens (skip the prompt echo).
        prompt_len = len(inputs['input_ids'][0])
        result = processor.decode(generated_ids[0][prompt_len:], skip_special_tokens=True)

        # Re-attach the forced prefix so the displayed output is valid JSON-ish.
        full_result = '{"cheating":' + result
        if "true" in result.lower():
            prediction = "CHEATING DETECTED"
            explanation = "The system detected violations in the testing procedure."
            status = "🚨"
        else:
            prediction = "COMPLIANT PROCEDURE"
            explanation = "The testing procedure appears to follow proper protocols."
            status = "✅"

        formatted_result = f"{status} **{prediction}**\n\n{explanation}\n\nModel output: {full_result}"
        return formatted_result, grid_image

    except Exception as e:
        # Surface the failure in the UI instead of crashing the request.
        return f"Error processing video: {str(e)}", None


# Gradio Interface
with gr.Blocks(
    title="Gas Pipe Quality Control Detection",
    theme=gr.themes.Soft(),
    css="""
    .gradio-container { max-width: 1200px; margin: auto; }
    """,
) as demo:
    # NOTE(review): the HTML markup inside these gr.HTML literals was stripped
    # during extraction; reconstructed from the surviving text — verify layout.
    gr.HTML("""
        <div style="text-align: center; margin-bottom: 1rem;">
            <h1>🔧 Gas Pipe Quality Control Detection</h1>
            <p>AI-powered detection of compliance violations in gas pipe testing procedures</p>
        </div>
    """)

    with gr.Row():
        with gr.Column(scale=2):
            video_input = gr.Video(
                label="Upload Gas Pipe Testing Video",
                height=320,
            )
            analyze_btn = gr.Button(
                "Analyze Video",
                variant="primary",
                size="lg",
            )
            gr.Markdown("""
            **Supported formats:** MP4, AVI, MOV, WMV
            **Max duration:** 60 seconds recommended
            """)

        with gr.Column(scale=2):
            result_text = gr.Textbox(
                label="Detection Result",
                lines=8,
                max_lines=12,
            )
            processed_image = gr.Image(
                label="Processed Video Frames (2×2 Grid)",
                height=320,
            )

    analyze_btn.click(
        fn=predict_cheating,
        inputs=[video_input],
        outputs=[result_text, processed_image],
    )

    gr.HTML("""
        <div style="text-align: center; margin-top: 1rem; font-size: 0.9em;">
            <p><b>Model:</b> Fine-tuned LLaVA v1.6 Mistral 7B with LoRA adapters</p>
            <p><b>Repository:</b> arjunanand13/gas_pipe_llava_finetunedv2</p>
            <p><b>Developed by:</b> Arjun Anand</p>
        </div>
    """)


if __name__ == "__main__":
    demo.launch()