File size: 4,835 Bytes
88c69e1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import gradio as gr
import spaces
import torch
import json
import re
import urllib.request
import os
from transformers import AutoProcessor, AutoModelForMultimodalLM

# Hugging Face checkpoint identifier for the multimodal model.
# NOTE(review): "google/gemma-4-E2B-it" does not match any published HF
# checkpoint name I can confirm — the "E2B" naming matches Gemma 3n
# ("google/gemma-3n-E2B-it"); verify before deploying.
MODEL_ID = "google/gemma-4-E2B-it"

print(f"Loading {MODEL_ID}...")
# Processor bundles the tokenizer plus the image/audio feature extractors
# needed by apply_chat_template below.
processor = AutoProcessor.from_pretrained(MODEL_ID)
# NOTE(review): AutoModelForMultimodalLM is not a class in released
# transformers versions I can confirm — Gemma 3n is normally loaded via
# AutoModelForImageTextToText; verify against the installed version.
# bfloat16 halves weight memory; device_map="auto" shards/places weights
# on whatever accelerator is available.
model = AutoModelForMultimodalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
print("Model loaded successfully.")

# One-time download of demo assets so gr.Examples has data to show.
os.makedirs("sample_data", exist_ok=True)
SAMPLE_IMAGE = "sample_data/car_damage.jpg"  # photo of vehicle damage
SAMPLE_AUDIO = "sample_data/driver_statement.wav"  # spoken driver statement

# Fetch the sample image only if it is not already cached locally.
if not os.path.exists(SAMPLE_IMAGE):
    urllib.request.urlretrieve(
        "https://www.driving.org/wp-content/uploads/2023/11/driver-hand-examining-dented-car-with-damaged-fend-2023-07-17-20-53-56-utc-e1699944140557.jpg",
        SAMPLE_IMAGE
    )

# Fetch the sample audio clip only if it is not already cached locally.
if not os.path.exists(SAMPLE_AUDIO):
    urllib.request.urlretrieve(
        "https://raw.githubusercontent.com/google-gemma/cookbook/refs/heads/main/apps/sample-data/journal1.wav",
        SAMPLE_AUDIO
    )

@spaces.GPU
def process_insurance_claim(image_path, audio_path):
    """Analyze one damage photo plus one spoken driver statement with the model.

    Builds a system+user chat with image, audio, and text parts, generates a
    structured claim report, and parses it as JSON.

    Args:
        image_path: Filesystem path to the vehicle-damage photo.
        audio_path: Filesystem path to the driver's recorded statement.

    Returns:
        dict: The parsed claim-report JSON on success, or a dict containing an
        "error" key (plus "raw_output" with the model text when parsing fails).
    """
    if not image_path or not audio_path:
        return {"error": "Both an image of the damage and an audio statement are required."}

    system_prompt = """You are an expert AI Auto Insurance Claim Adjuster. 
Your task is to analyze the provided image of vehicle damage and the audio statement from the driver. 
Cross-reference the audio description with the visual evidence.
You must output ONLY a valid JSON object. Do not include markdown formatting like ```json.
The JSON must strictly follow this schema:
{
  "damage_severity": "Low|Medium|High|Total Loss",
  "affected_parts": ["list", "of", "damaged", "car", "parts"],
  "driver_statement_summary": "Short 1-sentence summary of the audio transcript",
  "consistency_check": "Match|Mismatch",
  "flagged_for_review": true|false,
  "reasoning": "Brief explanation of why it matches or doesn't match the visual evidence."
}"""

    messages = [
        {
            "role": "system", 
            "content": [
                {"type": "text", "text": system_prompt}
            ]
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "url": image_path},
                {"type": "audio", "audio": audio_path},
                {"type": "text", "text": "Analyze this insurance claim and output the JSON report."}
            ]
        }
    ]

    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
        add_generation_prompt=True,
        enable_thinking=False
    ).to(model.device)

    # Prompt length in tokens — used to slice off the echo of the input below.
    input_len = inputs["input_ids"].shape[-1]

    # Inference only: disable autograd bookkeeping to save memory and time.
    with torch.inference_mode():
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            # Bug fix: transformers defaults to greedy decoding, which silently
            # ignores temperature/top_p/top_k — sampling must be enabled
            # explicitly for those parameters to take effect.
            do_sample=True,
            temperature=0.2,
            top_p=0.95,
            top_k=64
        )

    # Decode only the newly generated tokens, not the prompt.
    response = processor.decode(outputs[0][input_len:], skip_special_tokens=True)

    # Strip markdown code fences the model may emit despite instructions.
    clean_response = re.sub(r"^```(?:json)?\s*", "", response).strip()
    clean_response = re.sub(r"\s*```$", "", clean_response).strip()

    try:
        return json.loads(clean_response)
    except json.JSONDecodeError:
        # Fallback: the model sometimes wraps the JSON in extra prose — try
        # the outermost {...} span before declaring failure.
        brace_match = re.search(r"\{.*\}", clean_response, re.DOTALL)
        if brace_match:
            try:
                return json.loads(brace_match.group(0))
            except json.JSONDecodeError:
                pass
        return {
            "error": "Failed to parse JSON output.",
            "raw_output": response
        }

# Minimal styling: center the app and give the action button a blue accent.
css = """
#component-0 { max-width: 900px; margin: auto; }
.gr-button { background-color: #2563eb !important; color: white !important; }
"""

with gr.Blocks(theme=gr.themes.Soft(), css=css) as demo:
    # Header / usage instructions.
    gr.Markdown(
        """
        # 🚗 AI Auto Claim Adjuster (Gemma 4 E2B)
        Upload a photo of the vehicle damage alongside an audio statement from the driver describing the incident. 
        Gemma 4 E2B natively processes **both the audio wave and the image** simultaneously, transcribing the story, analyzing the visual damage, and outputting a structured JSON claim adjustment report.
        """
    )

    with gr.Row():
        # Left column: the two claim inputs and the trigger button.
        with gr.Column(scale=1):
            damage_image = gr.Image(type="filepath", label="1. Upload Vehicle Damage Image")
            statement_audio = gr.Audio(type="filepath", label="2. Upload Driver Audio Statement")
            analyze_button = gr.Button("Generate Claim Report", size="lg")

        # Right column: the structured report produced by the model.
        with gr.Column(scale=1):
            report_view = gr.JSON(label="Structured Claim JSON Output")

    # Wire the button to the claim-processing pipeline.
    analyze_button.click(
        fn=process_insurance_claim,
        inputs=[damage_image, statement_audio],
        outputs=[report_view]
    )

    # Pre-filled example row using the downloaded sample assets.
    gr.Examples(
        examples=[[SAMPLE_IMAGE, SAMPLE_AUDIO]],
        inputs=[damage_image, statement_audio],
        outputs=[report_view],
        fn=process_insurance_claim,
        cache_examples=False,
        label="Try Demo Example"
    )

# Start the Gradio server only when this file is executed directly
# (not when imported as a module).
if __name__ == "__main__":
    demo.launch()