File size: 5,281 Bytes
165189d
627d59b
165189d
1fb1e3b
5253b0d
1fb1e3b
3565497
306ab5e
165189d
5253b0d
3565497
5253b0d
306ab5e
4decfa0
 
5253b0d
4decfa0
5253b0d
 
 
4decfa0
 
 
3565497
9cd6aba
3565497
306ab5e
5253b0d
3565497
5253b0d
306ab5e
3565497
306ab5e
4decfa0
306ab5e
3565497
 
4decfa0
 
306ab5e
 
 
 
 
 
 
 
 
 
 
 
 
4decfa0
 
306ab5e
4decfa0
0e3d6b3
4decfa0
3565497
 
306ab5e
 
 
 
 
 
 
 
4decfa0
3565497
306ab5e
4decfa0
 
0e3d6b3
 
f745b5c
 
0e3d6b3
f745b5c
3565497
306ab5e
4decfa0
3565497
0e3d6b3
 
 
 
 
 
f745b5c
0e3d6b3
f745b5c
0e3d6b3
3565497
4decfa0
9cd6aba
0e3d6b3
165189d
9cd6aba
5253b0d
 
 
27c0f8e
3565497
306ab5e
 
 
 
27c0f8e
306ab5e
 
 
 
 
 
 
 
b18ef1e
5253b0d
 
3c36d2f
971f1e0
3c36d2f
 
 
 
27c0f8e
306ab5e
 
 
 
 
27c0f8e
17f5b16
306ab5e
3c36d2f
 
 
 
 
 
 
3565497
3c36d2f
7caebc5
 
27c0f8e
165189d
 
306ab5e
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import base64
import io
import gradio as gr
from fastmcp import Client
from fastmcp.client import StreamableHttpTransport
import asyncio
import ast
import json

# -------------------------------
# MCP server info
# -------------------------------
# Identifier sent as "robot_id_input" with every tool call.
ROBOT_ID = "Robot_MCP_Client"
# Endpoint handed to the streamable-HTTP transport below.
MCP_SERVER_URL = "https://oppaai-robot-mcp-server.hf.space/gradio_api/mcp/"
# Passed as the `name` argument when constructing the client.
SERVER_NAME = "Robot_MCP_Server"
# Name of the remote tool invoked for each webcam frame.
TOOL_NAME = "Robot_MCP_Server_robot_watch"

# -------------------------------
# Initialize MCP client globally
# -------------------------------
# One transport/client pair is built at import time and shared by every call
# to process_webcam_stream_async, which enters the client with `async with`
# for each request.
HTTP_TRANSPORT = StreamableHttpTransport(url=MCP_SERVER_URL)
MCP_CLIENT = Client(transport=HTTP_TRANSPORT, name=SERVER_NAME)

# -------------------------------
# Async function using user's HF token
# -------------------------------
def _status_row(message=""):
    """Return the 8-value output tuple with *message* first and the rest blank."""
    return (message, "", "", "", "", "", "", "")


def _first_text(content):
    """Return the ``text`` of the first text-bearing content item, or ``None``.

    *content* may be a single content object or a list/tuple of them.
    """
    if content is None:
        return None
    items = content if isinstance(content, (list, tuple)) else [content]
    for item in items:
        text = getattr(item, "text", None)
        if text is not None:
            return text
    return None


def _parse_result_dict(raw_text):
    """Decode the tool's text reply and return its ``"result"`` mapping.

    Raises:
        ValueError: if the reply is empty, or the decoded payload or its
            ``"result"`` entry is not a dict.
    """
    if not raw_text:
        raise ValueError("MCP tool returned no text content")
    try:
        decoded = json.loads(raw_text)
    except json.JSONDecodeError:
        # Fallback if the server returns a Python-literal dict (single quotes).
        decoded = ast.literal_eval(raw_text)
    if not isinstance(decoded, dict):
        raise ValueError(
            f"Unexpected MCP response type: {type(decoded).__name__}"
        )
    result = decoded.get("result", {})
    if not isinstance(result, dict):
        raise ValueError(
            f"Unexpected 'result' type in MCP response: {type(result).__name__}"
        )
    return result


def _encode_jpeg_b64(image):
    """Return *image* encoded as JPEG and then as a Base64 ASCII string."""
    # JPEG cannot store alpha or palette modes; normalise those to RGB first
    # so that save() does not fail on e.g. RGBA frames.
    if image.mode not in ("RGB", "L"):
        image = image.convert("RGB")
    buffered = io.BytesIO()
    image.save(buffered, format="JPEG")
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


async def process_webcam_stream_async(image, oauth_token: gr.OAuthToken | None = None):
    """
    Send a webcam image to the MCP server using the user's HF token.

    Args:
        image: Frame to analyse (an object with PIL-style ``mode``/``save``),
            or ``None`` when no frame is available yet.
        oauth_token: Token object for the logged-in user; ``None`` when the
            user has not logged in.

    Returns:
        An 8-tuple of values for the output textboxes, in this order:
        description, environment, indoor/outdoor, lighting condition, human,
        animals, objects (comma-joined), hazards. On failure, or when the
        user is not logged in, the first element carries a message and the
        remaining seven are empty strings. No exception escapes: every
        ``Exception`` is printed and reported through the first element.
    """
    # 1. CHECK LOGIN: If no token, ask user to log in
    if oauth_token is None:
        return _status_row("Please log in using the button above.")

    # 2. CHECK IMAGE: If camera hasn't loaded yet
    if image is None:
        return _status_row()

    try:
        # 3./4. PREPARE PAYLOAD: Base64 JPEG plus the user's token
        payload = {
            "hf_token_input": oauth_token.token,
            "robot_id_input": ROBOT_ID,
            "image_b64_input": _encode_jpeg_b64(image),
        }

        # 5. CALL MCP SERVER (hold the connection only for the call itself)
        async with MCP_CLIENT:
            response = await MCP_CLIENT.call_tool(TOOL_NAME, payload)

        # The result's `content` is a list of content blocks in fastmcp's
        # CallToolResult, so `.content.text` would fail. Tolerate a bare
        # list/object as well in case another client version returns one
        # (NOTE(review): confirm against the pinned fastmcp version).
        content = getattr(response, "content", response)
        if getattr(response, "is_error", False):
            error_text = _first_text(content) or "Unknown error"
            raise RuntimeError(f"MCP Tool Error: {error_text}")

        # 6. PARSE RESPONSE (Handle both JSON and Python Dict strings)
        vlm_result = _parse_result_dict(_first_text(content))

        # 7. EXTRACT DATA
        objects_list = vlm_result.get("objects", [])
        if isinstance(objects_list, list):
            # str() each entry so non-string items do not break the join.
            objects_str = ", ".join(str(item) for item in objects_list)
        else:
            objects_str = str(objects_list)

        return (
            vlm_result.get("description", ""),
            vlm_result.get("environment", ""),
            vlm_result.get("indoor_or_outdoor", ""),
            vlm_result.get("lighting_condition", ""),
            vlm_result.get("human", ""),
            vlm_result.get("animals", ""),
            objects_str,
            vlm_result.get("hazards", ""),
        )

    except Exception as e:
        # UI boundary: report the failure in the first textbox instead of
        # letting the stream handler crash.
        print(f"Error calling MCP API: {e}")
        return _status_row(f"Error: {e}")


# -------------------------------
# Gradio UI
# -------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## 🎥 Robot Vision Webcam Stream (MCP Client)")

    # Login button: needed so the stream handler receives an oauth_token.
    gr.LoginButton()

    with gr.Row():
        # Left: webcam-only image input, delivered to the handler as PIL.
        webcam_input = gr.Image(
            type="pil",
            sources=["webcam"],
            label="Captured from Web-Cam",
        )

        # Right: one textbox per value returned by the handler.
        with gr.Column():
            description_out = gr.Textbox(lines=5, label="Description")
            environment_out = gr.Textbox(lines=3, label="Environment")
            indoor_outdoor_out = gr.Textbox(lines=1, label="Indoor/Outdoor")
            lighting_condition_out = gr.Textbox(lines=1, label="Lighting Condition")
            human_out = gr.Textbox(lines=3, label="Human Detected")
            animals_out = gr.Textbox(lines=2, label="Animals Detected")
            objects_out = gr.Textbox(lines=2, label="Objects Detected")
            hazards_out = gr.Textbox(lines=2, label="Hazards Identified")

    # -------------------------------
    # Event Trigger
    # -------------------------------
    # Order here must match the tuple returned by process_webcam_stream_async.
    _result_boxes = [
        description_out,
        environment_out,
        indoor_outdoor_out,
        lighting_condition_out,
        human_out,
        animals_out,
        objects_out,
        hazards_out,
    ]

    # IMPORTANT: do NOT add gr.OAuthToken() to `inputs`.
    # Gradio injects the token automatically because the handler's signature
    # declares an `oauth_token: gr.OAuthToken | None` parameter.
    webcam_input.stream(
        process_webcam_stream_async,
        inputs=[webcam_input],
        outputs=_result_boxes,
        stream_every=1.0,
    )

# Launch the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()