Spaces:
Sleeping
Sleeping
| # define analysis engine | |
| from google.genai.types import Tool, GenerateContentConfig, GoogleSearch | |
| from google.genai import types | |
| from google import genai | |
| from io import BytesIO | |
| from PIL import Image | |
| import pandas as pd | |
| import gradio as gr | |
| import base64 | |
| import json | |
| import os | |
# Instruction text sent alongside every image. Kept at module level so the
# request logic in cv2net() stays short; the wording is sent to the model
# verbatim, so edit it with care.
_CV2NET_PROMPT = """
I want you to carefully analyze the image(s) and map the functional relationship between every single identified entity in the image.
Do not ignore small or partially visible items. Collect the following information from the image(s) and DO NOT include items, objects, or things that are not in the image(s):
- Specific object name or person
- Precise functional relationship verb
- Class: object, person, animal, environment, text, brand
- Primary function or role
- Dominant color
- Small, medium, large, tiny, huge
- Material type
- Location description
- Current condition
- Spatial context
- Setting or environment
- Relationship strength: strong, medium, weak
- Spatial context
- Scene context
- Confidence: high, medium, low
- Today's date (YYYY-MM-DD)
Ignore what a person in an image is wearing. Return the results as one JSON file with the following structure exactly:
```json
[
{
"Vertex1": "specific_object_name_or_person",
"Vertex2": "specific_object_name_or_person",
"Relationship": "precise_functional_relationship_verb",
"Vertex1_class": "Object|Person|Animal|Environment|Text|Brand",
"Vertex1_purpose": "primary_function_or_role",
"Vertex1_size": "small|medium|large|tiny|huge",
"Vertex1_position": "location_description",
"Vertex1_state": "current_condition",
"Vertex2_class": "Object|Person|Animal|Environment|Text|Brand",
"Vertex2_purpose": "primary_function_or_role",
"Vertex2_size": "small|medium|large|tiny|huge",
"Vertex2_position": "location_description",
"Vertex2_state": "current_condition",
"Relationship_type": "spatial|functional|contextual|interactive",
"Relationship_strength": "strong|medium|weak",
"Spatial_context": "detailed_spatial_description",
"Scene_context": "setting_or_environment",
"Confidence": "high|medium|low",
"Date": "today's_date"
}
]
```
Here is an example JSON output:
```json
[
{
"Vertex1": "Man",
"Vertex2": "Bench",
"Relationship": "Sits on",
"Vertex1_class": "Person",
"Vertex1_purpose": "Posing for photo",
"Vertex1_size": "Medium",
"Vertex1_position": "Left foreground",
"Vertex1_state": "Visible",
"Vertex2_class": "Object",
"Vertex2_purpose": "A seat",
"Vertex2_size": "Medium",
"Vertex2_position": "Middle ground",
"Vertex2_state": "Visible",
"Relationship_type": "Functional",
"Relationship_strength": "Strong",
"Spatial_context": "Man is sitting on bench",
"Scene_context": "Outdoor scene in the park",
"Confidence": "High",
"Date": "2025-07-16"
}
]
```
"""


def cv2net(image_path: str, api_key: str) -> "pd.DataFrame | None":
    """Ask Gemini to describe entity relationships in an image as a table.

    The image file is read from disk and sent, together with a fixed
    instruction prompt, to the ``gemini-2.0-flash`` model. The model's JSON
    reply is parsed and loaded into a DataFrame.

    Args:
        image_path: Path of the image file to analyze.
        api_key: API key used to authenticate the Gemini client.

    Returns:
        A DataFrame built from the parsed JSON reply, or ``None`` when the
        reply has no text or is not valid JSON (a message is printed in
        both cases).

    Raises:
        OSError: If the image file cannot be opened or read (for example
            ``FileNotFoundError`` for a missing path).

    Errors raised by the Gemini client call itself are not caught here and
    propagate to the caller.
    """
    # Function-scope import so this block does not depend on an edit to the
    # file's import section.
    import mimetypes

    # Read the image first: a bad path should fail fast on a local error
    # before any client is constructed.
    with open(image_path, "rb") as f:
        image_data = f.read()

    # Derive the MIME type from the file name instead of assuming JPEG;
    # fall back to the previous hard-coded value when it cannot be guessed.
    mime_type, _ = mimetypes.guess_type(image_path)
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    # authenticate gemini client
    client = genai.Client(api_key=api_key)

    # call Google Search tool
    google_search_tool = Tool(
        google_search=GoogleSearch()
    )

    # NOTE(review): a search tool is combined with a JSON response MIME type
    # here; confirm against the Gemini API docs that this model accepts both
    # in one request.
    response = client.models.generate_content(
        model="gemini-2.0-flash",
        contents=[
            types.Part.from_bytes(data=image_data, mime_type=mime_type),
            _CV2NET_PROMPT,
        ],
        config=GenerateContentConfig(
            tools=[google_search_tool],
            response_modalities=["TEXT"],
            response_mime_type="application/json",
        ),
    )

    # The reply may carry no text at all; json.loads(None) would raise a
    # TypeError that the JSON handler below does not catch.
    raw_text = response.text
    if raw_text is None:
        print(f"Empty response for image: {image_path}")
        return None

    try:
        # convert response from string to JSON
        records = json.loads(raw_text)
    except json.JSONDecodeError as e:
        # Report the path, not the raw image bytes, so the log stays readable.
        print(f"Error decoding JSON for image: {image_path} - {e}")
        return None

    # convert JSON into a DataFrame
    return pd.DataFrame(records)