shubham7080 committed on
Commit
96dcdde
·
verified ·
1 Parent(s): b8050be

Create tools/describe_image_tool.py

Browse files
Files changed (1) hide show
  1. tools/describe_image_tool.py +110 -0
tools/describe_image_tool.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import os
3
+
4
+ from openai import OpenAI
5
+ from smolagents import Tool
6
+
7
+ client = OpenAI()
8
+
9
+
10
class DescribeImageTool(Tool):
    """
    Tool that describes a local image with an OpenAI vision-capable chat model.

    The image is read from disk, base64-encoded, and sent together with a text
    prompt through the module-level ``client`` (``client.chat.completions``).

    Args:
        image_path (str): Path to the image file.
        description_type (str): Type of description to generate. Options:
            - "general": General description of the image
            - "detailed": Detailed analysis of the image
            - "chess": Analysis of a chess position
            - "text": Extract and describe text from the image
            - "custom": Custom description based on user prompt
          Matching is case-insensitive; unknown or missing values fall back
          to "general".
        custom_prompt (str): Prompt used when description_type is "custom".
          If it is missing or blank, the "general" prompt is used instead.

    Returns:
        str: Description of the image based on the requested type, or a
        message starting with "Error" when the file is missing or the
        request fails.
    """

    name = "describe_image"
    description = "Analyzes and describes images using GPT-4 Vision API"
    inputs = {
        "image_path": {"type": "string", "description": "Path to the image file"},
        "description_type": {
            "type": "string",
            "description": "Type of description to generate (general, detailed, chess, text, custom)",
            "nullable": True,
        },
        "custom_prompt": {
            "type": "string",
            "description": "Custom prompt for description (only used when description_type is 'custom')",
            "nullable": True,
        },
    }
    output_type = "string"

    def encode_image(self, image_path: str) -> str:
        """Return the contents of the file at *image_path* as a base64 string."""
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode("utf-8")

    def get_prompt(self, description_type: str, custom_prompt: str = None) -> str:
        """Return the prompt text for *description_type*.

        Always returns a non-empty string: an unknown or missing type, or a
        "custom" request without a usable *custom_prompt*, yields the
        "general" prompt.
        """
        prompts = {
            "general": "Provide a general description of this image. Focus on the main subjects, colors, and overall scene.",
            "detailed": (
                "Analyze this image in detail. Include:\n"
                "1. Main subjects and their relationships\n"
                "2. Colors, lighting, and composition\n"
                "3. Any text or symbols present\n"
                "4. Context or possible meaning\n"
                "5. Notable details or interesting elements"
            ),
            "chess": (
                "Analyze this chess position and provide a detailed description including:\n"
                "1. List of pieces on the board for both white and black\n"
                "2. Whose turn it is to move\n"
                "3. Basic evaluation of the position\n"
                "4. Any immediate tactical opportunities or threats\n"
                "5. Suggested next moves with brief explanations"
            ),
            "text": "Extract and describe any text present in this image. If there are multiple pieces of text, organize them clearly.",
        }

        # Both inputs are declared nullable, so either may arrive as None.
        kind = description_type.strip().lower() if isinstance(description_type, str) else "general"

        if kind == "custom":
            # Without this guard a missing custom prompt would be forwarded as
            # ``None`` and the API request would be rejected.
            if isinstance(custom_prompt, str) and custom_prompt.strip():
                return custom_prompt
            return prompts["general"]

        return prompts.get(kind, prompts["general"])

    def forward(
        self,
        image_path: str,
        description_type: str = "general",
        custom_prompt: str = None,
    ) -> str:
        """Describe the image at *image_path* and return the model's answer.

        Failures are reported as a returned "Error ..." string instead of an
        exception, so the calling agent always receives text.
        """
        # Imported locally so the tool needs no change to the module's imports.
        import mimetypes

        try:
            # isfile (not exists) so that a directory path is rejected here
            # rather than failing later inside open().
            if not os.path.isfile(image_path):
                return f"Error: Image file not found at {image_path}"

            # Encode the image
            base64_image = self.encode_image(image_path)

            # Label the payload with the file's real image type; keep the
            # previous "image/jpeg" as the fallback when it cannot be guessed.
            mime_type, _ = mimetypes.guess_type(image_path)
            if not mime_type or not mime_type.startswith("image/"):
                mime_type = "image/jpeg"

            # Get appropriate prompt
            prompt = self.get_prompt(description_type, custom_prompt)

            # Make the API call
            response = client.chat.completions.create(
                model="gpt-4.1",
                messages=[
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:{mime_type};base64,{base64_image}"
                                },
                            },
                        ],
                    }
                ],
                max_tokens=1000,
            )

            answer = response.choices[0].message.content
            if answer is None:
                # The declared output type is a string; never hand back None.
                return "Error analyzing image: the model returned no content"
            return answer

        except Exception as e:
            # Deliberate catch-all at the tool boundary: the error is handed
            # back to the agent as text.
            return f"Error analyzing image: {str(e)}"