giulia-fontanella committed on
Commit
3fae792
·
verified ·
1 Parent(s): a8e3583

Update tools.py

Browse files
Files changed (1) hide show
  1. tools.py +52 -1
tools.py CHANGED
@@ -47,5 +47,56 @@ def extract_text(img_path: str) -> str:
47
  print(error_msg)
48
  return ""
49
 
50
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
 
47
  print(error_msg)
48
  return ""
49
 
50
+
51
def describe_image(img_path: str, query: str) -> str:
    """
    Describe a local image with a multimodal model, guided by a query.

    The image file is read as raw bytes, base64-encoded, and embedded in a
    ``data:`` URL that is sent to the vision model together with a text
    prompt asking for a rich description and for the information named in
    ``query``.

    Args:
        img_path: Path to a local image file (e.g., PNG, JPEG). The media
            type placed in the data URL is guessed from this path's
            extension; if it cannot be determined or is not an image type,
            ``image/png`` is used.
        query: Information to extract from the image; appended verbatim to
            the prompt.

    Returns:
        The model's response content with surrounding whitespace stripped,
        or an empty string if anything fails (unreadable file, model error,
        ...). Failures are printed, not raised.
    """
    # Imported locally so the function does not depend on a module-level
    # import that may not exist in the file.
    import mimetypes

    try:
        # Read image and encode as base64
        with open(img_path, "rb") as image_file:
            image_bytes = image_file.read()

        image_base64 = base64.b64encode(image_bytes).decode("utf-8")

        # Label the payload with the file's real media type instead of
        # always claiming PNG; fall back to the previous default when the
        # extension is unknown or does not map to an image type.
        mime_type, _ = mimetypes.guess_type(img_path)
        if not mime_type or not mime_type.startswith("image/"):
            mime_type = "image/png"

        prompt = (
            "Describe this image in rich detail. Include objects, people, "
            "setting, background elements, and any inferred actions or "
            "context. Avoid technical jargon. In particular, extract the "
            f"following information: {query}"
        )

        # Prepare message payload
        message = [
            HumanMessage(
                content=[
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{image_base64}"
                        },
                    },
                ]
            )
        ]

        # Call the vision model (assumes vision_llm and HumanMessage are
        # defined/imported at module level -- not visible in this view).
        response = vision_llm.invoke(message)

        return response.content.strip()

    except Exception as e:
        # Deliberate best-effort: report the problem and return an empty
        # description rather than propagating the error to the caller.
        error_msg = f"Error describing image: {str(e)}"
        print(error_msg)
        return ""
102