Final_Assignment_Template

Sleeping

App Files Community

dlaima committed on Jun 3, 2025

Commit

0a0ae08

verified ·

1 Parent(s): 593012b

Update image_analyzer.py

Browse files

Files changed (1) hide show

image_analyzer.py +22 -14

image_analyzer.py CHANGED Viewed

@@ -5,7 +5,7 @@ from smolagents import Tool
 class ImageAnalysisTool(Tool):
     name = "image_analysis"
-    description = "Analyze the content of an image and answer a specific question about it using HF Inference API."
     inputs = {
         "image_path": {
             "type": "string",
@@ -20,10 +20,13 @@ class ImageAnalysisTool(Tool):
     def __init__(self):
         super().__init__()
-        # You can replace this with any vision model capable of VQA or image captioning
         self.api_url = "https://api-inference.huggingface.co/models/microsoft/git-base-captioning"
         self.headers = {
-            "Authorization": f"Bearer {os.getenv('HF_API_TOKEN')}"
         }
     def forward(self, image_path: str, question: str) -> str:
@@ -31,32 +34,36 @@ class ImageAnalysisTool(Tool):
             with open(image_path, "rb") as img_file:
                 image_bytes = img_file.read()
-            # Prepare the payload depending on the model API.
-            # Some models accept just the image bytes and return captions,
-            # some support multimodal input with text question + image.
-            # For this example, we'll assume a captioning model and append question manually.
             response = requests.post(
                 self.api_url,
                 headers=self.headers,
-                data=image_bytes,
                 timeout=60
             )
             if response.status_code == 200:
                 result = response.json()
                 caption = None
-                # The format depends on the model; check keys like 'generated_text' or 'caption'
                 if isinstance(result, dict):
-                    caption = result.get("generated_text") or result.get("caption")
-                elif isinstance(result, list) and len(result) > 0:
-                    caption = result[0].get("generated_text") if "generated_text" in result[0] else None
                 if not caption:
                     return "Error: No caption found in model response."
-                # Simple approach: combine caption + question to produce answer prompt
-                # If you want a deeper answer, you could chain a chat model here.
                 answer = f"Caption: {caption}\nAnswer to question '{question}': {caption}"
                 return answer.strip()
@@ -68,4 +75,5 @@ class ImageAnalysisTool(Tool):

 class ImageAnalysisTool(Tool):
     name = "image_analysis"
+    description = "Analyze the content of an image and answer a specific question about it using Hugging Face Inference API."
     inputs = {
         "image_path": {
             "type": "string",
     def __init__(self):
         super().__init__()
+        api_token = os.getenv("HF_API_TOKEN")
+        if not api_token:
+            raise EnvironmentError("HF_API_TOKEN not found in environment variables.")
         self.api_url = "https://api-inference.huggingface.co/models/microsoft/git-base-captioning"
         self.headers = {
+            "Authorization": f"Bearer {api_token}",
+            "Content-Type": "application/json"
         }
     def forward(self, image_path: str, question: str) -> str:
             with open(image_path, "rb") as img_file:
                 image_bytes = img_file.read()
+            # Encode image to base64 string
+            img_b64 = base64.b64encode(image_bytes).decode("utf-8")
+            # Prepare JSON payload - the exact structure depends on the model capabilities
+            # Here we send just the image for captioning
+            payload = {
+                "inputs": img_b64
+            }
             response = requests.post(
                 self.api_url,
                 headers=self.headers,
+                json=payload,
                 timeout=60
             )
             if response.status_code == 200:
                 result = response.json()
                 caption = None
+                # Try common keys for caption output
                 if isinstance(result, dict):
+                    caption = result.get("generated_text") or result.get("caption") or result.get("text")
+                elif isinstance(result, list) and len(result) > 0 and isinstance(result[0], dict):
+                    caption = result[0].get("generated_text") or result[0].get("caption") or result[0].get("text")
                 if not caption:
                     return "Error: No caption found in model response."
+                # Combine caption with the question to form a simple answer
                 answer = f"Caption: {caption}\nAnswer to question '{question}': {caption}"
                 return answer.strip()