"""Batch obstacle analysis of PNG images with the InternVL chat-completions API.

Every ``.png`` file in a folder is sent, together with a fixed obstacle
prompt, to the InternVL endpoint; the model's answer (or the error that
occurred) is appended to a plain-text report file.
"""

import argparse
import base64
import json
import os
from time import sleep
from typing import List, Optional, TextIO

import requests
from tqdm import tqdm

INTERNVL_CHAT_URL = "https://chat.intern-ai.org.cn/api/v1/chat/completions"
API_KEY_ENV_VAR = "INTERNVL_API_KEY"
DEFAULT_MODEL = "internvl3.5-241b-a28b"

# Written after every record in the report file.
RECORD_SEPARATOR = "\n" + "-" * 80 + "\n\n"

# Only question 1 is currently sent to the model.
QUESTIONS_PROMPT = (
    "1. Identify the obstacle on the sidewalk or walkable path ahead."
)
# Disabled follow-up question, kept for reference (NOT sent to the model):
# "8. Identify the single object most likely to be hit by a pedestrian moving straight ahead, and assign it a formal Out-of-Place Score (0–100) based only on its position.\n"
# "   Scoring scale:\n"
# "   0 = perfectly expected position (default/home location)\n"
# "   50 = somewhat out of place from where it is typically expected\n"
# "   100 = completely out of place and highly surprising\n"

# "System" message text – prepended to the text part of the user message
# instead of being sent with a separate system role.
SYSTEM_MESSAGE_TEXT = (
    "I am fully blind. You are a mobility assistant who analyzes the scene and describes obstacles for safe navigation. Be concise and accurate. Start your answer with 1. "
)


# -------------------------------------------------------------------
# Helper: encode image as base64
# -------------------------------------------------------------------
def encode_image(image_path: str) -> str:
    """Return the bytes of the file at *image_path* as a base64 text string."""
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")


def _list_png_images(image_dir: str) -> List[str]:
    """Return the sorted paths of all ``.png`` entries (case-insensitive) in *image_dir*."""
    return sorted(
        os.path.join(image_dir, name)
        for name in os.listdir(image_dir)
        if name.lower().endswith(".png")
    )


def _build_payload(img_b64: str, model: str, temperature: float) -> dict:
    """Build the chat-completions request body for one base64-encoded PNG."""
    return {
        "model": model,
        "messages": [
            # A short "session" exchange precedes the actual multimodal
            # user message.
            {"role": "user", "content": "Start session."},
            {"role": "assistant", "content": "Session started."},
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": SYSTEM_MESSAGE_TEXT + "\n\n" + QUESTIONS_PROMPT,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{img_b64}"
                        },
                    },
                ],
            },
        ],
        "temperature": temperature,
        "top_p": 0.9,
        "max_tokens": 2048,
    }


def _write_record(out_f: TextIO, img_path: str, body: str) -> None:
    """Append one ``IMAGE: ...`` record to the report and flush it to disk."""
    out_f.write(f"IMAGE: {img_path}\n")
    out_f.write(body + "\n")
    out_f.write(RECORD_SEPARATOR)
    # Flush per record so partial results survive an interrupted run.
    out_f.flush()


# -------------------------------------------------------------------
# Main function: use InternVL to analyze obstacles
# -------------------------------------------------------------------
def analyze_obstacles_in_folder_internvl(
    image_dir: str,
    output_path: str,
    api_key: Optional[str] = None,
    model: str = DEFAULT_MODEL,
    temperature: float = 1.0,
    sleep_time: float = 1.0,
    timeout: float = 120.0,
):
    """Send every ``.png`` in *image_dir* to InternVL and append the answers to *output_path*.

    Images are processed in sorted path order. For each image either the
    model's answer or an ``ERROR: ...`` line is appended to the report, so a
    failure on one image does not stop the batch.

    Args:
        image_dir: Folder that is scanned (non-recursively) for ``.png`` files.
        output_path: Text file the results are appended to (opened in ``"a"``
            mode, UTF-8).
        api_key: InternVL API key. When ``None`` the ``INTERNVL_API_KEY``
            environment variable is used instead.
        model: Model name placed in the request body.
        temperature: Sampling temperature placed in the request body.
        sleep_time: Seconds to pause after each image (success or failure).
        timeout: Seconds to wait for the HTTP request before giving up on
            that image.

    Raises:
        ValueError: If no API key is supplied and the environment variable is
            unset or empty.
    """
    if api_key is None:
        # Try to read from environment variable as a fallback
        api_key = os.getenv(API_KEY_ENV_VAR)
    if not api_key:
        raise ValueError(
            "No InternVL API key provided. "
            "Pass --api_key on the command line or set the INTERNVL_API_KEY env var."
        )

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    image_paths = _list_png_images(image_dir)
    if not image_paths:
        print(f"No .png images found in {image_dir}")
        return

    with open(output_path, "a", encoding="utf-8") as out_f:
        for img_path in tqdm(image_paths, desc="Processing images with InternVL"):
            try:
                payload = _build_payload(encode_image(img_path), model, temperature)
                # Without a timeout a single stalled connection would block
                # the whole batch indefinitely.
                response = requests.post(
                    INTERNVL_CHAT_URL,
                    headers=headers,
                    data=json.dumps(payload),
                    timeout=timeout,
                )
                response.raise_for_status()
                answer = response.json()["choices"][0]["message"]["content"].strip()
            except Exception as e:
                # Deliberate per-image boundary: record the failure and keep
                # going with the remaining images.
                print(f"Error processing {img_path}: {e}")
                _write_record(out_f, img_path, f"ERROR: {e}")
            else:
                _write_record(out_f, img_path, answer)
            # Pause after failures too, so an API-side error (e.g. rate
            # limiting) is not followed by an immediate new request.
            sleep(sleep_time)

    print(f"Done. Results saved to {output_path}")


# -------------------------------------------------------------------
# CLI
# -------------------------------------------------------------------
def main() -> None:
    """Parse command-line arguments and run the folder analysis."""
    parser = argparse.ArgumentParser(
        description="Process PNG images with InternVL for obstacle analysis."
    )
    parser.add_argument("--image_dir", required=True, help="Folder of .png images")
    parser.add_argument("--output", required=True, help="Output text file")
    # SECURITY: no key is embedded in the source. A key that was previously
    # committed here should be considered leaked and be revoked.
    parser.add_argument(
        "--api_key",
        default=None,
        help="InternVL API key (or set INTERNVL_API_KEY env var).",
    )
    parser.add_argument(
        "--model",
        default=DEFAULT_MODEL,
        help="Model name for InternVL (default: %(default)s)",
    )
    parser.add_argument(
        "--temperature", type=float, default=0.2, help="Sampling temperature"
    )
    parser.add_argument(
        "--sleep",
        type=float,
        default=1.0,
        help="Sleep time between requests (seconds)",
    )
    parser.add_argument(
        "--timeout",
        type=float,
        default=120.0,
        help="HTTP request timeout per image (seconds)",
    )
    args = parser.parse_args()

    analyze_obstacles_in_folder_internvl(
        image_dir=args.image_dir,
        output_path=args.output,
        api_key=args.api_key,
        model=args.model,
        temperature=args.temperature,
        sleep_time=args.sleep,
        timeout=args.timeout,
    )


if __name__ == "__main__":
    main()