"""Batch-describe sidewalk obstacles in PNG images with an OpenAI chat model.

For every selected ``.png`` in a folder the script sends the image together
with a fixed obstacle-analysis prompt to the chat-completions endpoint and
appends the answer (or the error) to a plain-text report file.
"""

import argparse
import base64
import os
import sys
from time import sleep

import openai
from tqdm import tqdm

# -------------------------------------------------------------------
# Setup
# -------------------------------------------------------------------
# SECURITY: the API key is taken from the environment and must never be
# committed to source control. Any key that was ever stored in this file
# should be treated as leaked and revoked.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Upper bound on tokens the model may generate per image.
_MAX_COMPLETION_TOKENS = 2048

# Written after every record in the report file.
_RECORD_SEPARATOR = "\n" + "-" * 80 + "\n\n"

# Only images whose path contains one of these names are sent to the model.
IMAGE_REST_NAMES = (
    "Flower_pot_Pos5_OOPS0.png",
    "GarbageBag_Set1_Pos1_OOPS0.png",
    "Safety_Cone_Pos3_OOPS1.png",
    "trashcan_in_0L.png",
)

# The two questions asked about every image (sent as the user message).
QUESTIONS_PROMPT = (
    "1. Identify the obstacle on the sidewalk or walkable path ahead. \n"
    "2.Identify the single object most likely to be hit by a pedestrian moving straight ahead, and assign it a formal Out-of-Place Score (0–100) based only on its position.\n"
    "Scoring scale:\n"
    "0 = perfectly expected position (default/home location)\n"
    "50 = somewhat out of place from where it is typically expected\n"
    "100 = completely out of place and highly surprising\n\n"
)

# System message that frames the assistant's role.
SYSTEM_MESSAGE_TEXT = (
    "I am fully blind. You are a mobility assistant who analyzes the scene "
    "and describes obstacles for safe navigation. Be concise and accurate. "
)


# -------------------------------------------------------------------
# Helper: encode image as base64
# -------------------------------------------------------------------
def encode_image(image_path: str) -> str:
    """Return the contents of *image_path* as a base64-encoded ASCII string."""
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")


def _build_messages(img_b64: str) -> list:
    """Build the system + user messages for one base64-encoded PNG."""
    return [
        {
            "role": "system",
            "content": SYSTEM_MESSAGE_TEXT,
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": QUESTIONS_PROMPT},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{img_b64}"},
                },
            ],
        },
    ]


def _write_record(out_f, img_path: str, body: str) -> None:
    """Append one ``IMAGE: ...`` record to the report and flush it to disk."""
    out_f.write(f"IMAGE: {img_path}\n")
    out_f.write(body + "\n")
    out_f.write(_RECORD_SEPARATOR)
    # Flush per record so partial results survive an interrupted run.
    out_f.flush()


def _ask_model(img_path: str, model: str) -> str:
    """Send one image to the model and return the text of its first choice.

    Raises:
        ValueError: if the response carries no text content.
        Exception: whatever file reading or the OpenAI client raises.
    """
    response = openai.ChatCompletion.create(
        model=model,
        messages=_build_messages(encode_image(img_path)),
        max_completion_tokens=_MAX_COMPLETION_TOKENS,
    )
    answer = response.choices[0].message.content
    if answer is None:
        # Surface a readable error instead of an AttributeError on .strip().
        raise ValueError("model returned no text content")
    return answer


# -------------------------------------------------------------------
# Main function
# -------------------------------------------------------------------
def analyze_obstacles_in_folder(
    image_dir: str,
    output_path: str,
    model: str = "gpt-5",
    temperature: float = 1,
    sleep_time: float = 1.0,
):
    """Send selected PNG images in *image_dir* to the model and log the answers.

    The ``.png`` files (case-insensitive extension) directly inside
    *image_dir* are sorted by path; only those whose path contains one of
    ``IMAGE_REST_NAMES`` are processed. For each one a record is appended to
    *output_path*: the image path followed by the model's answer, or by
    ``ERROR: ...`` if anything failed for that image. A failure never aborts
    the batch.

    Args:
        image_dir: Folder to scan for ``.png`` files (not recursive).
        output_path: Report file, opened in append mode with UTF-8 encoding.
        model: Model name passed to the chat-completions call.
        temperature: Accepted for interface compatibility; currently NOT sent
            with the request.
        sleep_time: Seconds to pause after each successful request.
    """
    image_paths = sorted(
        os.path.join(image_dir, name)
        for name in os.listdir(image_dir)
        if name.lower().endswith(".png")
    )

    if not image_paths:
        print(f"No .png images found in {image_dir}")
        return

    # Filter up front so the progress bar reflects the images actually sent.
    selected_paths = [
        path
        for path in image_paths
        if any(name in path for name in IMAGE_REST_NAMES)
    ]

    with open(output_path, "a", encoding="utf-8") as out_f:
        for img_path in tqdm(selected_paths, desc="Processing images"):
            try:
                answer = _ask_model(img_path, model)
            except Exception as e:
                # Best-effort batch: record the failure and move on.
                print(f"Error processing {img_path}: {e}")
                _write_record(out_f, img_path, f"ERROR: {e}")
            else:
                _write_record(out_f, img_path, answer.strip())
                sleep(sleep_time)

    print(f"Done. \nResults saved to {output_path}")


# -------------------------------------------------------------------
# CLI
# -------------------------------------------------------------------
def main() -> None:
    """Parse command-line arguments and run the folder analysis."""
    parser = argparse.ArgumentParser(description="Process PNG images with GPT.")
    parser.add_argument("--image_dir", required=True)
    parser.add_argument("--output", required=True)
    parser.add_argument("--model", default="gpt-5")
    parser.add_argument(
        "--temperature",
        type=float,
        default=0.2,
        help="accepted but currently not sent to the API",
    )
    parser.add_argument("--sleep", type=float, default=1.0)
    args = parser.parse_args()

    if not openai.api_key:
        print(
            "Warning: OPENAI_API_KEY is not set; API requests will likely fail.",
            file=sys.stderr,
        )

    analyze_obstacles_in_folder(
        args.image_dir,
        args.output,
        model=args.model,
        temperature=args.temperature,
        sleep_time=args.sleep,
    )


if __name__ == "__main__":
    main()