oops / internvl_fix.py

Upload folder using huggingface_hub

75f0bc0 verified about 2 months ago

6.67 kB

	import base64
	import json
	import os
	from time import sleep
	from typing import Optional

	import requests
	from tqdm import tqdm

	# -------------------------------------------------------------------
	# Helper: encode image as base64
	# -------------------------------------------------------------------
	def encode_image(image_path: str) -> str:
	    """Return the contents of the file at ``image_path`` as base64 text.

	    The file is read in binary mode in one go, base64-encoded, and the
	    resulting bytes are decoded as UTF-8 so they can be embedded in a
	    JSON string (for example inside a ``data:`` URL).
	    """
	    with open(image_path, "rb") as image_file:
	        raw_bytes = image_file.read()
	    return base64.b64encode(raw_bytes).decode("utf-8")


	# -------------------------------------------------------------------
	# Main function: use InternVL to analyze obstacles
	# -------------------------------------------------------------------
	def _build_obstacle_payload(model, prompt_text, img_b64, temperature):
	    """Return the chat-completions request body for one base64 PNG image."""
	    return {
	        "model": model,
	        "messages": [
	            # A short text-only exchange precedes the multimodal message.
	            {"role": "user", "content": "Start session."},
	            {"role": "assistant", "content": "Session started."},
	            {
	                "role": "user",
	                "content": [
	                    {"type": "text", "text": prompt_text},
	                    {
	                        "type": "image_url",
	                        "image_url": {
	                            "url": f"data:image/png;base64,{img_b64}"
	                        },
	                    },
	                ],
	            },
	        ],
	        "temperature": temperature,
	        "top_p": 0.9,
	        "max_tokens": 2048,
	    }


	def _write_record(out_f, img_path, body):
	    """Write one ``IMAGE:`` record followed by a separator, then flush."""
	    out_f.write(f"IMAGE: {img_path}\n")
	    out_f.write(body + "\n")
	    out_f.write("\n" + "-" * 80 + "\n\n")
	    # Flush per record so partial results survive an interrupted run.
	    out_f.flush()


	def analyze_obstacles_in_folder_internvl(
	    image_dir: str,
	    output_path: str,
	    api_key: Optional[str] = None,
	    model: str = "internvl3.5-241b-a28b",
	    temperature: float = 1.0,
	    sleep_time: float = 1.0,
	    timeout: float = 120.0,
	):
	    """Send every ``.png`` in ``image_dir`` to InternVL and log the answers.

	    Images are processed in sorted path order. For each one, a record
	    (``IMAGE: <path>``, the stripped model answer or ``ERROR: <message>``,
	    and a dashed separator) is appended to ``output_path``.

	    Args:
	        image_dir: Directory that is listed (non-recursively) for files
	            whose name ends in ``.png``, case-insensitively.
	        output_path: Text file opened in append mode with UTF-8 encoding.
	        api_key: Bearer token for the API. When ``None``, the
	            ``INTERNVL_API_KEY`` environment variable is used instead.
	        model: Value sent as the ``model`` field of the request.
	        temperature: Value sent as the ``temperature`` field.
	        sleep_time: Seconds to pause after every image, whether the
	            request succeeded or failed, to avoid hammering the API.
	        timeout: Seconds passed to ``requests.post`` as ``timeout`` so a
	            stalled connection cannot block the batch indefinitely.

	    Returns:
	        None. When no PNG files are found a message is printed and the
	        output file is not opened.

	    Raises:
	        ValueError: If no API key is given and the environment variable
	            is unset or empty.
	    """
	    if api_key is None:
	        # Fall back to the environment when no key was passed explicitly.
	        api_key = os.getenv("INTERNVL_API_KEY")

	    if not api_key:
	        raise ValueError(
	            "No InternVL API key provided. "
	            "Pass --api_key on the command line or set the INTERNVL_API_KEY env var."
	        )

	    url = "https://chat.intern-ai.org.cn/api/v1/chat/completions"
	    headers = {
	        "Content-Type": "application/json",
	        "Authorization": f"Bearer {api_key}",
	    }

	    image_paths = sorted(
	        os.path.join(image_dir, name)
	        for name in os.listdir(image_dir)
	        if name.lower().endswith(".png")
	    )

	    if not image_paths:
	        print(f"No .png images found in {image_dir}")
	        return

	    questions_prompt = (
	        "1. Identify the obstacle on the sidewalk or walkable path ahead."
	    )

	    # Instruction text is prepended to the question inside the user message
	    # rather than being sent as a separate "system" role message.
	    system_message_text = (
	        "I am fully blind. You are a mobility assistant who analyzes the scene and describes obstacles for safe navigation. Be concise and accurate. Start your answer with 1. "
	    )

	    # The prompt is identical for every image, so build it once.
	    prompt_text = system_message_text + "\n\n" + questions_prompt

	    with open(output_path, "a", encoding="utf-8") as out_f:
	        for img_path in tqdm(image_paths, desc="Processing images with InternVL"):
	            try:
	                payload = _build_obstacle_payload(
	                    model, prompt_text, encode_image(img_path), temperature
	                )
	                response = requests.post(
	                    url,
	                    headers=headers,
	                    data=json.dumps(payload),
	                    timeout=timeout,
	                )
	                response.raise_for_status()
	                content = response.json()["choices"][0]["message"]["content"]
	                # Compute the full record body before writing anything so
	                # an unexpected content type cannot leave a half-written
	                # record followed by a duplicate header.
	                record = content.strip()
	            except Exception as e:
	                # Deliberate best-effort boundary: one bad image or failed
	                # request is recorded and the batch continues.
	                print(f"Error processing {img_path}: {e}")
	                record = f"ERROR: {e}"

	            _write_record(out_f, img_path, record)

	            # Pause after failures as well as successes.
	            sleep(sleep_time)

	    print(f"Done. Results saved to {output_path}")


	# -------------------------------------------------------------------
	# CLI
	# -------------------------------------------------------------------
	def build_arg_parser():
	    """Create the command-line parser for the obstacle-analysis script.

	    Returns:
	        An ``argparse.ArgumentParser`` with ``--image_dir`` and ``--output``
	        (both required) plus optional ``--api_key``, ``--model``,
	        ``--temperature`` and ``--sleep``.
	    """
	    import argparse

	    parser = argparse.ArgumentParser(
	        description="Process PNG images with InternVL for obstacle analysis."
	    )
	    parser.add_argument("--image_dir", required=True, help="Folder of .png images")
	    parser.add_argument("--output", required=True, help="Output text file")
	    # No secret is embedded here: with the default of None the analysis
	    # function falls back to the INTERNVL_API_KEY environment variable.
	    # NOTE(review): a literal key used to be the default on this option;
	    # treat that key as compromised and rotate it.
	    parser.add_argument(
	        "--api_key",
	        default=None,
	        help="InternVL API key (or set INTERNVL_API_KEY env var).",
	    )
	    parser.add_argument(
	        "--model",
	        default="internvl3.5-241b-a28b",
	        help="Model name for InternVL (default: %(default)s)",
	    )
	    parser.add_argument(
	        "--temperature", type=float, default=0.2, help="Sampling temperature"
	    )
	    parser.add_argument(
	        "--sleep",
	        type=float,
	        default=1.0,
	        help="Sleep time between requests (seconds)",
	    )
	    return parser


	if __name__ == "__main__":
	    args = build_arg_parser().parse_args()

	    analyze_obstacles_in_folder_internvl(
	        image_dir=args.image_dir,
	        output_path=args.output,
	        api_key=args.api_key,
	        model=args.model,
	        temperature=args.temperature,
	        sleep_time=args.sleep,
	    )