# Source: oops / internvl_fix.py (uploaded with huggingface_hub by deansmile123, revision 75f0bc0, 6.67 kB)
import os
import base64
import json
import requests
from time import sleep
from tqdm import tqdm
# -------------------------------------------------------------------
# Helper: encode image as base64
# -------------------------------------------------------------------
def encode_image(image_path: str) -> str:
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is opened in binary mode, read in full, base64-encoded, and the
    resulting bytes are decoded to ``str`` using UTF-8.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")
# -------------------------------------------------------------------
# Main function: use InternVL to analyze obstacles
# -------------------------------------------------------------------
def analyze_obstacles_in_folder_internvl(
    image_dir: str,
    output_path: str,
    api_key: str = None,
    model: str = "internvl3.5-241b-a28b",
    temperature: float = 1.0,
    sleep_time: float = 1.0,
    request_timeout: float = 120.0,
):
    """Ask InternVL to describe the obstacle in every PNG found in a folder.

    Each ``.png`` file directly inside ``image_dir`` (case-insensitive
    extension, processed in sorted path order) is base64-encoded and posted to
    the InternVL chat-completions endpoint together with the obstacle prompt.
    Results are appended to ``output_path`` as one entry per image:
    an ``IMAGE: <path>`` line, the model's answer (or ``ERROR: <message>`` if
    that image failed), and a separator line of 80 dashes.

    Args:
        image_dir: Folder to scan for PNG images (not recursive).
        output_path: Text file to append results to; created if missing.
        api_key: InternVL API key. When ``None``, the ``INTERNVL_API_KEY``
            environment variable is used instead.
        model: Model name sent in the request body.
        temperature: Sampling temperature sent in the request body.
        sleep_time: Seconds to pause after each image attempt.
        request_timeout: Timeout in seconds handed to ``requests.post`` so a
            stalled connection cannot block the whole batch indefinitely.

    Raises:
        ValueError: If no API key is supplied and the environment variable is
            unset or empty.

    Errors from listing ``image_dir`` or opening ``output_path`` propagate to
    the caller. Failures while processing an individual image are printed,
    recorded in the output file, and do not stop the batch.
    """
    if api_key is None:
        # Fall back to the environment when no key was passed explicitly.
        api_key = os.getenv("INTERNVL_API_KEY")
    if not api_key:
        raise ValueError(
            "No InternVL API key provided. "
            "Pass --api_key on the command line or set the INTERNVL_API_KEY env var."
        )

    url = "https://chat.intern-ai.org.cn/api/v1/chat/completions"
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    # Collect all PNG images in a deterministic order.
    image_paths = sorted(
        os.path.join(image_dir, f)
        for f in os.listdir(image_dir)
        if f.lower().endswith(".png")
    )
    if not image_paths:
        print(f"No .png images found in {image_dir}")
        return

    # The single question put to the model for every image.
    questions_prompt = (
        "1. Identify the obstacle on the sidewalk or walkable path ahead."
    )
    # "System"-style instructions; they are prepended to the question text
    # inside the user message rather than sent as a separate system role.
    system_message_text = (
        "I am fully blind. You are a mobility assistant who analyzes the scene and describes obstacles for safe navigation. Be concise and accurate. Start your answer with 1. "
    )
    # Both pieces are loop-invariant, so build them once.
    prompt_text = system_message_text + "\n\n" + questions_prompt
    separator = "\n" + "-" * 80 + "\n\n"

    with open(output_path, "a", encoding="utf-8") as out_f:
        for img_path in tqdm(image_paths, desc="Processing images with InternVL"):
            try:
                payload = _build_internvl_payload(
                    model, prompt_text, encode_image(img_path), temperature
                )
                response = requests.post(
                    url,
                    headers=headers,
                    data=json.dumps(payload),
                    timeout=request_timeout,
                )
                response.raise_for_status()
                content = response.json()["choices"][0]["message"]["content"]
                entry = content.strip()
            except Exception as e:
                # Best-effort batch: record the failure and move on to the
                # next image instead of aborting the whole run.
                print(f"Error processing {img_path}: {e}")
                entry = f"ERROR: {e}"

            out_f.write(f"IMAGE: {img_path}\n")
            out_f.write(entry + "\n")
            out_f.write(separator)
            # Flush per image so partial results survive an interrupted run.
            out_f.flush()

            # Throttle after every attempt, including failures, so an API that
            # is erroring or rate-limiting is not hit again immediately.
            sleep(sleep_time)

    print(f"Done. Results saved to {output_path}")


def _build_internvl_payload(model, prompt_text, img_b64, temperature):
    """Build the chat-completions request body for one base64-encoded PNG."""
    return {
        "model": model,
        "messages": [
            # Following the style of objects_name_intern.py: first a short
            # "session" exchange, then the actual multimodal user message.
            {"role": "user", "content": "Start session."},
            {"role": "assistant", "content": "Session started."},
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_text},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{img_b64}"
                        },
                    },
                ],
            },
        ],
        "temperature": temperature,
        "top_p": 0.9,
        "max_tokens": 2048,
    }
# -------------------------------------------------------------------
# CLI
# -------------------------------------------------------------------
def parse_cli_args(argv=None):
    """Parse the command-line options of the InternVL obstacle-analysis script.

    Args:
        argv: Argument list to parse. ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` with ``image_dir``, ``output``, ``api_key``,
        ``model``, ``temperature`` and ``sleep`` attributes.
    """
    import argparse

    parser = argparse.ArgumentParser(
        description="Process PNG images with InternVL for obstacle analysis."
    )
    parser.add_argument("--image_dir", required=True, help="Folder of .png images")
    parser.add_argument("--output", required=True, help="Output text file")
    parser.add_argument(
        "--api_key",
        # No built-in key: secrets must not live in source control. Leaving
        # this as None lets the analysis function fall back to the
        # INTERNVL_API_KEY environment variable, as the help text promises.
        default=None,
        help="InternVL API key (or set INTERNVL_API_KEY env var).",
    )
    parser.add_argument(
        "--model",
        default="internvl3.5-241b-a28b",
        help="Model name for InternVL (default: internvl3.5-241b-a28b)",
    )
    parser.add_argument(
        "--temperature", type=float, default=0.2, help="Sampling temperature"
    )
    parser.add_argument(
        "--sleep",
        type=float,
        default=1.0,
        help="Sleep time between requests (seconds)",
    )
    return parser.parse_args(argv)


if __name__ == "__main__":
    args = parse_cli_args()
    analyze_obstacles_in_folder_internvl(
        image_dir=args.image_dir,
        output_path=args.output,
        api_key=args.api_key,
        model=args.model,
        temperature=args.temperature,
        sleep_time=args.sleep,
    )