"""Annotate images of queues (lines of people) with GPT-4o.

For every image the script sends one fixed 17-question prompt to the OpenAI
chat-completions endpoint and appends the model's JSON answer to a text file.

Third-party packages (``openai`` legacy 0.x SDK, ``tqdm``) are imported lazily
inside the functions that need them so that the pure helpers in this module
can be imported and unit-tested without the SDK, a key, or network access.
"""

import base64
import json
import mimetypes
import os
from time import sleep

# SECURITY: a service-account key used to be hardcoded here. It has been
# removed from the source; that key must be considered leaked and be revoked.
# The key is now read from this environment variable only.
API_KEY_ENV_VAR = "OPENAI_API_KEY"

MODEL_NAME = "gpt-4o"
MAX_RESPONSE_TOKENS = 1000
# Pause between consecutive requests (seconds) to stay clear of rate limits.
REQUEST_PAUSE_SECONDS = 0.5
# MIME type used when the file extension is unknown or not an image type.
# This was the value the script used unconditionally before.
DEFAULT_IMAGE_MIME = "image/png"

SYSTEM_PROMPT = (
    "You analyze a SINGLE image and return ONLY valid JSON "
    "with the specified keys and enumerated options. Do not include any extra text."
)

# Dataset location and driver settings used when run as a script.
IMAGE_ROOT = "/vast/ds5725/linefinder/LineFinder/Images"
SUBFOLDERS = ["QueuesInAirports", "QueuesInSupermarkets", "QueuesInBanks", "ImagesOnline"]
OUTPUT_FILE = "gpt_line_test.jsonl"
# Index into the sorted file list at which to resume; earlier images were
# already processed by a previous run. Set to 0 to process everything.
RESUME_FROM_INDEX = 63

_CODE_FENCE = "```"


def encode_image(image_path):
    """Return the contents of ``image_path`` as a base64-encoded ASCII string."""
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")


def _guess_image_mime(image_path):
    """Return an ``image/*`` MIME type guessed from the file extension.

    Falls back to ``DEFAULT_IMAGE_MIME`` when the extension is unknown or maps
    to a non-image type, so the data URL always declares an image.
    """
    guessed, _encoding = mimetypes.guess_type(image_path)
    if guessed and guessed.startswith("image/"):
        return guessed
    return DEFAULT_IMAGE_MIME


def _build_annotation_prompt():
    """Return the fixed annotation prompt.

    The prompt lists the 17 questions and their allowed options and asks for
    a single strict-JSON object in reply. The questions and options mirror
    the GUI annotation tool (gui1.py) and must be kept in lockstep with it.
    """
    return (
        "You are an expert at analyzing a single image of a line of people. "
        "Answer the following 17 questions STRICTLY as a single JSON object. "
        "Use the exact keys provided, and for multiple-choice fields choose ONE "
        "of the listed options verbatim. If something is not visible, pick the most appropriate option (e.g., 'N/A').\n\n"
        "Return ONLY JSON. No prose.\n\n"
        "Definitions:\n"
        "- **Start of the line (front):** the person closest to the counter or service point. "
        "This is usually the direction the line is facing towards.\n"
        "- **End of the line (back):** the person farthest from the counter or service point. "
        "This is usually the last person to join the line.\n\n"
        "Fields:\n"
        '1) "number_of_turns": integer\n'
        '2) "line_shape": one of ["Straight","Curved","S-shaped","Angled","other"]\n'
        '3) "line_facing_direction": one of ["Facing towards","Facing away","Facing sideways","other"]\n'
        '4) "number_of_people_in_line": integer\n'
        '5) "line_purpose": string (short phrase)\n'
        '6) "start_person_description": string (brief)\n'
        '7) "end_person_description": string (brief)\n'
        '8) "counter_person_description": string (brief)\n'
        '9) "boundary_present": one of ["yes","no"]\n'
        '10) "boundary_types": one of ["none","cones","rope dividers","stanchions","other"]\n'
        '11) "end_of_line_visible": one of ["yes","no"]\n'
        '12) "end_of_line_location_if_visible": one of ["far left","center left","center","center right","far right","N/A"]\n'
        '13) "direction_to_turn_to_see_end_if_not_visible": one of ["left","right","back","N/A"]\n'
        '14) "start_of_line_visible": one of ["yes","no"]\n'
        '15) "start_of_line_location_if_visible": one of ["far left","center left","center","center right","far right","N/A"]\n'
        '16) "direction_to_turn_to_see_start_if_not_visible": one of ["left","right","back","N/A"]\n'
        '17) "line_completeness": one of ["full","partial"]\n\n'
        "JSON schema example (values are placeholders):\n"
        "{\n"
        ' "number_of_turns": 0,\n'
        ' "line_shape": "Straight",\n'
        ' "line_facing_direction": "Facing towards",\n'
        ' "number_of_people_in_line": 16,\n'
        ' "line_purpose": "airport",\n'
        ' "start_person_description": "man wearing hat and blue shirt",\n'
        ' "end_person_description": "person wearing black t-shirt",\n'
        ' "counter_person_description": "unknown",\n'
        # Must be one of the options listed for field 9 ("yes"/"no"); the
        # example previously said "true", contradicting the instructions.
        ' "boundary_present": "yes",\n'
        ' "boundary_types": "rope dividers",\n'
        ' "end_of_line_visible": "yes",\n'
        ' "end_of_line_location_if_visible": "far left",\n'
        ' "direction_to_turn_to_see_end_if_not_visible": "N/A",\n'
        ' "start_of_line_visible": "no",\n'
        ' "start_of_line_location_if_visible": "N/A",\n'
        ' "direction_to_turn_to_see_start_if_not_visible": "right",\n'
        ' "line_completeness": "partial"\n'
        "}"
    )


def _normalize_json_text(content):
    """Return ``content`` as compact single-line JSON when it parses.

    A surrounding Markdown code fence (three backticks, optionally followed
    by a language tag such as ``json``) is removed before parsing, because
    chat models often add one even when told to return JSON only. If the
    text still is not valid JSON, the stripped original text is returned
    unchanged so it can be inspected later.
    """
    raw = (content or "").strip()
    candidate = raw
    if candidate.startswith(_CODE_FENCE):
        # Drop the whole opening fence line, including any language tag.
        _fence_line, newline, remainder = candidate.partition("\n")
        candidate = remainder if newline else candidate[len(_CODE_FENCE):]
        candidate = candidate.rstrip()
        if candidate.endswith(_CODE_FENCE):
            candidate = candidate[: -len(_CODE_FENCE)]
        candidate = candidate.strip()
    try:
        return json.dumps(json.loads(candidate), ensure_ascii=False)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return raw


def _load_openai():
    """Import the OpenAI SDK and make sure an API key is configured.

    Returns:
        The imported ``openai`` module.

    Raises:
        RuntimeError: if no key is set on the module and the
            ``OPENAI_API_KEY`` environment variable is empty or missing.
    """
    import openai  # lazy: keeps the pure helpers importable without the SDK

    if not getattr(openai, "api_key", None):
        key = os.environ.get(API_KEY_ENV_VAR)
        if not key:
            raise RuntimeError(
                f"No OpenAI API key configured; set the {API_KEY_ENV_VAR} "
                "environment variable."
            )
        openai.api_key = key
    return openai


def generate_reranking(image_paths, res_file_name: str, temperature: float = 0.2):
    """Ask the 17 annotation questions for each image and append the answers.

    One record is appended to ``res_file_name`` per image: the image's base
    file name on one line, then the model's answer (compact JSON when it
    parses, otherwise the raw reply), then a blank line. Note that this is
    not the JSON Lines format even if the file name ends in ``.jsonl``.

    Args:
        image_paths: iterable of absolute or relative image paths.
        res_file_name: output text file, opened in append mode.
        temperature: sampling temperature passed to the model.

    Raises:
        RuntimeError: if no OpenAI API key is configured.
    """
    openai = _load_openai()
    from tqdm import tqdm  # lazy, same reason as the openai import

    prompt = _build_annotation_prompt()

    with open(res_file_name, "a", encoding="utf-8") as out:
        for img_path in tqdm(image_paths):
            basename = os.path.basename(img_path)
            data_url = f"data:{_guess_image_mime(img_path)};base64,{encode_image(img_path)}"
            # tqdm.write prints without corrupting the progress bar.
            tqdm.write(str(img_path))

            response = openai.ChatCompletion.create(
                model=MODEL_NAME,
                messages=[
                    {
                        "role": "system",
                        "content": [{"type": "text", "text": SYSTEM_PROMPT}],
                    },
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {"type": "image_url", "image_url": {"url": data_url}},
                        ],
                    },
                ],
                max_tokens=MAX_RESPONSE_TOKENS,
                temperature=temperature,
            )

            # ``content`` may be None (e.g. an empty reply); the normalizer
            # treats that as an empty string instead of crashing.
            answer = _normalize_json_text(response.choices[0].message.content)

            out.write(basename + "\n" + answer + "\n\n")
            # Flush per record so finished work survives a crash mid-run.
            out.flush()
            sleep(REQUEST_PAUSE_SECONDS)


def _collect_image_paths(root, subfolders):
    """Return the sorted absolute paths of every file under ``root/<subfolder>``.

    Sub-directories are walked recursively. A subfolder that does not exist
    contributes nothing. No filtering by file type is performed.
    """
    collected = []
    for sub in subfolders:
        for dirpath, _dirnames, filenames in os.walk(os.path.join(root, sub)):
            collected.extend(
                os.path.abspath(os.path.join(dirpath, name)) for name in filenames
            )
    collected.sort()
    return collected


def main():
    """Annotate the dataset images, resuming at ``RESUME_FROM_INDEX``."""
    all_files = _collect_image_paths(IMAGE_ROOT, SUBFOLDERS)
    generate_reranking(all_files[RESUME_FROM_INDEX:], OUTPUT_FILE)


# Guarded so that importing this module never starts a (billable) API run.
if __name__ == "__main__":
    main()