| import json |
| import os |
| import base64 |
| from time import sleep |
| from tqdm import tqdm |
| import openai |
|
|
| |
# SECURITY: the API key must never be committed to source control. It is read
# from the OPENAI_API_KEY environment variable instead. The key that was
# previously hard-coded on this line has been exposed and should be revoked
# and rotated.
openai.api_key = os.environ.get("OPENAI_API_KEY")
|
|
def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as base64 text.

    The file is read in binary mode in one go, base64-encoded, and the
    resulting bytes are decoded to a ``str`` using UTF-8.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")
|
|
| def _build_annotation_prompt(): |
| """ |
| Returns a compact, deterministic prompt with the exact questions and options |
| used by the GUI tool, and asks for strict JSON output. |
| """ |
| |
| return ( |
| "You are an expert at analyzing a single image of a line of people. " |
| "Answer the following 17 questions STRICTLY as a single JSON object. " |
| "Use the exact keys provided, and for multiple-choice fields choose ONE " |
| "of the listed options verbatim. If something is not visible, pick the most appropriate option (e.g., 'N/A').\n\n" |
| "Return ONLY JSON. No prose.\n\n" |
| "Definitions:\n" |
| "- **Start of the line (front):** the person closest to the counter or service point. " |
| "This is usually the direction the line is facing towards.\n" |
| "- **End of the line (back):** the person farthest from the counter or service point. " |
| "This is usually the last person to join the line.\n\n" |
| "Fields:\n" |
| '1) "number_of_turns": integer\n' |
| '2) "line_shape": one of ["Straight","Curved","S-shaped","Angled","other"]\n' |
| '3) "line_facing_direction": one of ["Facing towards","Facing away","Facing sideways","other"]\n' |
| '4) "number_of_people_in_line": integer\n' |
| '5) "line_purpose": string (short phrase)\n' |
| '6) "start_person_description": string (brief)\n' |
| '7) "end_person_description": string (brief)\n' |
| '8) "counter_person_description": string (brief)\n' |
| '9) "boundary_present": one of ["yes","no"]\n' |
| '10) "boundary_types": one of ["none","cones","rope dividers","stanchions","other"]\n' |
| '11) "end_of_line_visible": one of ["yes","no"]\n' |
| '12) "end_of_line_location_if_visible": one of ["far left","center left","center","center right","far right","N/A"]\n' |
| '13) "direction_to_turn_to_see_end_if_not_visible": one of ["left","right","back","N/A"]\n' |
| '14) "start_of_line_visible": one of ["yes","no"]\n' |
| '15) "start_of_line_location_if_visible": one of ["far left","center left","center","center right","far right","N/A"]\n' |
| '16) "direction_to_turn_to_see_start_if_not_visible": one of ["left","right","back","N/A"]\n' |
| '17) "line_completeness": one of ["full","partial"]\n\n' |
| "JSON schema example (values are placeholders):\n" |
| "{\n" |
| ' "number_of_turns": 0,\n' |
| ' "line_shape": "Straight",\n' |
| ' "line_facing_direction": "Facing towards",\n' |
| ' "number_of_people_in_line": 16,\n' |
| ' "line_purpose": "airport",\n' |
| ' "start_person_description": "man wearing hat and blue shirt",\n' |
| ' "end_person_description": "person wearing black t-shirt",\n' |
| ' "counter_person_description": "unknown",\n' |
| ' "boundary_present": "true",\n' |
| ' "boundary_types": "rope dividers",\n' |
| ' "end_of_line_visible": "yes",\n' |
| ' "end_of_line_location_if_visible": "far left",\n' |
| ' "direction_to_turn_to_see_end_if_not_visible": "N/A",\n' |
| ' "start_of_line_visible": "no",\n' |
| ' "start_of_line_location_if_visible": "N/A",\n' |
| ' "direction_to_turn_to_see_start_if_not_visible": "right",\n' |
| ' "line_completeness": "partial"\n' |
| "}" |
| ) |
|
|
def generate_reranking(image_paths, res_file_name, temperature=0.2):
    """Ask the annotation questions about each image and append the answers.

    For every path in ``image_paths`` the image is base64-encoded and sent,
    together with the prompt from ``_build_annotation_prompt()``, to the
    "gpt-4o" chat-completion endpoint. The reply is appended to
    ``res_file_name`` as one record per image: the image's base name on one
    line, the reply on the next, then a blank line. If the reply parses as
    JSON it is re-serialized compactly; otherwise the raw reply text is
    written unchanged.

    Args:
        image_paths (List[str]): absolute or relative image paths.
        res_file_name (str): output text file, opened in append mode.
        temperature (float): sampling temperature passed to the API.

    Raises:
        OSError: if an image or the output file cannot be opened.
        Any exception raised by the OpenAI client is propagated; records
        written before the failure are already flushed to disk.
    """
    # Local import: only this function needs it, so the file-level import
    # block does not have to change.
    import mimetypes

    prompt = _build_annotation_prompt()
    system_text = (
        "You analyze a SINGLE image and return ONLY valid JSON "
        "with the specified keys and enumerated options. Do not include any extra text."
    )

    with open(res_file_name, "a", encoding="utf-8") as f:
        for img_path in tqdm(image_paths):
            basename = os.path.basename(img_path)

            # Declare the real media type in the data URL instead of always
            # claiming PNG; fall back to PNG when the extension is unknown
            # or does not map to an image type.
            mime_type, _ = mimetypes.guess_type(img_path)
            if mime_type is None or not mime_type.startswith("image/"):
                mime_type = "image/png"

            img_b64 = encode_image(img_path)
            print(img_path)
            response = openai.ChatCompletion.create(
                model="gpt-4o",
                messages=[
                    {
                        "role": "system",
                        "content": [{"type": "text", "text": system_text}],
                    },
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": prompt},
                            {
                                "type": "image_url",
                                "image_url": {"url": f"data:{mime_type};base64,{img_b64}"},
                            },
                        ],
                    },
                ],
                max_tokens=1000,
                temperature=temperature,
            )

            # The message content may be None; treat that as an empty reply
            # rather than crashing on .strip().
            content = (response.choices[0].message.content or "").strip()

            # Normalize valid JSON to a single compact line; keep the raw
            # text when the reply is not valid JSON so nothing is lost.
            try:
                parsed = json.loads(content)
            except json.JSONDecodeError:
                record = content
            else:
                record = json.dumps(parsed, ensure_ascii=False)

            f.write(basename + "\n" + record + "\n\n")
            # Flush per record so an interrupted run keeps finished results.
            f.flush()
            sleep(0.5)
|
|
# Dataset root directory and the category folders scanned beneath it.
root = "/vast/ds5725/linefinder/LineFinder/Images"
subfolders = ["QueuesInAirports", "QueuesInSupermarkets", "QueuesInBanks", "ImagesOnline"]

# Walk each category folder recursively and collect every file found (no
# extension filtering) as an absolute path, in sorted order.
all_files = sorted(
    os.path.abspath(os.path.join(current_dir, file_name))
    for category in subfolders
    for current_dir, _, file_names in os.walk(os.path.join(root, category))
    for file_name in file_names
)

# Annotate everything from index 63 onward of the sorted list.
# NOTE(review): the offset presumably skips files handled by an earlier run -
# confirm before re-running.
# NOTE(review): despite the ".jsonl" name, generate_reranking writes
# "<basename>, <reply>, blank line" records, not one JSON object per line.
generate_reranking(all_files[63:], "gpt_line_test.jsonl")