|
|
import os |
|
|
import re |
|
|
import json |
|
|
import base64 |
|
|
import copy |
|
|
from io import BytesIO |
|
|
from PIL import Image |
|
|
import traceback |
|
|
import concurrent.futures |
|
|
|
|
|
def encode_image_from_pil(pil_image, max_size=256, quality=85):
    """Downscale a PIL image proportionally and return it as Base64 JPEG text.

    The image is only ever shrunk: when its longest side already fits within
    ``max_size`` it is encoded at its original resolution.

    Args:
        pil_image (Image): PIL image object to encode.
        max_size (int): Maximum length in pixels of the longest side after
            scaling (default 256).
        quality (int): JPEG compression quality (1-95, default 85).

    Returns:
        str: Base64-encoded JPEG data, or None if processing failed (the
        error is printed rather than raised).
    """
    try:
        img = pil_image

        # The JPEG writer rejects modes carrying alpha or a palette (RGBA,
        # LA, P, ...). Normalise to RGB up front so such images (typically
        # PNGs) are encoded instead of being dropped by the handler below.
        if img.mode not in ("RGB", "L"):
            img = img.convert("RGB")

        width, height = img.width, img.height
        if max(width, height) > max_size:
            ratio = min(max_size / width, max_size / height)
            # Clamp to 1 px: int() truncation can yield 0 on the short side
            # of a very elongated image, which is not a valid size.
            new_size = (
                max(1, int(width * ratio)),
                max(1, int(height * ratio)),
            )
            img = img.resize(new_size)

        buffered = BytesIO()
        img.save(buffered, format="JPEG", quality=quality)
        return base64.b64encode(buffered.getvalue()).decode('utf-8')

    except Exception as e:
        # Best-effort by design: callers treat None as "skip this image".
        print(f"Image processing failed: {str(e)}")
        return None
|
|
|
|
|
|
|
|
system_prompt = """You are an expert Vision-Language Model assistant specializing in transforming explicit questions about images into sophisticated, implicit instructions. Your goal is to reframe direct queries into nuanced directives that guide a user or another AI to the same answer through observation and inference, rather than by a direct ask.\n\nYour Task:\nGiven an image and a corresponding explicit question-answer (QA) pair, your task is to:\n1. Convert the explicit question into a compelling implicit instruction that leverages the image content and guides the user to deduce the information sought by the original question.\n2. Classify the reframed question into one of five analytical dimensions (Spatial Relationships, Visual Attributes, Functional Context, Logical Reasoning, Semantic Connections) based on its core intent.\n\nMethodology: Reconstructing the Query Using Analytical Dimensions\nTo create the implicit instruction, you will:\n1. Actively draw insights from the provided image.\n2. Utilize one or more of the five analytical dimensions to adjust the original question, add contextual information, and reconstruct its intent into an implicit directive.\n3. Assign a classification to the reframed question based on which dimension(s) it primarily engages (e.g., questions about object positioning vs. symbolic meaning).\n4. The implicit instruction should make the answer feel like a natural consequence of observation or analysis tied to the classification.\n\n1. Spatial Relationships – Used to add spatial descriptive attributes\n1.1. 
Positional Arrangements\nOriginal Question Type (Location/Identification): What is the fountain in the middle of the photo like?\nImplicit Instruction (using positional arrangement description): Among the series of landscape structures in the square, please describe in detail the facility that is neither located at the very edge nor immediately adjacent to buildings, but is roughly in the geometric center area and jets water upwards.\n1.2. Directional Orientation\nOriginal Question Type (Location/Identification): Find the person in the picture who has their back to us.\nImplicit Instruction (using directional orientation description): Among the multiple individuals in the scene, please identify and describe the person whose main body part (especially the face) is oriented roughly opposite to the observer's line of sight.\n\n2. Visual Attributes – Used to add visual descriptive attributes\n2.1. Color\nOriginal Question Type (Location/Identification): How is the green apple in the basket? (Assuming there are red and green apples)\nImplicit Instruction (using color attribute description): In the container holding various fruits, please focus on the spherical fruits whose skin presents a hue similar to leaves or unripe bananas, and describe the condition of one of them.\n2.2. Shape\nOriginal Question Type (Location/Identification): Find the square cushion.\nImplicit Instruction (using shape attribute description): Among the multiple cushions on the sofa, please point out the fabric item used for comfortable leaning that has an outer contour with four roughly equal sides and internal angles close to right angles.\n2.3. Size\nOriginal Question Type (Location/Identification): What is the tallest book in that pile of books?\nImplicit Instruction (using size attribute description): Among the several books stacked together, please identify the printed material that significantly surpasses all other books in the vertical dimension, and describe its cover.\n2.4. 
Material Properties\nOriginal Question Type (Location/Identification): Which sculpture is made of stone?\nImplicit Instruction (using material attribute description): Among the multiple displayed artistic forms, please identify the work whose surface presents natural rock texture, feels cold to the touch, has a hard texture, and typically possesses a certain sense of weight.\n\n3. Functional Context – Used to add functional or behavioral descriptive attributes\n3.1. Item Purposes\nOriginal Question Type (Location/Identification): Find the knife used for cutting vegetables in the kitchen.\nImplicit Instruction (using item purpose description): Among the various utensils in the kitchen, please identify the hand-held metal tool that typically has a sharp single or double-edged blade and is designed for segmenting or slicing food ingredients.\n3.2. Human Actions\nOriginal Question Type (Location/Identification): Which waiter is wiping the table?\nImplicit Instruction (using human action description): Among the staff in the restaurant, please locate the employee who is currently holding a cloth or cleaning supplies and whose upper body and arms are repeatedly performing a wiping motion on a flat surface.\n3.3. Environmental Conditions\nOriginal Question Type (Location/Identification): Point out the rabbit in the snow. (Assuming a complex background where the environment helps)\nImplicit Instruction (using environmental condition description): On the vast ground covered with white ice crystals, carefully search for and point out the small mammal that contrasts with the surrounding snowy white environment, which it might be using for camouflage.\n\n4. Logical Reasoning – Used to add logical judgment attributes\n4.1. Quantity Comparisons\nOriginal Question Type (Location/Identification): Which team has the fewest people? 
(Assuming there are three teams)\nImplicit Instruction (using quantity comparison description for differentiation/location): Separately count the members of each clearly distinguishable group of people in the frame, and then indicate the group whose total number of members is at the lowest level in comparison.\n4.2. Conditional Evaluations\nOriginal Question Type (Location/Identification): If I were going out, which umbrella should I take? (Assuming one good umbrella and one broken one)\nImplicit Instruction (using conditional evaluation description): Examine all available rain gear in the image, assess their respective conditions and functionality, and select the umbrella that would provide reliable shelter in the event of wet weather and currently has no obvious damage.\n4.3. Causal Relationships\nOriginal Question Type (Location/Identification): Which child knocked over the milk? (Assuming spilled milk and several children nearby)\nImplicit Instruction (using causal relationship description): Observe the location where the milky white liquid was spilled and the position of its overturned container, and considering the positions of nearby children, their expressions, or any traces on their hands, infer the child most likely responsible for this accident.\n\n5. Semantic Connections – Used to add symbolic or emotional descriptive attributes\n5.1. Cultural Metaphors\nOriginal Question Type (Location/Identification): Find the traditional Chinese painting that features bamboo.\nImplicit Instruction (using cultural metaphor description): Among several traditional Eastern paintings, please identify the artwork whose main subject is a plant characterized by its hollow, segmented stems, often used in a specific culture as a symbol of gentlemanly qualities (such as integrity, modesty, and resilience).\n5.2. 
Emotional Expressions\nOriginal Question Type (Location/Identification): Which member of the audience looks the most disappointed?\nImplicit Instruction (using emotional expression description): Among the people watching the game or performance, find the individual whose facial expression (e.g., downturned mouth, dull eyes, furrowed brow) and body posture most clearly convey negative emotions, such as unmet expectations or dissatisfaction.\n5.3. Symbolic Meanings\nOriginal Question Type (Location/Identification): Identify the decoration on top of the wedding cake.\nImplicit Instruction (using symbolic meaning description): Observe the multi-tiered celebration cake and locate the small decorative object placed at its very top, which typically carries auspicious meanings (such as figures of the newlyweds, symbols of love, a shared future, etc.) and serves as a finishing touch for the overall ceremony.\n\nInput Format You Will Receive:\nImage (required)\nOriginal Question (required)\nAnswer/Context (optional): May include partial or no contextual constraints\n\nOutput Format You Should Generate:\nParse the reframed question and its classification into JSON format. Follow this structure:\n\n{\n \"question\": \"[Implicit instruction guiding observation/inference]\",\n \"classification\": \"[One of: Spatial Relationships, Visual Attributes, Functional Context, Logical Reasoning, Semantic Connections]\"\n}\nExample Output:\n{\n \"question\": \"Observe the multi-tiered celebration cake and locate the small decorative object placed at its very top, which typically carries auspicious meanings (such as figures of the newlyweds, symbols of love, a shared future, etc.) and serves as a finishing touch for the overall ceremony.\",\n \"classification\": \"Semantic Connections\"\n}""" |
|
|
|
|
|
# Skeleton of a single batch request. process_single_file deep-copies it and
# fills in "custom_id" and the user message content for each QA pair.
base_template = {
    "custom_id": None,
    "method": "POST",
    "url": "/v1/chat/completions",
}
# Key order is kept the same as the serialized request layout.
base_template["body"] = dict(
    model="qwen-vl-max-latest",
    messages=[
        dict(role="system", content=system_prompt),
        # Placeholder; replaced with the text + image parts per request.
        dict(role="user", content=None),
    ],
    temperature=0.1,
    response_format=dict(type="json_object"),
    max_tokens=200,
)
|
|
|
|
|
def process_single_file(json_file, dataset_dir):
    """Build one batch request per QA pair found in a dataset JSON file.

    The file must contain a list of dict items. Each item is read for:
    ``media_paths`` (image path, joined onto ``dataset_dir``), ``question``
    (non-empty list of strings), and optionally ``answer`` (indexed in step
    with ``question``) and ``options`` (list of dicts with ``id``/``text``).
    Malformed items or QA pairs are reported on stdout and skipped.

    Args:
        json_file (str): File name of the JSON file inside ``dataset_dir``.
        dataset_dir (str): Directory holding the JSON file and its images.

    Returns:
        list: Request dicts (deep copies of ``base_template``) with
        ``custom_id`` set to ``"<file stem>-<item index>-<question index>"``.
        Empty when the file cannot be loaded or is not a list.
    """
    print(f"\nProcessing JSON file: {json_file}")
    json_path = os.path.join(dataset_dir, json_file)
    file_requests = []

    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Failed to load {json_path}: {str(e)}")
        return file_requests

    if not isinstance(data, list):
        print(f"Invalid data format in {json_file}, expected list")
        return file_requests

    # File name without its extension, used as the custom_id prefix.
    file_stem = os.path.splitext(json_file)[0]

    for item_idx, item in enumerate(data):
        try:
            if not isinstance(item, dict):
                print(f"Invalid item format at index {item_idx} in {json_file}")
                continue

            img_path = os.path.normpath(
                os.path.join(dataset_dir, item['media_paths'])
            )
            if not os.path.exists(img_path):
                print(f"Image file not found: {img_path}")
                continue

            try:
                # Context manager releases the file handle once encoded;
                # the image is encoded once and reused for every question.
                with Image.open(img_path) as image:
                    base64_img = encode_image_from_pil(image)
            except (IOError, OSError) as e:
                print(f"Failed to process image {img_path}: {str(e)}")
                continue
            if not base64_img:
                continue

            questions = item.get('question', [])
            answers = item.get('answer', [])
            options = item.get('options', [])

            if not isinstance(questions, list) or len(questions) == 0:
                print(f"Invalid questions in item {item_idx} of {json_file}")
                continue

            # Map option id -> option text so answers given as option ids
            # can be expanded into their text.
            option_map = {}
            if options and isinstance(options, list):
                try:
                    option_map = {
                        opt['id']: opt['text']
                        for opt in options
                        # Non-dict entries are ignored instead of aborting
                        # the whole item.
                        if isinstance(opt, dict)
                        and 'id' in opt and 'text' in opt
                    }
                except (KeyError, TypeError) as e:
                    print(f"Invalid option format in {json_file} item {item_idx}: {str(e)}")

            for qa_idx, q_text in enumerate(questions):
                try:
                    if not isinstance(q_text, str):
                        print(f"Invalid question format at index {qa_idx} in {json_file} item {item_idx}")
                        continue

                    answer_text = ""
                    if qa_idx < len(answers):
                        original_answer = answers[qa_idx]
                        try:
                            if len(option_map) >= 1:
                                # First element is looked up as an option id;
                                # unknown ids fall back to the raw value.
                                answer_text = option_map.get(original_answer[0], original_answer[0])
                            else:
                                answer_text = original_answer if isinstance(original_answer, str) else ""
                            # Drop bracketed segments such as "[...]".
                            # NOTE(review): the source's indentation was lost;
                            # this cleanup is assumed to apply to both
                            # branches above - confirm.
                            answer_text = re.sub(r'\s*\[.*?\]', '', answer_text).strip()
                        except (TypeError, IndexError) as e:
                            print(f"Answer processing error: {str(e)}")

                    text_content = f"question: {q_text} answer: {answer_text}" if answer_text else f"question: {q_text}"
                    user_content = [
                        {"type": "text", "text": text_content},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_img}"},
                        },
                    ]

                    # Deep copy so the shared template is never mutated.
                    request = copy.deepcopy(base_template)
                    request['custom_id'] = f"{file_stem}-{item_idx}-{qa_idx}"
                    request['body']['messages'][1]['content'] = user_content

                    file_requests.append(request)
                except Exception as e:
                    print(f"Error processing QA pair {qa_idx} in {json_file} item {item_idx}: {str(e)}")
                    traceback.print_exc()

        except Exception as e:
            # Best-effort: one bad item must not abort the rest of the file.
            print(f"Error processing item {item_idx} in {json_file}: {str(e)}")
            continue

    return file_requests
|
|
|
|
|
def process_dataset(dataset_dir):
    """Collect batch requests from every ``*.json`` file in a directory.

    Files are handed to ``process_single_file`` on a thread pool. Results
    are gathered in sorted file-name order so repeated runs over the same
    data produce the requests in the same order.

    Args:
        dataset_dir (str): Directory containing the dataset JSON files.

    Returns:
        list: All requests from all files; empty when ``dataset_dir`` is not
        an existing directory or contains no JSON files. A file whose
        processing raises is reported on stdout and contributes nothing.
    """
    batch_requests = []

    # isdir rather than exists: a regular file would make listdir raise.
    if not os.path.isdir(dataset_dir):
        print(f"Error: Dataset directory {dataset_dir} does not exist")
        return batch_requests

    json_files = sorted(
        f for f in os.listdir(dataset_dir) if f.endswith('.json')
    )

    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
        submitted = [
            (json_file, executor.submit(process_single_file, json_file, dataset_dir))
            for json_file in json_files
        ]

        # Waiting in submission order keeps the output deterministic; the
        # work itself still runs concurrently.
        for json_file, future in submitted:
            try:
                batch_requests.extend(future.result())
            except Exception as e:
                print(f"Error processing file {json_file}: {str(e)}")
                traceback.print_exc()

    return batch_requests
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build requests for the whole dataset and write
    # them out as JSON Lines (one request object per line).
    dataset_directory = "/mnt/data/users/zys/proj/vlm_reasoning/dataset"
    output_path = "/mnt/data/users/zys/proj/vlm_reasoning/request/vlm_batch_requests.jsonl"
    try:
        batch_requests = process_dataset(dataset_directory)

        # Create the target directory up front so a missing folder does not
        # discard the work already done.
        os.makedirs(os.path.dirname(output_path), exist_ok=True)

        written = 0
        # Explicit UTF-8: ensure_ascii=False emits raw non-ASCII characters,
        # which the platform default encoding may be unable to encode.
        with open(output_path, 'w', encoding='utf-8') as f:
            for req in batch_requests:
                try:
                    # Serialize fully before writing so a failure never
                    # leaves a partial line in the file.
                    line = json.dumps(req, ensure_ascii=False)
                    f.write(line + '\n')
                except (TypeError, IOError) as e:
                    print(f"Failed to write request: {str(e)}")
                    continue
                written += 1

        # Report what was actually written, not merely what was built.
        print(f"Successfully generated {written} requests")
    except Exception as e:
        print(f"Critical error occurred: {str(e)}")
        traceback.print_exc()