"""Build a JSONL file of batch chat-completion requests from QA datasets.

Every ``*.json`` file in the dataset directory is expected to hold a list of
items.  Each item may carry ``question`` (list of strings), ``answer``
(per-question answers), ``options`` (list of ``{"id", "text"}`` dicts) and
``description`` (string).  One request is produced per valid question.
"""

import os
import json
import copy
import concurrent.futures
import traceback

# NOTE: the physical line breaks inside this literal are part of the prompt
# text (in addition to the explicit ``\n`` escapes); do not re-wrap it.
system_prompt = """You are a professional textual question-answering analyst. Your expertise lies in transforming explicit textual queries, which may have originally been associated with images, into sophisticated, implicit instructions using only the provided text. Your goal is to reframe direct questions into nuanced directives that guide a user or another AI to the same answer through logical deduction, contextual understanding, or knowledge application based *solely on the textual information available*, rather than by a direct ask.\n\n**Important Context:** You must be aware that the `Original Question` you receive likely originated from a dataset where it was paired with an image. However, for your task, **you will NOT receive the image**. The explicit question might therefore contain implicit references to visual elements (e.g., \"the object on the left\", \"the color of the car\") that are underspecified in the text alone.\n\nYour Task:\nGiven an explicit textual question (and potentially its answer or relevant context), originating from an image-question pair but provided to you without the image, your task is to:\n1. Analyze the `Original Question` and any provided `Answer/Context` to understand the core informational intent, acknowledging potential reliance on missing visual details.\n2. Convert the explicit question into a compelling implicit instruction. 
This instruction should:\n * Focus on aspects of the query solvable using the **provided text** and general knowledge.\n * Leverage **all details** available in the `Answer/Context` field, as this might contain crucial information originally derived from the image.\n * Reformulate or abstract parts of the question that heavily relied on specific visual cues into more general or conceptual descriptions, if possible, using the analytical dimensions.\n * **Avoid** creating instructions that fundamentally require visual inspection of an image that is not present. If a question is purely visual and cannot be meaningfully reformulated textually (e.g., \"What specific shade of blue is the sky in the top right corner?\"), aim for the most reasonable textual abstraction or focus on other extractable information.\n3. Classify the ​​reframed instruction​​ into one of five analytical dimensions (Structural Property Enhancement, Spatial-Logical Relationship Modeling, Domain Knowledge Integration, Multimodal Reasoning Pathways, Semantic Context Reconstruction) based on the primary analytical approach used in the **textual reformulation**.\n\nMethodology: Reconstructing the Query Using Analytical Dimensions (Text-Only Adaptation)\nTo create the implicit instruction from the text-only input:\n1. Actively draw insights *only* from the provided `Original Question` and `Answer/Context`.\n2. Utilize one or more of the five analytical dimensions below to adjust the question, add textual constraints or domain knowledge, model relationships described textually, or reconstruct semantic meaning based on the available words.\n3. Assign a ​​classification​​ based on how the *textual* information was primarily restructured or analyzed.\n4. 
The implicit instruction should make the intended answer feel like a natural consequence of analyzing the **provided text** and applying relevant knowledge.\n\nAnalytical Dimensions & Examples (Applied to Textual Input, potentially abstracting original visual cues):\n\n1. ​​Structural Property Enhancement​​ - Add descriptive structural or quantitative attributes *mentioned or implied* in the text.\n 1.1 ​​Physical Properties​​ (Text-based)\n Original question (from image context, maybe): \"Describe the benzene ring shown.\" -> (Text input only): \"Describe a benzene ring structure.\"\n Implicit instruction: \"Describe the key structural features of an aromatic hydrocarbon molecule known for its planar regular hexagonal symmetry, focusing on its carbon-carbon bonding and associated hydrogen atoms based on standard chemical representation.\" (Focus shifts to general knowledge)\n 1.2 ​​Quantitative Features​​ (Text-based)\n Original question (from image context): \"Count the runways.\" -> (Text input only): \"Count the runways.\" (Potentially with context: \"Context: The image depicts a large international airport.\")\n Implicit instruction: \"Based on the context of a large international airport, enumerate the typical number of parallel, elongated structures designed for aircraft takeoff/landing that meet high-capacity standards (e.g., length > 3000m), assuming standard configurations if specifics aren't provided.\" (Uses context and general knowledge)\n\n2. 
​​Spatial-Logical Relationship Modeling​​ - Model relationships *described or implied* textually.\n 2.1 ​​Hierarchical Structures​​ (Text-based)\n Original question (from image/text): \"What skin disease based on these microscopic findings?\" -> (Text input only): \"What skin disease?\" (Context: \"Hyperkeratosis in epidermis, lymphocyte infiltration in dermis.\")\n Implicit instruction: \"Given the pathological findings described as 'hyperkeratosis in the epidermal layer' and 'lymphocyte infiltration in the dermal layer', determine the most probable dermatological diagnosis by correlating these layer-specific abnormalities.\" (Uses provided textual context directly)\n 2.2 ​​Spatial Topology​​ (Conceptual/Mathematical)\n Original question: \"Number of common tangents for these two circles.\" -> (Text input only): \"Number of common tangents when two circles are tangent.\"\n Implicit instruction: \"In a conceptual geometric scenario where two circles are defined as being externally tangent, determine the total count of lines that can be drawn tangent to both circles simultaneously, based on the properties of this specific topological arrangement.\" (Focuses on the defined geometric condition)\n\n3. 
​​Domain Knowledge Integration​​ - Infuse domain knowledge relevant to the *textual topic*.\n 3.1 ​​Domain-Specific Characteristics​​ (Text-based)\n Original question (from image): \"Identify the oil storage tanks.\" -> (Text input only): \"Identify the oil storage tanks.\" (Context: \"Area contains large, circular metal structures.\")\n Implicit instruction: \"Based on the description of 'large, circular metal structures' often found in industrial areas, and applying typical characteristics known from domains like remote sensing or industrial engineering (e.g., large diameter, specific roof types, association with pipelines), infer the likely function of these structures as potential storage units, possibly for petroleum products.\" (Connects text description to domain knowledge)\n 3.2 ​​Mathematical Constraints​​\n Original question: \"Prove the triangle angle sum theorem.\" (Text only)\n Implicit instruction: \"Utilizing the axioms and postulates of Euclidean geometry, particularly the properties of parallel lines and transversal intersections, construct a logical argument demonstrating that the sum of the interior angles of any planar triangle invariably equals 180 degrees.\" (Purely conceptual/textual)\n\n4. ​​Multimodal Reasoning Pathways​​ (Text-based: Combining textual info/logic)\n 4.1 ​​Exclusion Logic​​ (Text-based)\n Original question (from image/symptoms): \"Which vitamin deficiency?\" -> (Text input only): \"Which vitamin deficiency?\" (Context: \"Symptoms: follicular hyperkeratosis, nyctalopia. 
Patient gets ample sunlight.\")\n Implicit instruction: \"Considering the presented symptoms 'follicular hyperkeratosis' and 'nyctalopia', and given the contextual information ruling out insufficient light exposure (a common factor for Vitamin D issues), deduce the most likely fat-soluble vitamin deficiency responsible for this specific combination of clinical signs.\" (Uses text symptoms + exclusion context)\n 4.2 ​​Data Association​​ (Text-based)\n Original question: \"What is the molecular weight of sodium chloride?\" (Text only)\n Implicit instruction: \"Accessing standard atomic weight data, calculate the sum corresponding to the chemical formula NaCl, reflecting the one-to-one ionic ratio between sodium (Na, element 11) and chlorine (Cl, element 17).\" (Associates name/formula with data lookup and calculation rule)\n\n5. ​​Semantic Context Reconstruction​​ - Leverage functional descriptions or context *provided in text*.\n 5.1 ​​Functional Descriptions​​ (Text-based)\n Original question (from image): \"Name this apparatus.\" -> (Text input only): \"Name the apparatus.\" (Context: \"Used for collecting fractions during distillation.\")\n Implicit instruction: \"Identify the standard laboratory glassware term for a conical-shaped vessel specifically employed during atmospheric distillation processes to receive liquids condensing at different boiling points.\" (Uses functional context provided textually)\n 5.2 ​​Anomaly Detection​​ (Based on *described* features)\n Original question (from image): \"Is this illegal mining?\" -> (Text input only): \"Is this illegal mining?\" (Context: \"Vegetated area shows regular bare patches and new roads.\")\n Implicit instruction: \"Evaluate the provided description of a land area – 'geometrically regular bare areas appearing within a vegetation zone, accompanied by traces of transport roads' – against typical indicators of unauthorized resource extraction activities to determine if it signifies potential illegal mining.\" 
(Focuses on interpreting the textual description)\n\nInput Format You Will Receive:\n* ​​Original Question​​ (required, text - potentially referencing unseen visual context)\n* ​​Answer/Context​​ (optional, text - crucial for potentially bridging the visual gap)\n* *Note: Although the original data source included images, your input for this task consists solely of the text components.*\n\nOutput Format You Should Generate:\nParse the reframed ​​instruction​​ and its ​​classification​​ into JSON format. Follow this structure:\n\n{\n \"question\": \"[Implicit instruction focused on textual analysis, abstracting visual reliance]\",\n \"classification\": \"[One of: Structural Property Enhancement, Spatial-Logical Relationship Modeling, Domain Knowledge Integration, Multimodal Reasoning Pathways, Semantic Context Reconstruction]\"\n}\n\n​​Example Output (using the distillation apparatus example):​​\n{\n \"question\": \"Identify the standard laboratory glassware term for a conical-shaped vessel specifically employed during atmospheric distillation processes to receive liquids condensing at different boiling points.\",\n \"classification\": \"Semantic Context Reconstruction\"\n}"""

# Skeleton of one batch request; ``custom_id`` and the user message content
# are filled in per question on a deep copy, so this dict is never mutated.
base_template = {
    "custom_id": None,
    "method": "POST",
    "url": "/v1/chat/completions",
    "body": {
        "model": "qwen-plus-latest",
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": None},
        ],
        "temperature": 0.5,
        "response_format": {"type": "json_object"},
        "max_tokens": 200,
    },
}


def _build_option_map(options):
    """Return an ``id -> text`` mapping built from well-formed option entries.

    Entries that are not dicts, lack ``id``/``text``, or have an unhashable
    ``id`` are skipped so that one malformed option cannot discard the item.
    """
    option_map = {}
    if not isinstance(options, list):
        return option_map
    for opt in options:
        if not (isinstance(opt, dict) and 'id' in opt and 'text' in opt):
            continue
        try:
            option_map[opt['id']] = opt['text']
        except TypeError:
            # Unhashable id (e.g. a list) cannot be used as a lookup key.
            continue
    return option_map


def _resolve_answer(original_answer, option_map):
    """Turn a raw answer into the text that is appended to the request.

    Without options, only string answers are used.  With options, the whole
    answer is tried as an option id first, then its first element (first
    character of a string, first entry of a list).  A string answer that
    matches no option is kept in full.  Returns ``""`` when the answer cannot
    be interpreted.
    """
    if not option_map:
        return original_answer if isinstance(original_answer, str) else ""
    try:
        if isinstance(original_answer, str) and original_answer in option_map:
            return option_map[original_answer]
        first = original_answer[0]
        fallback = original_answer if isinstance(original_answer, str) else first
        return option_map.get(first, fallback)
    except (TypeError, IndexError, KeyError) as e:
        print(f"Answer processing error: {str(e)}")
        return ""


def process_single_file(json_file, dataset_dir):
    """Convert one dataset JSON file into a list of batch request dicts.

    Args:
        json_file: File name (not path) of the JSON file inside ``dataset_dir``.
        dataset_dir: Directory that contains ``json_file``.

    Returns:
        A list of request dicts (deep copies of ``base_template``), one per
        valid question.  ``custom_id`` is ``"<file stem>-<item idx>-<qa idx>"``.
        An empty list is returned when the file cannot be read or parsed, or
        when its top-level value is not a list.  Problems with individual
        items or questions are printed and skipped.
    """
    print(f"\nProcessing JSON file: {json_file}")
    json_path = os.path.join(dataset_dir, json_file)
    file_requests = []

    try:
        with open(json_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"Failed to load {json_path}: {str(e)}")
        return file_requests

    if not isinstance(data, list):
        print(f"Invalid data format in {json_file}, expected list")
        return file_requests

    file_stem = os.path.splitext(json_file)[0]

    for item_idx, item in enumerate(data):
        try:
            if not isinstance(item, dict):
                print(f"Invalid item format at index {item_idx} in {json_file}")
                continue

            questions = item.get('question', [])
            answers = item.get('answer', [])
            description = item.get('description', '')

            # Only descriptions of at least three words are used as a prefix.
            valid_description = ""
            if isinstance(description, str) and len(description.split()) >= 3:
                valid_description = description.strip() + " "

            if not isinstance(questions, list) or not questions:
                print(f"Invalid questions in item {item_idx} of {json_file}")
                continue

            # A missing/null/non-sequence answer field means "no answers"
            # rather than an error for every question of the item.
            if not isinstance(answers, (list, tuple, str)):
                answers = []

            option_map = _build_option_map(item.get('options', []))

            for qa_idx, original_q in enumerate(questions):
                try:
                    if not isinstance(original_q, str):
                        print(f"Invalid question format at index {qa_idx} in {json_file} item {item_idx}")
                        continue

                    q_text = valid_description + original_q

                    answer_text = ""
                    if qa_idx < len(answers):
                        answer_text = _resolve_answer(answers[qa_idx], option_map)

                    # Build the plain-text user message.
                    text_content = f"question:{q_text}"
                    if answer_text:
                        text_content += f"\nanswer:{answer_text}"

                    request = copy.deepcopy(base_template)
                    request['custom_id'] = f"{file_stem}-{item_idx}-{qa_idx}"
                    request['body']['messages'][1]['content'] = text_content
                    file_requests.append(request)
                except Exception as e:
                    # Best-effort boundary: one bad QA pair must not stop the file.
                    print(f"Error processing QA pair {qa_idx} in {json_file} item {item_idx}: {str(e)}")
                    traceback.print_exc()
        except Exception as e:
            # Best-effort boundary: one bad item must not stop the file.
            print(f"Error processing item {item_idx} in {json_file}: {str(e)}")
            continue

    return file_requests


def process_dataset(dataset_dir):
    """Build requests for every ``*.json`` file found directly in ``dataset_dir``.

    Files are processed in parallel, but the results are concatenated in
    sorted file-name order so the output is reproducible between runs.

    Args:
        dataset_dir: Directory to scan (non-recursive).

    Returns:
        The combined list of request dicts; empty if the directory does not
        exist.  A file whose processing raises is reported and skipped.
    """
    batch_requests = []
    if not os.path.isdir(dataset_dir):
        print(f"Error: Dataset directory {dataset_dir} does not exist")
        return batch_requests

    json_files = sorted(f for f in os.listdir(dataset_dir) if f.endswith('.json'))

    # Keep the parallel processing for throughput; collect in submission order.
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
        submitted = [
            (json_file, executor.submit(process_single_file, json_file, dataset_dir))
            for json_file in json_files
        ]
        for json_file, future in submitted:
            try:
                batch_requests.extend(future.result())
            except Exception as e:
                print(f"Error processing file {json_file}: {str(e)}")
                traceback.print_exc()

    return batch_requests


if __name__ == "__main__":
    dataset_directory = "/mnt/data/users/zys/proj/vlm_reasoning/load"
    try:
        batch_requests = process_dataset(dataset_directory)
        written = 0
        # Explicit UTF-8: lines are dumped with ensure_ascii=False, so the
        # platform default encoding may be unable to represent them.
        with open("/mnt/data/users/zys/proj/vlm_reasoning/request/llm_batch_requests.jsonl", 'w', encoding='utf-8') as f:
            for req in batch_requests:
                try:
                    # Serialise first so a failure never leaves a partial line.
                    line = json.dumps(req, ensure_ascii=False)
                    f.write(line + '\n')
                    written += 1
                except (TypeError, IOError) as e:
                    print(f"Failed to write request: {str(e)}")
                    continue
        print(f"Successfully generated {written} QA requests")
    except Exception as e:
        print(f"Critical error occurred: {str(e)}")
        traceback.print_exc()