File size: 4,529 Bytes
a7ff3d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import requests
import json
import os

# OpenAI-compatible chat-completions endpoint of a locally running LLM server
# (default LM Studio port 1234 — adjust if your server listens elsewhere).
url = "http://localhost:1234/v1/chat/completions"
headers = {
    "Content-Type": "application/json"
}

# System instruction
# NOTE: this is a mutable module-level transcript. process_and_build_json()
# appends every user prompt and assistant reply to it, so the model retains
# context across batches — and the list grows for the lifetime of the process.
conversation_history = [
    {
        "role": "system",
        "content": "You are an assistant that helps build a BIDS dataset_description.json file. "
                   "Because the file tree may be extremely large, you will receive only 100 items at a time."
    }
]

# Step 1: Collect basic dataset info
def _answer_to_list(raw):
    """Turn a comma-separated answer into a list of trimmed items.

    Returns None when the user typed 'None' (any capitalization),
    signalling that the value is unknown.
    """
    raw = raw.strip()
    if raw.lower() == "none":
        return None
    return [piece.strip() for piece in raw.split(",")]

def get_basic_dataset_info():
    """Prompt interactively for core dataset metadata and return it as a dict.

    Keys: name, version, description (strings), plus authors and
    references (each a list of strings, or None if the user answered 'None').
    """
    print("Please provide the basic details of your dataset:")
    name = input("Dataset Name: ")
    version = input("Dataset Version: ")
    description = input("Dataset Description: ")

    authors = _answer_to_list(input("Authors (comma separated, type 'None' if unknown): "))
    references = _answer_to_list(input("References and Citations (comma separated, type 'None' if unknown): "))

    return {
        "name": name,
        "version": version,
        "description": description,
        "authors": authors,
        "references": references,
    }

# Step 2: Root folder
def get_root_folder():
    """Prompt until the user supplies a path to an existing directory, then return it."""
    prompt = "Please provide the root folder containing the dataset files: "
    while True:
        candidate = input(prompt)
        if os.path.isdir(candidate):
            return candidate
        print("Invalid folder. Try again.")

# Step 3: Load folder content in batches of 100
def get_folder_batches(folder, batch_size=100):
    """Yield (batch_items, batch_number) pairs for the top-level entries of *folder*.

    batch_items is a list of up to *batch_size* paths (folder joined with the
    entry name); batch_number is 1-based. Entries are sorted by name so that
    batch composition is deterministic — os.listdir() returns entries in
    arbitrary, filesystem-dependent order, which would otherwise make the
    prompts (and the resulting JSON) differ between runs on identical data.
    """
    items = sorted(os.listdir(folder))
    full_paths = [os.path.join(folder, item) for item in items]

    # Break into batches of up to batch_size
    for start in range(0, len(full_paths), batch_size):
        yield full_paths[start:start + batch_size], start // batch_size + 1

# Step 4: LLM interaction + JSON building
def process_and_build_json(batches, basic_info):
    """Send each folder batch to the local LLM and collect its replies.

    Parameters:
        batches: iterable of (batch_items, batch_number) pairs, as produced
            by get_folder_batches().
        basic_info: dict from get_basic_dataset_info() (keys: name, version,
            description, authors, references).

    Returns a dict seeded with BIDS-style metadata fields, plus one
    "batch_N" key per batch holding the assistant's raw text reply.
    Appends every prompt/reply to the module-level conversation_history
    so the model keeps context across batches.

    Raises requests.HTTPError on a non-2xx server response and
    requests.Timeout if the server stalls.
    """
    dataset_description = {
        "Name": basic_info["name"],
        "BIDSVersion": "1.0.0",
        "DatasetType": "raw",
        "License": "CC0",
        "Authors": basic_info["authors"] if basic_info["authors"] else "None",
        "Acknowledgements": "None",
        "HowToAcknowledge": "None",
        "ReferencesAndLinks": basic_info["references"] if basic_info["references"] else "None",
        "DatasetDescription": basic_info["description"]
    }

    for batch_items, batch_number in batches:
        # Describe what is being sent
        message = (
            f"Batch {batch_number}: Here are up to 100 items from the dataset root.\n"
            f"Total items in this batch: {len(batch_items)}\n"
            f"Items:\n" + "\n".join(batch_items)
        )

        conversation_history.append({"role": "user", "content": message})

        payload = {
            "model": "deepseek/deepseek-r1-0528-qwen3-8b",
            "messages": conversation_history,
            "temperature": 0.7,
            "max_tokens": 500,
            "stream": False
        }

        # json= serializes the payload and sets the JSON content type for us;
        # timeout= prevents hanging forever on a stalled local server.
        response = requests.post(url, headers=headers, json=payload, timeout=300)
        # Fail loudly on HTTP errors instead of crashing later with an
        # opaque KeyError on 'choices' from an error body.
        response.raise_for_status()
        last_message = response.json()['choices'][0]['message']['content']

        print("\n--- LLM Response for Batch", batch_number, "---")
        print(last_message)

        conversation_history.append({"role": "assistant", "content": last_message})

        # Store response under the batch key
        dataset_description[f"batch_{batch_number}"] = last_message

    return dataset_description

# Step 5: Save JSON
def save_json(dataset_description):
    """Write *dataset_description* to dataset_description.json in the current directory."""
    out = "dataset_description.json"
    # Explicit UTF-8: the platform default encoding can fail on non-ASCII
    # characters in LLM replies (e.g. cp1252 on Windows). ensure_ascii=False
    # keeps those characters readable in the file instead of \uXXXX escapes.
    with open(out, "w", encoding="utf-8") as f:
        json.dump(dataset_description, f, indent=4, ensure_ascii=False)
    print(f"\nSaved: {out}")

# Main
def main():
    """Drive the interactive flow: gather metadata, walk the folder, save the JSON."""
    basic_info = get_basic_dataset_info()

    # Infer authors from citations if possible
    references = basic_info["references"]
    if basic_info["authors"] is None and references:
        print("Attempting to infer authors from citations...")
        basic_info["authors"] = [
            "Author inferred from reference: " + ref for ref in references
        ]

    dataset_json = process_and_build_json(
        get_folder_batches(get_root_folder(), batch_size=100),
        basic_info,
    )
    save_json(dataset_json)

if __name__ == "__main__":
    main()