File size: 4,529 Bytes
a7ff3d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
import requests
import json
import os

# OpenAI-compatible chat-completions endpoint of a locally running LLM server
# (default LM Studio port 1234 — adjust if your server listens elsewhere).
url = "http://localhost:1234/v1/chat/completions"
headers = {
    "Content-Type": "application/json"
}

# System instruction
# NOTE: this is a mutable module-level transcript. process_and_build_json()
# appends every user prompt and assistant reply to it, so the model retains
# context across batches — and the list grows for the lifetime of the process.
conversation_history = [
    {
        "role": "system",
        "content": "You are an assistant that helps build a BIDS dataset_description.json file. "
                   "Because the file tree may be extremely large, you will receive only 100 items at a time."
    }
]

# Step 1: Collect basic dataset info
def _answer_to_list(raw):
    """Turn a comma-separated answer into a list of trimmed items.

    Returns None when the user typed 'None' (any capitalization),
    signalling that the value is unknown.
    """
    raw = raw.strip()
    if raw.lower() == "none":
        return None
    return [piece.strip() for piece in raw.split(",")]

def get_basic_dataset_info():
    """Prompt interactively for core dataset metadata and return it as a dict.

    Keys: name, version, description (strings), plus authors and
    references (each a list of strings, or None if the user answered 'None').
    """
    print("Please provide the basic details of your dataset:")
    name = input("Dataset Name: ")
    version = input("Dataset Version: ")
    description = input("Dataset Description: ")

    authors = _answer_to_list(input("Authors (comma separated, type 'None' if unknown): "))
    references = _answer_to_list(input("References and Citations (comma separated, type 'None' if unknown): "))

    return {
        "name": name,
        "version": version,
        "description": description,
        "authors": authors,
        "references": references,
    }

# Step 2: Root folder
def get_root_folder():
    """Prompt until the user supplies a path to an existing directory, then return it."""
    prompt = "Please provide the root folder containing the dataset files: "
    while True:
        candidate = input(prompt)
        if os.path.isdir(candidate):
            return candidate
        print("Invalid folder. Try again.")

# Step 3: Load folder content in batches of 100
def get_folder_batches(folder, batch_size=100):
    """Yield (batch_items, batch_number) pairs for the top-level entries of *folder*.

    batch_items is a list of up to *batch_size* paths (folder joined with the
    entry name); batch_number is 1-based. Entries are sorted by name so that
    batch composition is deterministic — os.listdir() returns entries in
    arbitrary, filesystem-dependent order, which would otherwise make the
    prompts (and the resulting JSON) differ between runs on identical data.
    """
    items = sorted(os.listdir(folder))
    full_paths = [os.path.join(folder, item) for item in items]

    # Break into batches of up to batch_size
    for start in range(0, len(full_paths), batch_size):
        yield full_paths[start:start + batch_size], start // batch_size + 1

# Step 4: LLM interaction + JSON building
def process_and_build_json(batches, basic_info):
    """Send each folder batch to the local LLM and collect its replies.

    Parameters:
        batches: iterable of (batch_items, batch_number) pairs, as produced
            by get_folder_batches().
        basic_info: dict from get_basic_dataset_info() (keys: name, version,
            description, authors, references).

    Returns a dict seeded with BIDS-style metadata fields, plus one
    "batch_N" key per batch holding the assistant's raw text reply.
    Appends every prompt/reply to the module-level conversation_history
    so the model keeps context across batches.

    Raises requests.HTTPError on a non-2xx server response and
    requests.Timeout if the server stalls.
    """
    dataset_description = {
        "Name": basic_info["name"],
        "BIDSVersion": "1.0.0",
        "DatasetType": "raw",
        "License": "CC0",
        "Authors": basic_info["authors"] if basic_info["authors"] else "None",
        "Acknowledgements": "None",
        "HowToAcknowledge": "None",
        "ReferencesAndLinks": basic_info["references"] if basic_info["references"] else "None",
        "DatasetDescription": basic_info["description"]
    }

    for batch_items, batch_number in batches:
        # Describe what is being sent
        message = (
            f"Batch {batch_number}: Here are up to 100 items from the dataset root.\n"
            f"Total items in this batch: {len(batch_items)}\n"
            f"Items:\n" + "\n".join(batch_items)
        )

        conversation_history.append({"role": "user", "content": message})

        payload = {
            "model": "deepseek/deepseek-r1-0528-qwen3-8b",
            "messages": conversation_history,
            "temperature": 0.7,
            "max_tokens": 500,
            "stream": False
        }

        # json= serializes the payload and sets the JSON content type for us;
        # timeout= prevents hanging forever on a stalled local server.
        response = requests.post(url, headers=headers, json=payload, timeout=300)
        # Fail loudly on HTTP errors instead of crashing later with an
        # opaque KeyError on 'choices' from an error body.
        response.raise_for_status()
        last_message = response.json()['choices'][0]['message']['content']

        print("\n--- LLM Response for Batch", batch_number, "---")
        print(last_message)

        conversation_history.append({"role": "assistant", "content": last_message})

        # Store response under the batch key
        dataset_description[f"batch_{batch_number}"] = last_message

    return dataset_description

# Step 5: Save JSON
def save_json(dataset_description):
    """Write *dataset_description* to dataset_description.json in the current directory."""
    out = "dataset_description.json"
    # Explicit UTF-8: the platform default encoding can fail on non-ASCII
    # characters in LLM replies (e.g. cp1252 on Windows). ensure_ascii=False
    # keeps those characters readable in the file instead of \uXXXX escapes.
    with open(out, "w", encoding="utf-8") as f:
        json.dump(dataset_description, f, indent=4, ensure_ascii=False)
    print(f"\nSaved: {out}")

# Main
def main():
    """Drive the interactive flow: gather metadata, walk the folder, save the JSON."""
    basic_info = get_basic_dataset_info()

    # Infer authors from citations if possible
    references = basic_info["references"]
    if basic_info["authors"] is None and references:
        print("Attempting to infer authors from citations...")
        basic_info["authors"] = [
            "Author inferred from reference: " + ref for ref in references
        ]

    dataset_json = process_and_build_json(
        get_folder_batches(get_root_folder(), batch_size=100),
        basic_info,
    )
    save_json(dataset_json)

if __name__ == "__main__":
    main()