File size: 8,091 Bytes
69800cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
"""Generate a larger, more diverse STDP training dataset."""
import os
import json
import logging
from typing import List, Dict
import random

# Root logging setup: INFO and above, each record stamped with time, logger name and level
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Module-scoped logger used by the functions below
logger = logging.getLogger(__name__)

# Seed prompts for the dataset, grouped by topic. Each topic name maps to a
# list of base prompt strings; the topic name is later recorded as the
# "category" of every example derived from that list.
CATEGORIES = dict(
    programming=[
        "Write a Python function to calculate the factorial of a number using recursion.",
        "Create a JavaScript function that sorts an array of objects by a specific property.",
        "Implement a binary search algorithm in C++.",
        "Write a SQL query to find the top 5 customers by total purchase amount.",
        "Create a regular expression to validate email addresses.",
        "Implement a simple web server in Node.js.",
        "Write a Python class for managing a library system with books and members.",
        "Create a function that converts Roman numerals to integers.",
        "Implement a Stack data structure with push, pop, and peek operations.",
        "Write code to find the longest palindromic substring in a given string.",
    ],
    machine_learning=[
        "Explain how backpropagation works in neural networks.",
        "Describe the difference between supervised and unsupervised learning.",
        "Explain the concept of gradient descent optimization.",
        "How does a decision tree algorithm work?",
        "What is the difference between precision and recall in classification?",
        "Explain the concept of overfitting and how to prevent it.",
        "How does K-means clustering algorithm work?",
        "Describe the architecture of a convolutional neural network.",
        "What is transfer learning and when is it useful?",
        "Explain how LSTM networks handle sequential data.",
    ],
    science=[
        "Describe the process of photosynthesis in plants.",
        "Explain Newton's three laws of motion.",
        "How does DNA replication work in human cells?",
        "Describe the water cycle and its importance to Earth's ecosystems.",
        "Explain the theory of relativity in simple terms.",
        "How do vaccines create immunity in the human body?",
        "Describe the structure of an atom and its components.",
        "What causes climate change and its effects on global ecosystems?",
        "Explain how sound waves travel through different mediums.",
        "Describe the process of natural selection in evolution.",
    ],
    mathematics=[
        "Explain the concept of limits in calculus.",
        "Prove the Pythagorean theorem.",
        "Explain how to solve a system of linear equations using matrices.",
        "What is the fundamental theorem of calculus?",
        "Explain the concept of mathematical induction.",
        "Describe the properties of prime numbers.",
        "How do you find the eigenvalues of a matrix?",
        "Explain the binomial theorem and its applications.",
        "What is a Fourier transform and how is it used?",
        "Explain the concept of conditional probability.",
    ],
    general_knowledge=[
        "Describe the causes and effects of the Industrial Revolution.",
        "Explain the concept of supply and demand in economics.",
        "What are the main components of a democratic government?",
        "Describe the impact of social media on modern communication.",
        "Explain the difference between renewable and non-renewable energy sources.",
        "What are the primary functions of the United Nations?",
        "Describe the process of creative writing.",
        "Explain the concept of blockchain technology.",
        "What are the ethical considerations in artificial intelligence?",
        "Describe how cloud computing has transformed business operations.",
    ],
)

def generate_expanded_dataset(size: int = 50, output_path: str = None) -> Dict:
    """
    Generate a diverse STDP training dataset with specified size.

    Base prompts are taken from ``CATEGORIES``. When ``size`` exceeds the
    number of base prompts, additional entries are synthesized by attaching
    either a random prefix or a random suffix to a randomly chosen base
    prompt (synthesized entries are not de-duplicated). The final selection
    is shuffled, written to ``output_path`` as JSON and returned.

    Args:
        size: Number of examples to generate (default: 50). Must be >= 0.
        output_path: Path to save the dataset. When None, the file is written
            to ``STDP_Communicator/expanded_stdp_dataset.json`` under the
            parent of the directory containing this module.

    Returns:
        Dictionary containing the generated dataset, with a "user_inputs"
        list of ``{"text": ...}`` entries and a "metadata" mapping.

    Raises:
        ValueError: If ``size`` is negative.
    """
    if size < 0:
        # random.sample would raise ValueError further down anyway; fail early
        # with a clearer message and before doing any work.
        raise ValueError(f"size must be non-negative, got {size}")

    # Determine correct output path if not specified
    if output_path is None:
        # Project root is taken to be the parent of this module's directory
        project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        output_path = os.path.join(project_root, "STDP_Communicator", "expanded_stdp_dataset.json")

    # Flatten the category table into one list of {"text", "category"} records
    all_examples = [
        {"text": text, "category": category}
        for category, examples in CATEGORIES.items()
        for text in examples
    ]

    # If we need more examples than available, repeat with variations
    if size > len(all_examples):
        prefixes = [
            "Can you ", "Please ", "I need help to ", "How would you ",
            "I'm interested in learning about ", "Explain how to "
        ]

        suffixes = [
            "?", ". Make it simple.", ". Include examples.", ". Be concise.",
            ". I'm a beginner.", ". Provide detailed steps."
        ]
        # A bare "?" on a prompt that is already a question would just
        # reproduce the base prompt, so questions only get sentence suffixes.
        sentence_suffixes = [s for s in suffixes if not s.startswith("?")]

        # Variations are always derived from the base prompts, never from
        # other variations, so snapshot the base list before appending.
        base_examples = list(all_examples)

        for _ in range(size - len(base_examples)):
            source = random.choice(base_examples)
            text = source["text"]

            if random.random() > 0.5:
                # Prefix variant: lower-case only the first character so that
                # proper nouns and acronyms inside the prompt (Python, SQL,
                # DNA, ...) keep their capitalization.
                varied = random.choice(prefixes) + text[:1].lower() + text[1:]
            else:
                is_question = text.endswith("?")
                suffix = random.choice(sentence_suffixes if is_question else suffixes)
                if suffix.startswith("?"):
                    # Turn a statement into a question: swap the final period
                    varied = text.rstrip(".") + suffix
                elif text.endswith((".", "?")):
                    # The prompt already ends its sentence; drop the suffix's
                    # leading period to avoid "..", or "?." in the output.
                    varied = text + suffix[1:]
                else:
                    varied = text + suffix

            all_examples.append({"text": varied, "category": source["category"]})

    # random.sample both trims to the exact size and shuffles
    dataset = random.sample(all_examples, size)

    # Create final dataset structure
    final_dataset = {
        "user_inputs": [{"text": item["text"]} for item in dataset],
        "metadata": {
            "description": "Expanded dataset for STDP training",
            "size": len(dataset),
            "version": "2.0",
            # Sorted so the written file is stable for a given selection
            "categories": sorted({item["category"] for item in dataset})
        }
    }

    # Save dataset. A bare filename has an empty dirname, and os.makedirs("")
    # raises FileNotFoundError, so only create the directory when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(final_dataset, f, indent=2)

    logger.info("Generated dataset with %d examples saved to %s", len(dataset), output_path)
    logger.info("Category distribution: %s", get_category_counts(dataset))

    return final_dataset

def get_category_counts(dataset: List[Dict]) -> Dict[str, int]:
    """Tally how many items of ``dataset`` carry each "category" value.

    Args:
        dataset: Items that each provide a "category" key.

    Returns:
        Mapping from category to number of occurrences, keyed in order of
        first appearance.
    """
    tally: Dict[str, int] = {}
    for entry in dataset:
        label = entry["category"]
        # Start unseen categories at zero, then add this occurrence
        tally[label] = tally.get(label, 0) + 1
    return tally

if __name__ == "__main__":
    import argparse

    # Command-line entry point: generate the dataset, then print the command
    # that trains on it.
    parser = argparse.ArgumentParser(description="Generate STDP training dataset")
    parser.add_argument("--size", type=int, default=50, help="Number of examples to generate")
    parser.add_argument("--output", type=str, help="Output file path")
    args = parser.parse_args()

    generate_expanded_dataset(args.size, args.output)

    # Point the hint at the file that was actually written: the path given via
    # --output if there was one, otherwise the default dataset location.
    if args.output is not None:
        dataset_path = args.output
    else:
        dataset_path = "STDP_Communicator/expanded_stdp_dataset.json"
    print(f"To use this dataset, run: python -m STDP_Communicator.train_stdp --dataset {dataset_path}")