"""Generate a larger, more diverse STDP training dataset."""

import json
import logging
import os
import random
from collections import Counter
from typing import Dict, List, Optional

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Categories of examples to generate diverse data
CATEGORIES = {
    "programming": [
        "Write a Python function to calculate the factorial of a number using recursion.",
        "Create a JavaScript function that sorts an array of objects by a specific property.",
        "Implement a binary search algorithm in C++.",
        "Write a SQL query to find the top 5 customers by total purchase amount.",
        "Create a regular expression to validate email addresses.",
        "Implement a simple web server in Node.js.",
        "Write a Python class for managing a library system with books and members.",
        "Create a function that converts Roman numerals to integers.",
        "Implement a Stack data structure with push, pop, and peek operations.",
        "Write code to find the longest palindromic substring in a given string.",
    ],
    "machine_learning": [
        "Explain how backpropagation works in neural networks.",
        "Describe the difference between supervised and unsupervised learning.",
        "Explain the concept of gradient descent optimization.",
        "How does a decision tree algorithm work?",
        "What is the difference between precision and recall in classification?",
        "Explain the concept of overfitting and how to prevent it.",
        "How does K-means clustering algorithm work?",
        "Describe the architecture of a convolutional neural network.",
        "What is transfer learning and when is it useful?",
        "Explain how LSTM networks handle sequential data.",
    ],
    "science": [
        "Describe the process of photosynthesis in plants.",
        "Explain Newton's three laws of motion.",
        "How does DNA replication work in human cells?",
        "Describe the water cycle and its importance to Earth's ecosystems.",
        "Explain the theory of relativity in simple terms.",
        "How do vaccines create immunity in the human body?",
        "Describe the structure of an atom and its components.",
        "What causes climate change and its effects on global ecosystems?",
        "Explain how sound waves travel through different mediums.",
        "Describe the process of natural selection in evolution.",
    ],
    "mathematics": [
        "Explain the concept of limits in calculus.",
        "Prove the Pythagorean theorem.",
        "Explain how to solve a system of linear equations using matrices.",
        "What is the fundamental theorem of calculus?",
        "Explain the concept of mathematical induction.",
        "Describe the properties of prime numbers.",
        "How do you find the eigenvalues of a matrix?",
        "Explain the binomial theorem and its applications.",
        "What is a Fourier transform and how is it used?",
        "Explain the concept of conditional probability.",
    ],
    "general_knowledge": [
        "Describe the causes and effects of the Industrial Revolution.",
        "Explain the concept of supply and demand in economics.",
        "What are the main components of a democratic government?",
        "Describe the impact of social media on modern communication.",
        "Explain the difference between renewable and non-renewable energy sources.",
        "What are the primary functions of the United Nations?",
        "Describe the process of creative writing.",
        "Explain the concept of blockchain technology.",
        "What are the ethical considerations in artificial intelligence?",
        "Describe how cloud computing has transformed business operations.",
    ],
}

# Lead-ins prepended to a base example to create a variation.
_PREFIXES = (
    "Can you ",
    "Please ",
    "I need help to ",
    "How would you ",
    "I'm interested in learning about ",
    "Explain how to ",
)

# Endings appended to a base example to create a variation. Each one starts
# with its own punctuation mark ("?" or "."), which _append_suffix reconciles
# with the punctuation the base text already ends with.
_SUFFIXES = (
    "?",
    ". Make it simple.",
    ". Include examples.",
    ". Be concise.",
    ". I'm a beginner.",
    ". Provide detailed steps.",
)


def _append_suffix(text: str, suffix: str) -> str:
    """Return *text* with *suffix* attached, without doubled punctuation."""
    is_question = text.endswith("?")
    if is_question and suffix.startswith("?"):
        # Appending "?" to a question would be a no-op variation ("??"), so
        # fall back to one of the sentence-style suffixes instead.
        suffix = random.choice([s for s in _SUFFIXES if not s.startswith("?")])

    if is_question:
        # Keep the question mark and drop the suffix's leading period:
        # "How does X work?" + ". Be concise." -> "How does X work? Be concise."
        return text + suffix.lstrip(".")

    # The suffix supplies the punctuation, so drop the text's trailing period:
    # "Do X." + ". Be concise." -> "Do X. Be concise." and "Do X." + "?" -> "Do X?"
    return text.rstrip(".") + suffix


def _make_variation(source: Dict[str, str]) -> Dict[str, str]:
    """Return a new example derived from *source* by adding a prefix or a suffix."""
    text = source["text"]
    if random.random() > 0.5:
        # Lowercase only the first letter so the sentence reads on from the
        # prefix while proper nouns and acronyms (Python, SQL, DNA) keep
        # their casing.
        varied = random.choice(_PREFIXES) + text[:1].lower() + text[1:]
    else:
        varied = _append_suffix(text, random.choice(_SUFFIXES))
    return {"text": varied, "category": source["category"]}


def generate_expanded_dataset(size: int = 50, output_path: Optional[str] = None) -> Dict:
    """
    Generate a diverse STDP training dataset with specified size.

    The base examples come from CATEGORIES. When more examples are requested
    than CATEGORIES provides, the remainder is filled with randomly chosen
    prefix/suffix variations of the base examples. The result is shuffled,
    written to ``output_path`` as JSON and returned.

    Args:
        size: Number of examples to generate (default: 50)
        output_path: Path to save the dataset. When None, the file is written
            to ``STDP_Communicator/expanded_stdp_dataset.json`` under the
            parent of the directory that contains this module.

    Returns:
        Dictionary containing the generated dataset, with a ``user_inputs``
        list of ``{"text": ...}`` entries and a ``metadata`` dictionary.

    Raises:
        ValueError: If ``size`` is negative.
    """
    if size < 0:
        raise ValueError(f"size must be non-negative, got {size}")

    # Determine correct output path if not specified
    if output_path is None:
        # Get the project root directory
        project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        output_path = os.path.join(project_root, "STDP_Communicator", "expanded_stdp_dataset.json")

    # Generate dataset entries
    base_examples = [
        {"text": text, "category": category}
        for category, examples in CATEGORIES.items()
        for text in examples
    ]

    # If we need more examples than available, repeat with variations.
    # Variations are always derived from the base examples, never from
    # earlier variations, so prefixes and suffixes do not stack up.
    variations_needed = max(0, size - len(base_examples))
    variations = [
        _make_variation(random.choice(base_examples))
        for _ in range(variations_needed)
    ]

    # Trim to exact size and shuffle
    dataset = random.sample(base_examples + variations, size)

    # Create final dataset structure
    final_dataset = {
        "user_inputs": [{"text": item["text"]} for item in dataset],
        "metadata": {
            "description": "Expanded dataset for STDP training",
            "size": len(dataset),
            "version": "2.0",
            # Sorted so the metadata does not depend on set iteration order.
            "categories": sorted({item["category"] for item in dataset}),
        },
    }

    # Save dataset. A bare filename has an empty dirname, and
    # os.makedirs("") raises, so only create the directory when there is one.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_dataset, f, indent=2)

    logger.info("Generated dataset with %d examples saved to %s", len(dataset), output_path)
    logger.info("Category distribution: %s", get_category_counts(dataset))

    return final_dataset


def get_category_counts(dataset: List[Dict]) -> Dict[str, int]:
    """Count examples per category.

    Args:
        dataset: Examples, each a dictionary with a ``category`` key.

    Returns:
        Mapping of category name to the number of examples in that category,
        with keys in order of first appearance.
    """
    return dict(Counter(item["category"] for item in dataset))


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Generate STDP training dataset")
    parser.add_argument("--size", type=int, default=50, help="Number of examples to generate")
    parser.add_argument("--output", type=str, help="Output file path")

    args = parser.parse_args()
    generate_expanded_dataset(args.size, args.output)

    # Point the user at the file that was actually requested; fall back to the
    # default location's relative path when --output was not given.
    dataset_arg = args.output or "STDP_Communicator/expanded_stdp_dataset.json"
    print(f"To use this dataset, run: python -m STDP_Communicator.train_stdp --dataset {dataset_arg}")