|
|
"""Generate a larger, more diverse STDP training dataset."""
|
|
|
import json
import logging
import os
import random
from collections import Counter
from typing import Dict, List, Optional
|
|
|
|
|
|
|
|
|
# Root logging is configured at import time so that running this module as a
# script prints timestamped INFO messages without further setup.
_LOG_FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)

# Module-level logger used by the functions below.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
|
# Seed prompts, grouped by topic. Each category holds ten hand-written
# prompts; generate_expanded_dataset() draws from these and derives
# variations from them when more examples are requested.
CATEGORIES: Dict[str, List[str]] = {
    "programming": [
        "Write a Python function to calculate the factorial of a number using recursion.",
        "Create a JavaScript function that sorts an array of objects by a specific property.",
        "Implement a binary search algorithm in C++.",
        "Write a SQL query to find the top 5 customers by total purchase amount.",
        "Create a regular expression to validate email addresses.",
        "Implement a simple web server in Node.js.",
        "Write a Python class for managing a library system with books and members.",
        "Create a function that converts Roman numerals to integers.",
        "Implement a Stack data structure with push, pop, and peek operations.",
        "Write code to find the longest palindromic substring in a given string.",
    ],
    "machine_learning": [
        "Explain how backpropagation works in neural networks.",
        "Describe the difference between supervised and unsupervised learning.",
        "Explain the concept of gradient descent optimization.",
        "How does a decision tree algorithm work?",
        "What is the difference between precision and recall in classification?",
        "Explain the concept of overfitting and how to prevent it.",
        "How does K-means clustering algorithm work?",
        "Describe the architecture of a convolutional neural network.",
        "What is transfer learning and when is it useful?",
        "Explain how LSTM networks handle sequential data.",
    ],
    "science": [
        "Describe the process of photosynthesis in plants.",
        "Explain Newton's three laws of motion.",
        "How does DNA replication work in human cells?",
        "Describe the water cycle and its importance to Earth's ecosystems.",
        "Explain the theory of relativity in simple terms.",
        "How do vaccines create immunity in the human body?",
        "Describe the structure of an atom and its components.",
        "What causes climate change and its effects on global ecosystems?",
        "Explain how sound waves travel through different mediums.",
        "Describe the process of natural selection in evolution.",
    ],
    "mathematics": [
        "Explain the concept of limits in calculus.",
        "Prove the Pythagorean theorem.",
        "Explain how to solve a system of linear equations using matrices.",
        "What is the fundamental theorem of calculus?",
        "Explain the concept of mathematical induction.",
        "Describe the properties of prime numbers.",
        "How do you find the eigenvalues of a matrix?",
        "Explain the binomial theorem and its applications.",
        "What is a Fourier transform and how is it used?",
        "Explain the concept of conditional probability.",
    ],
    "general_knowledge": [
        "Describe the causes and effects of the Industrial Revolution.",
        "Explain the concept of supply and demand in economics.",
        "What are the main components of a democratic government?",
        "Describe the impact of social media on modern communication.",
        "Explain the difference between renewable and non-renewable energy sources.",
        "What are the primary functions of the United Nations?",
        "Describe the process of creative writing.",
        "Explain the concept of blockchain technology.",
        "What are the ethical considerations in artificial intelligence?",
        "Describe how cloud computing has transformed business operations.",
    ],
}
|
|
|
|
|
|
# Phrases used to synthesize extra prompts when more examples are requested
# than CATEGORIES provides.
_VARIATION_PREFIXES = (
    "Can you ", "Please ", "I need help to ", "How would you ",
    "I'm interested in learning about ", "Explain how to ",
)

_VARIATION_SUFFIXES = (
    "?", ". Make it simple.", ". Include examples.", ". Be concise.",
    ". I'm a beginner.", ". Provide detailed steps.",
)


def _append_suffix(text: str, suffix: str) -> str:
    """Join *text* and *suffix* without doubling the punctuation at the seam."""
    if text.endswith("?") and suffix.startswith("."):
        # "How does X work?" + ". Be concise." -> "How does X work? Be concise."
        return text + suffix[1:]
    if text.endswith("."):
        # Every suffix brings its own leading "." or "?", so drop the text's.
        text = text[:-1]
    return text + suffix


def _make_variation(source: Dict) -> Dict:
    """Return a reworded copy of *source* carrying either a prefix or a suffix."""
    text = source["text"]
    if random.random() > 0.5:
        # Prefix variant: the prompt is lower-cased so it continues the prefix.
        varied = random.choice(_VARIATION_PREFIXES) + text.lower()
    else:
        candidates = _VARIATION_SUFFIXES
        if text.endswith("?"):
            # A question must not receive a second "?".
            candidates = [s for s in candidates if not s.startswith("?")]
        varied = _append_suffix(text, random.choice(candidates))
    return {"text": varied, "category": source["category"]}


def generate_expanded_dataset(size: int = 50, output_path: Optional[str] = None) -> Dict:
    """
    Generate a diverse STDP training dataset with the specified size.

    The seed prompts in CATEGORIES are used first. When ``size`` exceeds the
    number of seed prompts, additional examples are synthesized by adding
    either a random prefix or a random suffix to a randomly chosen seed
    prompt. The result is written to ``output_path`` as JSON.

    Args:
        size: Number of examples to generate (default: 50).
        output_path: Path to save the dataset. When None, the file is written
            to ``STDP_Communicator/expanded_stdp_dataset.json`` under the
            directory two levels above this file.

    Returns:
        Dictionary with a ``"user_inputs"`` list of ``{"text": ...}`` entries
        and a ``"metadata"`` dict (description, size, version, categories).

    Raises:
        ValueError: If ``size`` is negative.
        OSError: If the output directory or file cannot be created.
    """
    if size < 0:
        raise ValueError(f"size must be non-negative, got {size}")

    if output_path is None:
        project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        output_path = os.path.join(project_root, "STDP_Communicator", "expanded_stdp_dataset.json")

    seed_examples = [
        {"text": text, "category": category}
        for category, examples in CATEGORIES.items()
        for text in examples
    ]

    # Variations are always derived from seed prompts, never from other
    # variations, so prefixes and suffixes do not stack. The range is empty
    # when the seeds alone already cover the requested size.
    variations = [
        _make_variation(random.choice(seed_examples))
        for _ in range(size - len(seed_examples))
    ]

    # Sampling both trims to `size` when fewer than all seeds are wanted and
    # shuffles the final order.
    dataset = random.sample(seed_examples + variations, size)

    final_dataset = {
        "user_inputs": [{"text": item["text"]} for item in dataset],
        "metadata": {
            "description": "Expanded dataset for STDP training",
            "size": len(dataset),
            "version": "2.0",
            # Sorted so the written file is stable for a given sample.
            "categories": sorted({item["category"] for item in dataset}),
        },
    }

    # A bare filename has an empty dirname, which os.makedirs rejects.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(final_dataset, f, indent=2)

    logger.info("Generated dataset with %d examples saved to %s", len(dataset), output_path)
    logger.info("Category distribution: %s", get_category_counts(dataset))

    return final_dataset
|
|
|
|
|
|
def get_category_counts(dataset: List[Dict]) -> Dict[str, int]:
    """Count examples per category.

    Args:
        dataset: Example dicts; each must contain a ``"category"`` key.

    Returns:
        A plain dict mapping each category to its number of occurrences,
        keyed in order of first appearance in ``dataset``. Empty input
        yields an empty dict.

    Raises:
        KeyError: If an item has no ``"category"`` key.
    """
    # Counter replaces the hand-written tally; dict() keeps the return value
    # (and its repr in log output) a plain dict.
    return dict(Counter(item["category"] for item in dataset))
|
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: generate the dataset and print a usage hint.
    import argparse

    parser = argparse.ArgumentParser(description="Generate STDP training dataset")
    parser.add_argument("--size", type=int, default=50, help="Number of examples to generate")
    parser.add_argument("--output", type=str, help="Output file path")
    args = parser.parse_args()

    generate_expanded_dataset(args.size, args.output)

    # Point the hint at the file that was actually written: a custom --output
    # would otherwise be ignored by the hard-coded default path.
    dataset_path = args.output or "STDP_Communicator/expanded_stdp_dataset.json"
    print(f"To use this dataset, run: python -m STDP_Communicator.train_stdp --dataset {dataset_path}")
|
|
|
|