"""Initialize the vector database with sample educational resources.

This provides some starter content for the Learning Path Generator.

Run as a script. On import, the module loads environment variables from a
``.env`` file and terminates the process with exit status 1 when
``OPENAI_API_KEY`` is not set.
"""
import os
import sys
import json
from pathlib import Path

from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Ensure OPENAI API key is set
if not os.getenv("OPENAI_API_KEY"):
    print("ERROR: OPENAI_API_KEY not set in environment variables")
    print("Please update your .env file with your API key")
    # sys.exit instead of the bare exit() helper: exit() is injected by the
    # ``site`` module for interactive use and is missing under ``python -S``
    # or in frozen/embedded interpreters.
    sys.exit(1)

# Import after checking API key
# NOTE(review): presumably these modules need the key at import or
# construction time, which is why they are imported late -- confirm.
from src.data.document_store import DocumentStore
# NOTE(review): ResourceManager is not referenced in this script; the import
# is kept in case it is relied on for side effects.
from src.data.resources import ResourceManager  # noqa: F401
from langchain.schema.document import Document


def _default_sample_resources():
    """Return a fresh list with the built-in starter resources."""
    return [
        {
            "title": "Introduction to Machine Learning",
            "type": "course",
            "description": "A comprehensive beginner's course covering ML fundamentals",
            "difficulty": "beginner",
            "time_estimate": "10 hours",
            "url": "https://example.com/intro-ml",
            "topic": "machine learning",
            "learning_styles": ["visual", "reading"],
        },
        {
            "title": "Python for Data Science Handbook",
            "type": "book",
            "description": "Essential guide to using Python for data analysis and ML",
            "difficulty": "intermediate",
            "time_estimate": "20 hours",
            "url": "https://jakevdp.github.io/PythonDataScienceHandbook/",
            "topic": "python,data science",
            "learning_styles": ["reading"],
        },
        {
            "title": "Web Development Bootcamp",
            "type": "course",
            "description": "Full stack web development from scratch",
            "difficulty": "beginner",
            "time_estimate": "40 hours",
            "url": "https://example.com/web-dev-bootcamp",
            "topic": "web development",
            "learning_styles": ["visual", "kinesthetic"],
        },
        {
            "title": "Advanced JavaScript Patterns",
            "type": "video",
            "description": "Deep dive into advanced JS design patterns",
            "difficulty": "advanced",
            "time_estimate": "3 hours",
            "url": "https://example.com/js-patterns",
            "topic": "javascript",
            "learning_styles": ["visual", "auditory"],
        },
        {
            "title": "Spanish Learning Podcast",
            "type": "podcast",
            "description": "Learn Spanish through immersive audio lessons",
            "difficulty": "beginner",
            "time_estimate": "10 hours",
            "url": "https://example.com/spanish-podcast",
            "topic": "spanish,language learning",
            "learning_styles": ["auditory"],
        },
    ]


def load_sample_resources():
    """Load sample resources from JSON file.

    Reads ``samples/sample_resources.json`` (relative to the current working
    directory). When the file does not exist, the parent directory is
    created, the built-in starter resources are written to the file, and
    those resources are returned.

    Returns:
        The parsed JSON content of the file, or the list of built-in
        resource dicts when the file had to be created.

    Raises:
        json.JSONDecodeError: If an existing file is not valid JSON.
        OSError: If the file or its directory cannot be read or written.
    """
    resources_path = Path("samples/sample_resources.json")

    # Try to read first and fall back on FileNotFoundError, rather than
    # checking exists() and then opening (avoids the check-then-use race).
    # UTF-8 is explicit so a hand-edited file loads the same on every
    # platform regardless of the locale's default encoding.
    try:
        with open(resources_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        pass

    # Create directory if it doesn't exist
    resources_path.parent.mkdir(exist_ok=True, parents=True)

    # Create sample resources file with basic content
    sample_resources = _default_sample_resources()
    with open(resources_path, "w", encoding="utf-8") as f:
        json.dump(sample_resources, f, indent=2)

    print(f"Created sample resources file at {resources_path}")
    return sample_resources


def initialize_database():
    """Initialize the vector database with sample resources.

    Loads the sample resources, converts each one into a ``Document`` whose
    text lists title, description, type, difficulty and topics, adds the
    documents to a new ``DocumentStore``, and finally runs one test search
    and prints its results.

    Raises:
        KeyError: If a resource lacks one of the required keys ``title``,
            ``description``, ``type``, ``difficulty`` or ``url``.
    """
    print("Initializing vector database...")

    # Create document store
    document_store = DocumentStore()

    # Load sample resources
    resources = load_sample_resources()

    # Convert to Document objects
    documents = []
    for resource in resources:
        # Create content from resource information
        content = f"""
        Title: {resource['title']}
        Description: {resource['description']}
        Type: {resource['type']}
        Difficulty: {resource['difficulty']}
        Topics: {resource.get('topic', '')}
        """

        # Create metadata
        # NOTE(review): a missing/empty "topic" yields [""] here, and topics
        # are not whitespace-stripped; left unchanged because the metadata
        # contract of DocumentStore is not visible from this file.
        metadata = {
            "title": resource["title"],
            "type": resource["type"],
            "difficulty": resource["difficulty"],
            "url": resource["url"],
            "topic": resource.get("topic", "").split(","),
        }

        # Add learning styles if available
        if "learning_styles" in resource:
            metadata["learning_styles"] = resource["learning_styles"]

        # Create document
        documents.append(Document(page_content=content, metadata=metadata))

    # Add documents to vector store
    document_store.add_documents(documents)
    print(f"Added {len(documents)} sample resources to vector database")

    # Test search functionality
    print("\nTesting search functionality...")
    query = "machine learning beginner"
    results = document_store.search_documents(query, top_k=2)
    print(f"Found {len(results)} results for '{query}'")
    for result in results:
        print(
            f"- {result.metadata.get('title')} "
            f"(Relevance: {result.metadata.get('relevance_score', 0):.2f})"
        )

    print("\nDatabase initialization complete!")


if __name__ == "__main__":
    initialize_database()