File size: 5,540 Bytes
7644eac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""
Initialize the vector database with sample educational resources.
This provides some starter content for the Learning Path Generator.
"""
import os
import json
from pathlib import Path
from dotenv import load_dotenv

# Load environment variables (python-dotenv) before reading the API key
load_dotenv()

# Fail fast when the OpenAI API key is missing, before the project imports
# further down this file are executed.
if not os.getenv("OPENAI_API_KEY"):
    print("ERROR: OPENAI_API_KEY not set in environment variables")
    print("Please update your .env file with your API key")
    # Raise SystemExit directly instead of calling the bare exit() helper:
    # that helper is injected by the `site` module for interactive sessions
    # and is not available when the interpreter runs without it (python -S).
    raise SystemExit(1)

# Import after checking API key
from src.data.document_store import DocumentStore
from src.data.resources import ResourceManager
from langchain.schema.document import Document

def _default_sample_resources():
    """Return the built-in starter resources used to seed the samples file."""
    return [
        {
            "title": "Introduction to Machine Learning",
            "type": "course",
            "description": "A comprehensive beginner's course covering ML fundamentals",
            "difficulty": "beginner",
            "time_estimate": "10 hours",
            "url": "https://example.com/intro-ml",
            "topic": "machine learning",
            "learning_styles": ["visual", "reading"]
        },
        {
            "title": "Python for Data Science Handbook",
            "type": "book",
            "description": "Essential guide to using Python for data analysis and ML",
            "difficulty": "intermediate",
            "time_estimate": "20 hours",
            "url": "https://jakevdp.github.io/PythonDataScienceHandbook/",
            "topic": "python,data science",
            "learning_styles": ["reading"]
        },
        {
            "title": "Web Development Bootcamp",
            "type": "course",
            "description": "Full stack web development from scratch",
            "difficulty": "beginner",
            "time_estimate": "40 hours",
            "url": "https://example.com/web-dev-bootcamp",
            "topic": "web development",
            "learning_styles": ["visual", "kinesthetic"]
        },
        {
            "title": "Advanced JavaScript Patterns",
            "type": "video",
            "description": "Deep dive into advanced JS design patterns",
            "difficulty": "advanced",
            "time_estimate": "3 hours",
            "url": "https://example.com/js-patterns",
            "topic": "javascript",
            "learning_styles": ["visual", "auditory"]
        },
        {
            "title": "Spanish Learning Podcast",
            "type": "podcast",
            "description": "Learn Spanish through immersive audio lessons",
            "difficulty": "beginner",
            "time_estimate": "10 hours",
            "url": "https://example.com/spanish-podcast",
            "topic": "spanish,language learning",
            "learning_styles": ["auditory"]
        }
    ]


def load_sample_resources():
    """Load sample resources from the JSON file, creating it when missing.

    The file is ``samples/sample_resources.json``, resolved against the
    current working directory. When it does not exist, the ``samples``
    directory is created as needed, the built-in starter list is written to
    the file, and that list is returned.

    Returns:
        The parsed JSON content of the existing file, or the built-in list
        of resource dicts when the file had to be created.
    """
    resources_path = Path("samples/sample_resources.json")

    if resources_path.exists():
        # Read as UTF-8 explicitly so a hand-edited file with non-ASCII text
        # loads the same way regardless of the platform's locale encoding.
        with open(resources_path, "r", encoding="utf-8") as f:
            return json.load(f)

    # First run: make sure the target directory exists, then seed the file.
    resources_path.parent.mkdir(exist_ok=True, parents=True)
    sample_resources = _default_sample_resources()

    with open(resources_path, "w", encoding="utf-8") as f:
        json.dump(sample_resources, f, indent=2)

    print(f"Created sample resources file at {resources_path}")
    return sample_resources

def initialize_database():
    """Initialize the vector database with sample resources.

    Loads the sample resources, wraps each one in a ``Document`` (a text
    summary as page content plus a metadata dict), adds them to a new
    ``DocumentStore``, and then runs one sample search, printing the titles
    and relevance scores of the results.
    """
    print("Initializing vector database...")

    # Create document store
    document_store = DocumentStore()

    # Load sample resources
    resources = load_sample_resources()

    # Convert to Document objects
    documents = []
    for resource in resources:
        # Text that gets stored as the document's page content
        content = f"""
        Title: {resource['title']}
        Description: {resource['description']}
        Type: {resource['type']}
        Difficulty: {resource['difficulty']}
        Topics: {resource.get('topic', '')}
        """

        # Topics are stored as a comma-separated string. Strip each entry and
        # drop blanks so a missing or empty topic becomes [] rather than [""]
        # (which is what "".split(",") returns); `or ""` also covers a JSON
        # null value.
        raw_topics = resource.get("topic") or ""
        topics = [
            topic.strip()
            for topic in raw_topics.split(",")
            if topic.strip()
        ]

        # Create metadata
        metadata = {
            "title": resource["title"],
            "type": resource["type"],
            "difficulty": resource["difficulty"],
            "url": resource["url"],
            "topic": topics
        }

        # Add learning styles if available
        if "learning_styles" in resource:
            metadata["learning_styles"] = resource["learning_styles"]

        documents.append(Document(page_content=content, metadata=metadata))

    # Add documents to vector store
    document_store.add_documents(documents)
    print(f"Added {len(documents)} sample resources to vector database")

    # Smoke-test the store with one query
    print("\nTesting search functionality...")
    results = document_store.search_documents("machine learning beginner", top_k=2)
    print(f"Found {len(results)} results for 'machine learning beginner'")
    for result in results:
        print(f"- {result.metadata.get('title')} (Relevance: {result.metadata.get('relevance_score', 0):.2f})")

    print("\nDatabase initialization complete!")

if __name__ == "__main__":
    initialize_database()