File size: 15,502 Bytes
9e60d50
 
286c429
 
9e60d50
 
 
 
286c429
 
9ad6dea
9e60d50
 
 
286c429
 
 
 
9e60d50
286c429
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e60d50
286c429
 
 
 
 
 
 
 
 
 
 
 
 
9ad889b
286c429
 
9ad889b
 
 
 
 
286c429
 
 
 
9e60d50
9ad889b
 
 
 
 
 
 
286c429
 
9ad889b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
286c429
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e60d50
286c429
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e60d50
9ad6dea
 
9e60d50
32fecea
9ad6dea
 
 
 
 
 
 
9e60d50
9ad6dea
 
 
9e60d50
 
 
 
 
 
 
286c429
 
 
 
9e60d50
 
 
 
 
9ad6dea
9e60d50
9ad6dea
286c429
9e60d50
 
 
9ad6dea
9e60d50
286c429
9e60d50
 
 
 
 
9ad6dea
9e60d50
 
 
 
 
 
 
32fecea
9e60d50
9ad6dea
9e60d50
 
 
9ad6dea
286c429
9e60d50
 
 
9ad889b
 
 
 
 
 
 
9e60d50
 
 
 
 
9ad889b
 
 
 
 
9e60d50
 
 
9ad889b
 
 
 
 
 
 
9e60d50
9ad6dea
9e60d50
 
 
32fecea
9e60d50
 
32fecea
9e60d50
 
9e37d03
9e60d50
 
 
9ad6dea
 
9e60d50
32fecea
9ad6dea
 
 
9e60d50
286c429
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
#!/usr/bin/env python
"""
Sample data loader for database initialization.
Loads curated examples of traces and knowledge graphs from JSON files for new users.
"""

import json
import logging
import os
from pathlib import Path
from typing import Any, Dict, List, Optional

logger = logging.getLogger(__name__)

# Get the directory where this file is located
CURRENT_DIR = Path(__file__).parent
SAMPLES_DIR = CURRENT_DIR / "samples"
CONFIG_FILE = SAMPLES_DIR / "samples_config.json"


class SampleDataLoader:
    """Loads sample data from JSON files.

    Results are cached on the instance, so repeated calls to the public
    getters do not re-read the files.
    """

    def __init__(self):
        # Lazily-populated caches; None means "not loaded yet".
        self._config = None
        self._traces = None
        self._knowledge_graphs = None

    def _load_config(self) -> Dict[str, Any]:
        """Load (and cache) the samples configuration from CONFIG_FILE.

        Raises:
            FileNotFoundError: If the configuration file is missing.
            json.JSONDecodeError: If the configuration is not valid JSON.
        """
        if self._config is None:
            try:
                with open(CONFIG_FILE, 'r', encoding='utf-8') as f:
                    self._config = json.load(f)
                logger.info(f"Loaded sample data configuration from {CONFIG_FILE}")
            except FileNotFoundError:
                logger.error(f"Configuration file not found: {CONFIG_FILE}")
                raise
            except json.JSONDecodeError as e:
                logger.error(f"Invalid JSON in configuration file: {e}")
                raise
        return self._config

    def _load_json(self, path: Path, kind: str) -> Dict[str, Any]:
        """Load a single JSON file, logging a descriptive error on failure.

        Shared implementation for trace and knowledge-graph loading (the
        two previously duplicated this logic verbatim).

        Args:
            path: Absolute path of the JSON file.
            kind: Capitalized label used in log messages (e.g. "Trace").

        Raises:
            FileNotFoundError: If the file is missing.
            json.JSONDecodeError: If the file is not valid JSON.
        """
        try:
            with open(path, 'r', encoding='utf-8') as f:
                return json.load(f)
        except FileNotFoundError:
            logger.error(f"{kind} file not found: {path}")
            raise
        except json.JSONDecodeError as e:
            logger.error(f"Invalid JSON in {kind.lower()} file {path}: {e}")
            raise

    def _load_trace(self, trace_file: str) -> Dict[str, Any]:
        """Load a single trace from JSON file (path relative to SAMPLES_DIR)."""
        return self._load_json(SAMPLES_DIR / trace_file, "Trace")

    def _load_knowledge_graph(self, kg_file: str) -> Dict[str, Any]:
        """Load a single knowledge graph from JSON file (path relative to SAMPLES_DIR)."""
        return self._load_json(SAMPLES_DIR / kg_file, "Knowledge graph")

    def get_traces(self) -> List[Dict[str, Any]]:
        """Get all sample traces in the expected format.

        Returns:
            List of dicts with keys: filename, title, description,
            trace_type, trace_source, tags, and content (a JSON string).
        """
        if self._traces is None:
            config = self._load_config()
            self._traces = []

            for sample in config["samples"]:
                # Load the trace data
                trace_data = self._load_trace(sample["trace_file"])

                # Convert to the expected format
                trace_entry = {
                    "filename": sample["name"].replace(" ", "_").lower() + ".json",
                    "title": sample["name"],
                    "description": sample["description"],
                    "trace_type": sample["trace_type"],
                    "trace_source": sample["trace_source"],
                    "tags": sample["tags"],
                    "content": json.dumps(trace_data["content"])  # Convert content back to JSON string
                }
                self._traces.append(trace_entry)

            logger.info(f"Loaded {len(self._traces)} sample traces")

        return self._traces

    def get_knowledge_graphs(self) -> List[Dict[str, Any]]:
        """Get all sample knowledge graphs in the expected format.

        Each sample contributes one "final" KG entry (no window_index key)
        plus, when the sample supports replay, one entry per window KG
        (with window_index/window_total/processing_run_id set). Entries
        link back to their trace positionally via trace_index.
        """
        if self._knowledge_graphs is None:
            config = self._load_config()
            self._knowledge_graphs = []

            for i, sample in enumerate(config["samples"]):
                # Load the main knowledge graph data
                kg_data = self._load_knowledge_graph(sample["knowledge_graph_file"])

                # Check if this sample supports replay (has window KGs)
                supports_replay = sample.get("supports_replay", False)
                window_info = sample.get("window_info", {})

                # Convert main KG to the expected format
                kg_entry = {
                    "filename": sample["knowledge_graph_file"].split("/")[-1],  # Get just the filename
                    "trace_index": i,  # Links to trace by index
                    "graph_data": kg_data["graph_data"]
                }

                # Add window metadata for final KG (window_index=None, window_total=count)
                if supports_replay and window_info:
                    kg_entry["window_total"] = window_info.get("window_count", 0)
                    kg_entry["processing_run_id"] = window_info.get("processing_run_id")
                    logger.debug(f"Main KG {kg_entry['filename']} configured with {kg_entry['window_total']} windows")

                self._knowledge_graphs.append(kg_entry)

                # Load window KGs if they exist
                if supports_replay and window_info.get("window_files"):
                    window_files = window_info["window_files"]
                    window_count = window_info.get("window_count", len(window_files))
                    processing_run_id = window_info.get("processing_run_id")

                    logger.debug(f"Loading {len(window_files)} window KGs for {sample['id']}")

                    for window_index, window_file in enumerate(window_files):
                        try:
                            # Load window KG data
                            window_kg_data = self._load_knowledge_graph(window_file)

                            # Convert window KG to the expected format
                            window_kg_entry = {
                                "filename": window_file.split("/")[-1],  # Get just the filename
                                "trace_index": i,  # Links to same trace
                                "graph_data": window_kg_data["graph_data"],
                                "window_index": window_index,  # This makes it a window KG
                                "window_total": window_count,
                                "processing_run_id": processing_run_id,
                                "window_start_char": window_kg_data.get("window_start_char"),
                                "window_end_char": window_kg_data.get("window_end_char")
                            }

                            self._knowledge_graphs.append(window_kg_entry)
                            logger.debug(f"Loaded window KG {window_index}: {window_kg_entry['filename']}")

                        except Exception as e:
                            # Best-effort: a broken window KG should not block
                            # the remaining windows or the final KG.
                            logger.error(f"Failed to load window KG {window_file}: {e}")
                            continue

            logger.info(f"Loaded {len(self._knowledge_graphs)} sample knowledge graphs (including window KGs)")

        return self._knowledge_graphs

    def get_sample_info(self) -> Dict[str, Any]:
        """Get information about the available sample data.

        Returns:
            Dict with counts, distinct trace types and complexity levels,
            the union of sample features, and config metadata
            (description/version).
        """
        config = self._load_config()
        traces = self.get_traces()
        knowledge_graphs = self.get_knowledge_graphs()

        # Extract unique features from all samples
        all_features = set()
        for sample in config["samples"]:
            all_features.update(sample.get("features", []))

        return {
            "traces_count": len(traces),
            "knowledge_graphs_count": len(knowledge_graphs),
            "trace_types": list(set(t["trace_type"] for t in traces)),
            "complexity_levels": list(set(sample.get("complexity", "standard") for sample in config["samples"])),
            "features": list(all_features),
            "description": config["metadata"]["description"],
            "version": config["metadata"]["version"]
        }


# Create a global loader instance (shared module-wide; caches results after
# the first load so the JSON files are read at most once per process)
_loader = SampleDataLoader()

# Maintain backward compatibility by exposing the same interface
def get_sample_traces() -> List[Dict[str, Any]]:
    """Get sample traces (backward compatibility).

    Returns:
        List of trace dicts with keys: filename, title, description,
        trace_type, trace_source, tags, and content (a JSON string).
    """
    return _loader.get_traces()

def get_sample_knowledge_graphs() -> List[Dict[str, Any]]:
    """Get sample knowledge graphs (backward compatibility).

    Returns:
        List of KG dicts (final KGs plus any window KGs), each with at
        least: filename, trace_index, and graph_data.
    """
    return _loader.get_knowledge_graphs()

# Legacy global variables for backward compatibility.
#
# NOTE: the previous implementation decorated module-level functions with
# @property, which has no effect outside a class body — it only bound inert
# `property` objects to these names, which were then immediately overwritten
# via sys.modules. Plain eager assignments are equivalent and unambiguous
# (the data was already loaded eagerly at import time before this change).
SAMPLE_TRACES = _loader.get_traces()
SAMPLE_KNOWLEDGE_GRAPHS = _loader.get_knowledge_graphs()


def insert_sample_data(session, force_insert=False):
    """
    Insert sample traces and knowledge graphs into the database.

    Traces are inserted first; knowledge graphs are then linked to them
    positionally (kg_data["trace_index"] -> trace_ids[trace_index]).

    Args:
        session: Database session
        force_insert: If True, insert even if data already exists

    Returns:
        Dict with insertion results: "traces_inserted",
        "knowledge_graphs_inserted", "skipped", "errors"

    Raises:
        Exception: Re-raises any fatal error so the caller can roll back.
    """
    from backend.database.utils import save_trace, save_knowledge_graph
    from backend.database.models import Trace

    results = {
        "traces_inserted": 0,
        "knowledge_graphs_inserted": 0,
        "skipped": 0,
        "errors": []
    }

    # Get sample data from loader
    sample_traces = _loader.get_traces()
    sample_knowledge_graphs = _loader.get_knowledge_graphs()

    # Check if sample data already exists
    if not force_insert:
        existing_sample = session.query(Trace).filter(
            Trace.trace_source == "sample_data"
        ).first()

        if existing_sample:
            logger.info("Sample data already exists, skipping insertion")
            results["skipped"] = len(sample_traces)
            return results

    try:
        # Insert sample traces. trace_ids is kept index-aligned with
        # sample_traces (None marks a failed insert) so that knowledge
        # graphs, which reference traces by position, can never attach to
        # the wrong trace when an earlier insert fails.
        trace_ids = []
        for i, trace_data in enumerate(sample_traces):
            try:
                trace = save_trace(
                    session=session,
                    content=trace_data["content"],
                    filename=trace_data["filename"],
                    title=trace_data["title"],
                    description=trace_data["description"],
                    trace_type=trace_data["trace_type"],
                    trace_source=trace_data["trace_source"],
                    tags=trace_data["tags"]
                )
                trace_ids.append(trace.trace_id)
                results["traces_inserted"] += 1
                logger.info(f"Inserted sample trace: {trace_data['title']}")
            except Exception as e:
                # Placeholder keeps positional alignment for the KG pass below.
                trace_ids.append(None)
                error_msg = f"Error inserting trace {i}: {str(e)}"
                logger.error(error_msg)
                results["errors"].append(error_msg)

        # Insert corresponding knowledge graphs
        for kg_data in sample_knowledge_graphs:
            try:
                trace_index = kg_data["trace_index"]
                if trace_index >= len(trace_ids) or trace_ids[trace_index] is None:
                    # Referenced trace is missing or failed to insert; skip
                    # rather than linking this KG to an unrelated trace.
                    continue

                # Window metadata: window_index is None for a final KG and
                # an integer for a per-window KG.
                window_index = kg_data.get("window_index")
                window_total = kg_data.get("window_total", 1)  # Default to a single window
                window_start_char = kg_data.get("window_start_char")
                window_end_char = kg_data.get("window_end_char")
                processing_run_id = kg_data.get("processing_run_id")

                save_knowledge_graph(
                    session=session,
                    filename=kg_data["filename"],
                    graph_data=kg_data["graph_data"],
                    trace_id=trace_ids[trace_index],
                    window_index=window_index,
                    window_total=window_total,
                    window_start_char=window_start_char,
                    window_end_char=window_end_char,
                    processing_run_id=processing_run_id,
                    is_original=True
                )
                results["knowledge_graphs_inserted"] += 1

                # Log different messages for final vs window KGs
                if window_index is None:
                    logger.info(f"Inserted sample knowledge graph: {kg_data['filename']} (final, {window_total} windows)")
                else:
                    logger.info(f"Inserted sample knowledge graph: {kg_data['filename']} (window {window_index})")

            except Exception as e:
                error_msg = f"Error inserting knowledge graph {kg_data['filename']}: {str(e)}"
                logger.error(error_msg)
                results["errors"].append(error_msg)

        logger.info(f"Sample data insertion completed: {results}")

    except Exception as e:
        error_msg = f"Fatal error during sample data insertion: {str(e)}"
        logger.error(error_msg)
        results["errors"].append(error_msg)
        raise  # Re-raise to trigger rollback in calling code

    return results


def get_sample_data_info() -> Dict[str, Any]:
    """
    Get information about the available sample data.

    Thin wrapper over the shared module-level loader.

    Returns:
        Dict with sample data statistics: counts, trace types, complexity
        levels, features, and config metadata (description/version)
    """
    return _loader.get_sample_info()


# Additional utility functions for managing samples
def add_sample(sample_id: str, name: str, description: str, trace_file: str,
               knowledge_graph_file: str, tags: List[str], trace_type: str = "custom",
               trace_source: str = "sample_data", complexity: str = "standard",
               features: Optional[List[str]] = None):
    """
    Add a new sample to the configuration (placeholder — not yet implemented).

    Currently this only logs the request; it does NOT modify the
    configuration file.

    Args:
        sample_id: Unique identifier for the sample
        name: Human-readable name
        description: Description of the sample
        trace_file: Path to trace JSON file relative to samples directory
        knowledge_graph_file: Path to KG JSON file relative to samples directory
        tags: List of tags
        trace_type: Type of trace
        trace_source: Source of trace
        complexity: Complexity level
        features: List of features demonstrated (optional)
    """
    # This would modify the config file - implementation depends on requirements
    logger.info(f"Add sample feature called for: {sample_id}")


def list_available_samples() -> List[Dict[str, Any]]:
    """Return the metadata entry for every sample defined in the config."""
    # Reaches into the loader's cached configuration directly.
    return _loader._load_config()["samples"]


if __name__ == "__main__":
    # Smoke-test the loader when this module is executed directly.
    try:
        print("Sample Data Info:", json.dumps(get_sample_data_info(), indent=2))
        print(f"Loaded {len(get_sample_traces())} traces")
        print(f"Loaded {len(get_sample_knowledge_graphs())} knowledge graphs")
    except Exception as e:
        print(f"Error testing sample data loader: {e}")